[x265] [PATCH 19 of 29] 16bpp: updated asm and unit test code for intrapred_ang_8x8, 16x16 and 32x32 mode 2 to 34

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:27 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1421129284 -19800
#      Tue Jan 13 11:38:04 2015 +0530
# Node ID 10b8acec46ea4cdc43023b95e379588e86438601
# Parent  38f8903130fddbe71688f17f630d856adf9faaba
16bpp: updated asm and unit test code for intrapred_ang_8x8, 16x16 and 32x32 mode 2 to 34

diff -r 38f8903130fd -r 10b8acec46ea source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jan 02 18:16:26 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jan 13 11:38:04 2015 +0530
@@ -958,6 +958,11 @@
     p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
     p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
 
+#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) /* bind the 8x8, 16x16 and 32x32 "_new" angular predictors numbered fno (built for cpu) to p.intra_pred_new[mode] */ \
+    p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
+    p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
+    p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
+
 #define SETUP_INTRA_ANG4(mode, fno, cpu) \
     p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
 
@@ -1047,6 +1052,23 @@
     SETUP_INTRA_ANG4(32, 4, cpu); \
     SETUP_INTRA_ANG4(33, 3, cpu);
 
+#define INTRA_ANG_SSE4_HIGH_NEW(cpu) /* install the "_new" 8x8/16x16/32x32 predictors for modes 19..33; each mode uses the same-numbered asm function */ \
+    SETUP_INTRA_ANG_HIGH(19, 19, cpu); \
+    SETUP_INTRA_ANG_HIGH(20, 20, cpu); \
+    SETUP_INTRA_ANG_HIGH(21, 21, cpu); \
+    SETUP_INTRA_ANG_HIGH(22, 22, cpu); \
+    SETUP_INTRA_ANG_HIGH(23, 23, cpu); \
+    SETUP_INTRA_ANG_HIGH(24, 24, cpu); \
+    SETUP_INTRA_ANG_HIGH(25, 25, cpu); \
+    SETUP_INTRA_ANG_HIGH(26, 26, cpu); \
+    SETUP_INTRA_ANG_HIGH(27, 27, cpu); \
+    SETUP_INTRA_ANG_HIGH(28, 28, cpu); \
+    SETUP_INTRA_ANG_HIGH(29, 29, cpu); \
+    SETUP_INTRA_ANG_HIGH(30, 30, cpu); \
+    SETUP_INTRA_ANG_HIGH(31, 31, cpu); \
+    SETUP_INTRA_ANG_HIGH(32, 32, cpu); \
+    SETUP_INTRA_ANG_HIGH(33, 33, cpu);
+
 #define INTRA_ANG_SSE4(cpu) \
     SETUP_INTRA_ANG4_8(19, 17, cpu); \
     SETUP_INTRA_ANG4_8(20, 16, cpu); \
@@ -1431,6 +1453,7 @@
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
 
         INTRA_ANG_SSSE3(ssse3);
+        INTRA_ANG_SSSE3_NEW(ssse3);
 
         p.dst4x4 = x265_dst4_ssse3;
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
@@ -1468,6 +1491,9 @@
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
 
+        INTRA_ANG_SSE4_COMMON_NEW(sse4);
+        INTRA_ANG_SSE4_HIGH_NEW(sse4);
+
         p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
 #if X86_64
         p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_sse4;
diff -r 38f8903130fd -r 10b8acec46ea source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Jan 02 18:16:26 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Jan 13 11:38:04 2015 +0530
@@ -4931,6 +4931,3643 @@
 
     RET
 
+;-----------------------------------------------------------------------------------------
+; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang8_2_new, 3,5,3
+    ; Reference base: src itself when dirMode is 34, src + 32 bytes for any other mode.
+    lea         r4,            [r2 + 32]
+    cmp         r3m,           byte 34
+    cmovne      r2,            r4
+    add         r1,            r1                     ; double the stride (16-bit samples; presumably pixels -> bytes)
+    movu        m0,            [r2 + 4]               ; reference words 2..9
+    movu        m1,            [r2 + 20]              ; reference words 10..17
+    lea         r3,            [r1 * 3]               ; r3 = 3 * doubled stride
+    movu        [r0],          m0                     ; row 0 = words 2..9
+    palignr     m2,            m1, m0, 2
+    movu        [r0 + r1],     m2                     ; row 1 = words 3..10
+    palignr     m2,            m1, m0, 4
+    movu        [r0 + r1 * 2], m2                     ; row 2 = words 4..11
+    palignr     m2,            m1, m0, 6
+    movu        [r0 + r3],     m2                     ; row 3 = words 5..12
+    lea         r0,            [r0 + r1 * 4]          ; advance dst to row 4
+    palignr     m2,            m1, m0, 8
+    movu        [r0],          m2                     ; row 4 = words 6..13
+    palignr     m2,            m1, m0, 10
+    movu        [r0 + r1],     m2                     ; row 5 = words 7..14
+    palignr     m2,            m1, m0, 12
+    movu        [r0 + r1 * 2], m2                     ; row 6 = words 8..15
+    palignr     m1,            m0, 14                 ; m1 is no longer needed, so shift in place
+    movu        [r0 + r3],     m1                     ; row 7 = words 9..16
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang8_3_new, 3,5,8
+    add         r2,        32                         ; reference samples are read starting 32 bytes into src
+    lea         r3,        [ang_table + 14 * 16]      ; r3 -> ang_table row 14; other rows via [r3 + k * 16]
+    add         r1,        r1                         ; double the stride (16-bit samples; presumably pixels -> bytes)
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+    punpckhwd   m1,        m4                         ; [x 16 16 15 15 14 14 13]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 12 * 16]             ; [26]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably dwords of 16, i.e. rounding)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 12 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 26
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m2,        [r3 + 6 * 16]              ; [20]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m6,        [r3 + 6 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6                         ; m2 = 8 word results for table row 20
+
+    palignr     m6,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m6,        [r3]                       ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 14
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 6 * 16]              ; [ 8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m3,        [r3 - 6 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3                         ; m7 = 8 word results for table row 8
+
+    punpckhwd   m3,        m4, m2                     ; transpose step 1: interleave words of the four result vectors
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6                     ; transpose step 2: each qword = element y of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * doubled stride
+    movh        [r0],            m7                   ; output rows 0-3, bytes 0-7 (samples 0-3)
+    movhps      [r0 + r1],       m7
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 (src no longer needed) = output row 4
+    movh        [r2],            m6                   ; output rows 4-7, bytes 0-7
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m3
+    movhps      [r2 + r4],       m3
+
+    mova        m4,        m0
+    pmaddwd     m4,        [r3 - 12 * 16]             ; [ 2]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m5
+    pmaddwd     m2,        [r3 - 12 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 2
+
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 14 * 16]             ; [28]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m5
+    pmaddwd     m6,        [r3 + 14 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6                         ; m2 = 8 word results for table row 28
+
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m6,        [r3 + 8 * 16]              ; [22]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m1, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m7,        [r3 + 8 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 22
+
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3 + 2 * 16]              ; [16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, 8                      ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m1,        [r3 + 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 16
+
+    punpckhwd   m3,        m4, m2                     ; same two-step transpose as above
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    movh        [r0 + 8],            m7               ; output rows 0-3, bytes 8-15 (samples 4-7)
+    movhps      [r0 + r1 + 8],       m7
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst to row 4
+    movh        [r0 + 8],            m6               ; output rows 4-7, bytes 8-15
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m3
+    movhps      [r0 + r4 + 8],       m3
+    RET
+
+cglobal intra_pred_ang8_4_new, 3,6,8
+    add         r2,        32                         ; reference samples are read starting 32 bytes into src
+    lea         r3,        [ang_table + 19 * 16]      ; r3 -> ang_table row 19; other rows via [r3 + k * 16]
+    add         r1,        r1                         ; double the stride (16-bit samples; presumably pixels -> bytes)
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 2 * 16]              ; [21]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably dwords of 16, i.e. rounding)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 2 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 21
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m2                         ; keep a copy: the same samples are reused with row 31
+    pmaddwd     m2,        [r3 - 9 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1                         ; copy for the row 31 pass
+    pmaddwd     m1,        [r3 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 10
+
+    pmaddwd     m6,        [r3 + 12 * 16]             ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 12 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 31
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 + 1 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 20
+
+    punpckhwd   m1,        m4, m2                     ; transpose step 1: interleave words of the four result vectors
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6                     ; transpose step 2: each qword = element y of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m1, m2
+    punpckhdq   m1,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * doubled stride
+    movh        [r0],            m7                   ; output rows 0-3, bytes 0-7 (samples 0-3)
+    movhps      [r0 + r1],       m7
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r5,              [r0 + r1 * 4]        ; r5 = output row 4 (r2 is still needed for a later load)
+    movh        [r5],            m6                   ; output rows 4-7, bytes 0-7
+    movhps      [r5 + r1],       m6
+    movh        [r5 + r1 * 2],   m1
+    movhps      [r5 + r4],       m1
+
+    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m2,        m4                         ; copy for the row 30 pass
+    pmaddwd     m4,        [r3 - 10 * 16]             ; [ 9]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m6,        m3                         ; copy for the row 30 pass
+    pmaddwd     m3,        [r3 - 10 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m4,        m3                         ; m4 = 8 word results for table row 9
+
+    pmaddwd     m2,        [r3 + 11 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m6,        [r3 + 11 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6                         ; m2 = 8 word results for table row 30
+
+    mova        m6,        m0
+    pmaddwd     m6,        [r3]                       ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m5
+    pmaddwd     m7,        [r3]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 19
+
+    movh        m1,        [r2 + 26]                  ; [16 15 14 13]
+    palignr     m7,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m7,        [r3 - 11 * 16]             ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, 4                      ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m1,        [r3 - 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 8
+
+    punpckhwd   m3,        m4, m2                     ; same two-step transpose as above
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    movh        [r0 + 8],            m7               ; output rows 0-3, bytes 8-15 (samples 4-7)
+    movhps      [r0 + r1 + 8],       m7
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst to row 4
+    movh        [r0 + 8],            m6               ; output rows 4-7, bytes 8-15
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m3
+    movhps      [r0 + r4 + 8],       m3
+    RET
+
+cglobal intra_pred_ang8_5_new, 3,5,8
+    add         r2,        32                         ; reference samples are read starting 32 bytes into src
+    lea         r3,        [ang_table + 13 * 16]      ; r3 -> ang_table row 13; other rows via [r3 + k * 16]
+    add         r1,        r1                         ; double the stride (16-bit samples; presumably pixels -> bytes)
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 4 * 16]              ; [17]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably dwords of 16, i.e. rounding)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 4 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 17
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m2                         ; keep a copy: the same samples are reused with row 19
+    pmaddwd     m2,        [r3 - 11 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1                         ; copy for the row 19 pass
+    pmaddwd     m1,        [r3 - 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 2
+
+    pmaddwd     m6,        [r3 + 6 * 16]              ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 19
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 - 9 * 16]              ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 4
+
+    punpckhwd   m1,        m4, m2                     ; transpose step 1: interleave words of the four result vectors
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6                     ; transpose step 2: each qword = element y of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m1, m2
+    punpckhdq   m1,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * doubled stride
+    movh        [r0],            m7                   ; output rows 0-3, bytes 0-7 (samples 0-3)
+    movhps      [r0 + r1],       m7
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 (src no longer needed) = output row 4
+    movh        [r2],            m6                   ; output rows 4-7, bytes 0-7
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m1
+    movhps      [r2 + r4],       m1
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m4,        [r3 + 8 * 16]              ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m2,        [r3 + 8 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 21
+
+    palignr     m2,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m6,        m2                         ; copy for the row 23 pass
+    pmaddwd     m2,        [r3 - 7 * 16]              ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m7,        m1                         ; copy for the row 23 pass
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 6
+
+    pmaddwd     m6,        [r3 + 10 * 16]             ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 10 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 23
+
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 - 5 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m5
+    pmaddwd     m1,        [r3 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 8
+
+    punpckhwd   m3,        m4, m2                     ; same two-step transpose as above
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    movh        [r0 + 8],            m7               ; output rows 0-3, bytes 8-15 (samples 4-7)
+    movhps      [r0 + r1 + 8],       m7
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst to row 4
+    movh        [r0 + 8],            m6               ; output rows 4-7, bytes 8-15
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m3
+    movhps      [r0 + r4 + 8],       m3
+    RET
+
+cglobal intra_pred_ang8_6_new, 3,5,8
+    add         r2,        32                         ; reference samples are read starting 32 bytes into src
+    lea         r3,        [ang_table + 14 * 16]      ; r3 -> ang_table row 14; other rows via [r3 + k * 16]
+    add         r1,        r1                         ; double the stride (16-bit samples; presumably pixels -> bytes)
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 1 * 16]              ; [13]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably dwords of 16, i.e. rounding)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 13
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 12 * 16]             ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 26
+
+    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m7,        m6                         ; keep a copy: the same samples are reused with row 20
+    pmaddwd     m6,        [r3 - 7 * 16]              ; [7]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = 8 word results for table row 7
+
+    pmaddwd     m7,        [r3 + 6 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 20
+
+    punpckhwd   m1,        m4, m2                     ; transpose step 1: interleave words of the four result vectors
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6                     ; transpose step 2: each qword = element y of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m1, m2
+    punpckhdq   m1,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * doubled stride
+    movh        [r0],            m7                   ; output rows 0-3, bytes 0-7 (samples 0-3)
+    movhps      [r0 + r1],       m7
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 (src no longer needed) = output row 4
+    movh        [r2],            m6                   ; output rows 4-7, bytes 0-7
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m1
+    movhps      [r2 + r4],       m1
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    mova        m6,        m4                         ; copy reused for rows 14 and 27
+    pmaddwd     m4,        [r3 - 13 * 16]             ; [1]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    mova        m7,        m2                         ; copy reused for rows 14 and 27
+    pmaddwd     m2,        [r3 - 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 1
+
+    pmaddwd     m2,        m6, [r3]                   ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m1,        m7, [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 14
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 27
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 6 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m5,        m0, 12                     ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m5,        [r3 - 6 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m7,        m5                         ; m7 = 8 word results for table row 8
+
+    punpckhwd   m3,        m4, m2                     ; same two-step transpose as above
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    movh        [r0 + 8],            m7               ; output rows 0-3, bytes 8-15 (samples 4-7)
+    movhps      [r0 + r1 + 8],       m7
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst to row 4
+    movh        [r0 + 8],            m6               ; output rows 4-7, bytes 8-15
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m3
+    movhps      [r0 + r4 + 8],       m3
+    RET
+
+cglobal intra_pred_ang8_7_new, 3,5,8
+    add         r2,        32                         ; reference samples are read starting 32 bytes into src
+    lea         r3,        [ang_table + 18 * 16]      ; r3 -> ang_table row 18; other rows via [r3 + k * 16]
+    add         r1,        r1                         ; double the stride (16-bit samples; presumably pixels -> bytes)
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 9 * 16]              ; [9]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably dwords of 16, i.e. rounding)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 9
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3]                       ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 18
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 9 * 16]              ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = 8 word results for table row 27
+
+    palignr     m7,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m7,        [r3 - 14 * 16]             ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; m7 = 8 word results for table row 4
+
+    punpckhwd   m1,        m4, m2                     ; transpose step 1: interleave words of the four result vectors
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6                     ; transpose step 2: each qword = element y of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m1, m2
+    punpckhdq   m1,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * doubled stride
+    movh        [r0],            m7                   ; output rows 0-3, bytes 0-7 (samples 0-3)
+    movhps      [r0 + r1],       m7
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 (src no longer needed) = output row 4
+    movh        [r2],            m6                   ; output rows 4-7, bytes 0-7
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m1
+    movhps      [r2 + r4],       m1
+
+    palignr     m4,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m4                         ; copy reused for rows 22 and 31
+    pmaddwd     m4,        [r3 - 5 * 16]              ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m2                         ; copy reused for rows 22 and 31
+    pmaddwd     m2,        [r3 - 5 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 word results for table row 13
+
+    pmaddwd     m2,        m6, [r3 + 4 * 16]          ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m1,        m7, [r3 + 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = 8 word results for table row 22
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; m6 = 8 word results for table row 31
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 - 10 * 16]             ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m5,        m0, 8                      ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m5,        [r3 - 10 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m7,        m5                         ; m7 = 8 word results for table row 8
+
+    punpckhwd   m3,        m4, m2                     ; same two-step transpose as above
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m7
+    punpcklwd   m6,        m7
+
+    punpckldq   m7,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    movh        [r0 + 8],            m7               ; output rows 0-3, bytes 8-15 (samples 4-7)
+    movhps      [r0 + r1 + 8],       m7
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst to row 4
+    movh        [r0 + 8],            m6               ; output rows 4-7, bytes 8-15
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m3
+    movhps      [r0 + r4 + 8],       m3
+    RET
+
+cglobal intra_pred_ang8_8_new, 3,6,7
+    add         r2,        32
+    lea         r3,        [ang_table + 17 * 16]
+    add         r1,        r1
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
+
+    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 12 * 16]             ; [5]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 12 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 7 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 2 * 16]              ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 3 * 16]              ; [20]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    punpckhwd   m1,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m1, m2
+    punpckhdq   m1,        m2
+
+    lea         r4,              [r1 * 3]
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r5,              [r0 + r1 * 4]
+    movh        [r5],            m6
+    movhps      [r5 + r1],       m6
+    movh        [r5 + r1 * 2],   m1
+    movhps      [r5 + r4],       m1
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 8 * 16]              ; [25]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 8 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 13 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    movh        m1,        [r2 + 18]                  ; [12 11 10 9]
+
+    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m5,        m6
+    pmaddwd     m6,        [r3 - 14 * 16]             ; [3]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m1,        m0, 4                      ; [10 9 9 8 8 7 7 6]
+    mova        m3,        m1
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m5,        [r3 - 9 * 16]              ; [8]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    pmaddwd     m3,        [r3 - 9 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m5,        m3
+
+    punpckhwd   m3,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m3, m2
+    punpckhdq   m3,        m2
+
+    movh        [r0 + 8],            m5
+    movhps      [r0 + r1 + 8],       m5
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m3
+    movhps      [r0 + r4 + 8],       m3
+    RET
+
+cglobal intra_pred_ang8_9_new, 3,5,7                 ; 8x8 angular intra predictor no. 9 for 16-bit samples; r0 = output, r1 = row step, r2 = reference samples
+    add         r2,        32                         ; advance the reference pointer by 32 bytes; all loads below are relative to this
+    lea         r3,        [ang_table + 9 * 16]       ; r3 -> ang_table entry 9; entry n is addressed as r3 + (n - 9) * 16
+    add         r1,        r1                         ; double the row step (presumably samples -> bytes; confirm with caller)
+    ; Load two reference vectors one sample apart, then interleave them into (ref[i], ref[i+1]) word pairs.
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
+
+    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
+    ; Per table entry: pmaddwd each pair with the entry, add pd_16, shift right by 5, pack both halves into eight words.
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 7 * 16]              ; [2]
+    paddd       m4,        [pd_16]                    ; pd_16 is defined elsewhere (by name: dwords of 16, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 7 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = output column 0 (one lane per row)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 5 * 16]              ; [4]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = output column 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 3 * 16]              ; [6]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = output column 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 - 1 * 16]              ; [8]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; m5 = output column 3
+    ; Transpose the four column vectors so that each 64-bit half holds samples 0-3 of one output row.
+    punpckhwd   m1,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; rows 0 and 1
+    punpckhdq   m4,        m6                         ; rows 2 and 3
+    punpckldq   m6,        m1, m2                     ; rows 4 and 5
+    punpckhdq   m1,        m2                         ; rows 6 and 7
+    ; Store samples 0-3 (first 8 bytes) of all eight rows.
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 reused as the row-4 pointer; no reference loads follow
+    movh        [r2],            m6
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m1
+    movhps      [r2 + r4],       m1
+    ; Second half: the same pair vectors with table entries 10..16 give output columns 4-7.
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 1 * 16]              ; [10]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 3 * 16]              ; [12]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 5 * 16]              ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 5 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+    ; Last use of the pair vectors m3/m0, so they are overwritten in place instead of copied.
+    pmaddwd     m3,        [r3 + 7 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 7 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+    ; Transpose columns 4-7 (m4, m2, m6, m3) into per-row 64-bit halves.
+    punpckhwd   m5,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+    ; Store samples 4-7 of every row at byte offset 8.
+    movh        [r0 + 8],            m3
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; step the output pointer to row 4
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_10_new, 3,6,3                ; 8x8 intra predictor no. 10 for 16-bit samples: each output row is one reference sample replicated; r0 = output, r1 = row step, r2 = reference samples
+    movu        m1,             [r2 + 34]           ; [8 7 6 5 4 3 2 1]
+    pshufb      m0,             m1, [pw_unpackwdq]  ; [1 1 1 1 1 1 1 1]
+    add         r1,             r1                  ; double the row step (presumably samples -> bytes; confirm with caller)
+    lea         r3,             [r1 * 3]            ; r3 = 3 * row step
+    ; Rows 1-3: shift the next sample into the lowest word, replicate it, store the row. Row 0 (m0) is written last, at .quit.
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [2 2 2 2 2 2 2 2]
+    movu        [r0 + r1],      m2
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [3 3 3 3 3 3 3 3]
+    movu        [r0 + r1 * 2],  m2
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [4 4 4 4 4 4 4 4]
+    movu        [r0 + r3],      m2
+    ; Rows 4-7 through r5. r4 is not used as scratch; NOTE(review): presumably because the fifth argument read via r4m can live in r4 on some ABIs - confirm.
+    lea         r5,             [r0 + r1 *4]
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [5 5 5 5 5 5 5 5]
+    movu        [r5],           m2
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [6 6 6 6 6 6 6 6]
+    movu        [r5 + r1],      m2
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [7 7 7 7 7 7 7 7]
+    movu        [r5 + r1 * 2],  m2
+    psrldq      m1,             2
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [8 8 8 8 8 8 8 8]
+    movu        [r5 + r3],      m2
+    ; The fifth argument selects the optional row-0 filter; zero skips it.
+    cmp         r4m,            byte 0
+    jz         .quit
+
+    ; filter: row0[x] = clamp(row0[x] + ((ref[x + 1] - ref[0]) >> 1)), with ref indexed in words from r2
+
+    movh        m1,             [r2]                ; [3 2 1 0]
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
+    movu        m1,             [r2 + 2]            ; [8 7 6 5 4 3 2 1]
+    psubw       m1,             m2                  ; difference of each sample against sample 0
+    psraw       m1,             1                   ; arithmetic shift keeps the sign of the difference
+    paddw       m0,             m1
+    pxor        m1,             m1                  ; m1 = 0, the lower clamp bound
+    pmaxsw      m0,             m1
+    pminsw      m0,             [pw_1023]           ; upper clamp bound (by name 1023, presumably the 10-bit sample maximum)
+.quit:
+    movu        [r0],           m0                  ; row 0, filtered or not
+    RET
+
+cglobal intra_pred_ang8_11_new, 3,5,7                ; 8x8 angular intra predictor no. 11 for 16-bit samples; r0 = output, r1 = row step, r2 = reference samples
+    lea         r3,        [ang_table + 23 * 16]      ; r3 -> ang_table entry 23; entry n is addressed as r3 + (n - 23) * 16
+    add         r1,        r1                         ; double the row step (presumably samples -> bytes; confirm with caller)
+    ; Load two reference vectors one sample apart; sample 0 is taken from [r2] rather than from offset 32.
+    movu        m0,        [r2 + 32]                       ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; overwrite the lowest word with the word at [r2]
+    movu        m1,        [r2 + 34]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+    ; Per table entry: pmaddwd each pair with the entry, add pd_16, shift right by 5, pack both halves into eight words.
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 7 * 16]              ; [30]
+    paddd       m4,        [pd_16]                    ; pd_16 is defined elsewhere (by name: dwords of 16, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 7 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = output column 0 (one lane per row)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 5 * 16]              ; [28]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = output column 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 3 * 16]              ; [26]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = output column 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 1 * 16]              ; [24]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; m5 = output column 3
+    ; Transpose the four column vectors so that each 64-bit half holds samples 0-3 of one output row.
+    punpckhwd   m1,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; rows 0 and 1
+    punpckhdq   m4,        m6                         ; rows 2 and 3
+    punpckldq   m6,        m1, m2                     ; rows 4 and 5
+    punpckhdq   m1,        m2                         ; rows 6 and 7
+    ; Store samples 0-3 (first 8 bytes) of all eight rows.
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 reused as the row-4 pointer; no reference loads follow
+    movh        [r2],            m6
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m1
+    movhps      [r2 + r4],       m1
+    ; Second half: the same pair vectors with table entries 22..16 give output columns 4-7.
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 1 * 16]              ; [22]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 3 * 16]              ; [20]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 5 * 16]              ; [18]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 5 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+    ; Last use of the pair vectors m3/m0, so they are overwritten in place instead of copied.
+    pmaddwd     m3,        [r3 - 7 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 - 7 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+    ; Transpose columns 4-7 (m4, m2, m6, m3) into per-row 64-bit halves.
+    punpckhwd   m5,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+    ; Store samples 4-7 of every row at byte offset 8.
+    movh        [r0 + 8],            m3
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; step the output pointer to row 4
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_12_new, 3,6,7                ; 8x8 angular intra predictor no. 12 for 16-bit samples; r0 = output, r1 = row step, r2 = reference samples
+    lea         r5,        [ang_table + 16 * 16]      ; r5 -> ang_table entry 16; entry n is addressed as r5 + (n - 16) * 16
+    add         r1,        r1                         ; double the row step (presumably samples -> bytes; confirm with caller)
+    ; Load two reference vectors one sample apart; sample 0 is taken from [r2] rather than from offset 32.
+    movu        m0,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; overwrite the lowest word with the word at [r2]
+    movu        m1,        [r2 + 34]                  ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+    ; Per table entry: pmaddwd each pair with the entry, add pd_16, shift right by 5, pack both halves into eight words.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 11 * 16]             ; [27]
+    paddd       m4,        [pd_16]                    ; pd_16 is defined elsewhere (by name: dwords of 16, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 11 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = output column 0 (one lane per row)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 + 6 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = output column 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 + 1 * 16]              ; [17]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = output column 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r5 - 4 * 16]              ; [12]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; m5 = output column 3
+    ; Transpose the four column vectors so that each 64-bit half holds samples 0-3 of one output row.
+    punpckhwd   m1,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; rows 0 and 1
+    punpckhdq   m4,        m6                         ; rows 2 and 3
+    punpckldq   m6,        m1, m2                     ; rows 4 and 5
+    punpckhdq   m1,        m2                         ; rows 6 and 7
+    ; Store samples 0-3 (first 8 bytes) of all eight rows.
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r3,              [r0 + r1 * 4]        ; r3 is the row-4 pointer; r2 must stay intact for the reference load below
+    movh        [r3],            m6
+    movhps      [r3 + r1],       m6
+    movh        [r3 + r1 * 2],   m1
+    movhps      [r3 + r4],       m1
+    ; Second half: columns 4 and 5 still use the original pair window.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 - 9 * 16]              ; [7]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 - 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 - 14 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+    ; Slide the pair window down by one sample; the new lowest pair comes from the top two words of the shuffled vector m1.
+    palignr     m0,        m3, 12                     ; [7 6 6 5 5 4 4 3]
+    movu        m1,        [r2]
+    pshufb      m1,        [pw_ang8_12]               ; reorder the first 16 reference bytes with a mask defined elsewhere
+    palignr     m3,        m1, 12                     ; [3 2 2 1 1 0] above the top dword of m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 + 13 * 16]             ; [29]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 + 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+    ; Last use of the pair vectors m3/m0, so they are overwritten in place instead of copied.
+    pmaddwd     m3,        [r5 + 8 * 16]              ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r5 + 8 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+    ; Transpose columns 4-7 (m4, m2, m6, m3) into per-row 64-bit halves.
+    punpckhwd   m5,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+    ; Store samples 4-7 of every row at byte offset 8.
+    movh        [r0 + 8],            m3
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; step the output pointer to row 4
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_13_new, 3,6,8                ; 8x8 angular intra predictor no. 13 for 16-bit samples; r0 = output, r1 = row step, r2 = reference samples
+    lea         r5,        [ang_table + 14 * 16]      ; r5 -> ang_table entry 14; entry n is addressed as r5 + (n - 14) * 16
+    add         r1,        r1                         ; double the row step (presumably samples -> bytes; confirm with caller)
+    ; Load two reference vectors one sample apart; sample 0 is taken from [r2] rather than from offset 32.
+    movu        m0,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; overwrite the lowest word with the word at [r2]
+    movu        m1,        [r2 + 34]                  ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+    ; Per table entry: pmaddwd each pair with the entry, add pd_16, shift right by 5, pack both halves into eight words.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 9 * 16]              ; [23]
+    paddd       m4,        [pd_16]                    ; pd_16 is defined elsewhere (by name: dwords of 16, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = output column 0 (one lane per row)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5]                       ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = output column 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 - 9 * 16]              ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = output column 2
+    ; Slide the pair window down by one sample; the new lowest pair comes from the top two words of the shuffled vector m1.
+    palignr     m0,        m3, 12                     ; [7 6 6 5 5 4 4 3]
+    movu        m1,        [r2]
+    pshufb      m1,        [pw_ang8_13]               ; reorder the first 16 reference bytes with a mask defined elsewhere
+    palignr     m3,        m1, 12                     ; [3 2 2 1 1 0] above the top dword of m1
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r5 + 14 * 16]             ; [28]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0                         ; m7 is the temporary here because m1 must survive for the next window slide
+    pmaddwd     m7,        [r5 + 14 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = output column 3
+    ; Transpose the four column vectors so that each 64-bit half holds samples 0-3 of one output row.
+    punpckhwd   m7,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; rows 0 and 1
+    punpckhdq   m4,        m6                         ; rows 2 and 3
+    punpckldq   m6,        m7, m2                     ; rows 4 and 5
+    punpckhdq   m7,        m2                         ; rows 6 and 7
+    ; Store samples 0-3 (first 8 bytes) of all eight rows.
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 reused as the row-4 pointer; no reference loads follow
+    movh        [r2],            m6
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m7
+    movhps      [r2 + r4],       m7
+    ; Second half: columns 4-6 use the current (once-shifted) pair window.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 5 * 16]              ; [19]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 5 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 - 4 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 4 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 - 13 * 16]             ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+    ; Slide the window once more: move m1 up one word so its next pair sits in the top dword, then shift it in.
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    ; Last use of the pair vectors m3/m0, so they are overwritten in place instead of copied.
+    pmaddwd     m3,        [r5 + 10 * 16]             ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r5 + 10 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+    ; Transpose columns 4-7 (m4, m2, m6, m3) into per-row 64-bit halves.
+    punpckhwd   m5,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+    ; Store samples 4-7 of every row at byte offset 8.
+    movh        [r0 + 8],            m3
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; step the output pointer to row 4
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_14_new, 3,6,8                ; 8x8 angular intra predictor no. 14 for 16-bit samples; r0 = output, r1 = row step, r2 = reference samples
+    lea         r5,        [ang_table + 18 * 16]      ; r5 -> ang_table entry 18; entry n is addressed as r5 + (n - 18) * 16
+    add         r1,        r1                         ; double the row step (presumably samples -> bytes; confirm with caller)
+    ; Load two reference vectors one sample apart; sample 0 is taken from [r2] rather than from offset 32.
+    movu        m0,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; overwrite the lowest word with the word at [r2]
+    movu        m1,        [r2 + 34]                  ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+    ; Per table entry: pmaddwd each pair with the entry, add pd_16, shift right by 5, pack both halves into eight words.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 1 * 16]              ; [19]
+    paddd       m4,        [pd_16]                    ; pd_16 is defined elsewhere (by name: dwords of 16, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = output column 0 (one lane per row)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 - 12 * 16]             ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r5 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = output column 1
+    ; Slide the pair window down by one sample; the new lowest pair comes from the top two words of the shuffled vector m1.
+    palignr     m0,        m3, 12                     ; [7 6 6 5 5 4 4 3]
+    movu        m1,        [r2]
+    pshufb      m1,        [pw_ang8_14]               ; reorder the first 16 reference bytes with a mask defined elsewhere
+    palignr     m3,        m1, 12                     ; [3 2 2 1 1 0] above the top dword of m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 + 7 * 16]              ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 + 7 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = output column 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r5 - 6 * 16]              ; [12]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0                         ; m7 is the temporary here because m1 must survive for the later window slides
+    pmaddwd     m7,        [r5 - 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = output column 3
+    ; Transpose the four column vectors so that each 64-bit half holds samples 0-3 of one output row.
+    punpckhwd   m7,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; rows 0 and 1
+    punpckhdq   m4,        m6                         ; rows 2 and 3
+    punpckldq   m6,        m7, m2                     ; rows 4 and 5
+    punpckhdq   m7,        m2                         ; rows 6 and 7
+    ; Store samples 0-3 (first 8 bytes) of all eight rows.
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r2,              [r0 + r1 * 4]        ; r2 reused as the row-4 pointer; no reference loads follow
+    movh        [r2],            m6
+    movhps      [r2 + r1],       m6
+    movh        [r2 + r1 * 2],   m7
+    movhps      [r2 + r4],       m7
+    ; Slide the window again: move m1 up one word so its next pair sits in the top dword, then shift it in.
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    ; Second half: columns 4-6 use this twice-shifted window.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 13 * 16]             ; [31]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5]                       ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 - 13 * 16]             ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+    ; Third and final window slide, for column 7.
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    ; Last use of the pair vectors m3/m0, so they are overwritten in place instead of copied.
+    pmaddwd     m3,        [r5 + 6 * 16]              ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r5 + 6 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+    ; Transpose columns 4-7 (m4, m2, m6, m3) into per-row 64-bit halves.
+    punpckhwd   m5,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+    ; Store samples 4-7 of every row at byte offset 8.
+    movh        [r0 + 8],            m3
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; step the output pointer to row 4
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_15_new, 3,6,8                ; 8x8 angular intra predictor no. 15 for 16-bit samples; r0 = output, r1 = row step, r2 = reference samples
+    lea         r5,        [ang_table + 20 * 16]      ; r5 -> ang_table entry 20; entry n is addressed as r5 + (n - 20) * 16
+    add         r1,        r1                         ; double the row step (presumably samples -> bytes; confirm with caller)
+    ; Load two reference vectors one sample apart; sample 0 is taken from [r2] rather than from offset 32.
+    movu        m0,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; overwrite the lowest word with the word at [r2]
+    movu        m1,        [r2 + 34]                  ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+    ; Per table entry: pmaddwd each pair with the entry, add pd_16, shift right by 5, pack both halves into eight words.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 - 5 * 16]              ; [15]
+    paddd       m4,        [pd_16]                    ; pd_16 is defined elsewhere (by name: dwords of 16, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 - 5 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = output column 0 (one lane per row)
+    ; Slide the pair window down by one sample; the new lowest pair comes from the top two words of the shuffled vector m1.
+    palignr     m0,        m3, 12                     ; [7 6 6 5 5 4 4 3]
+    movu        m1,        [r2]
+    pshufb      m1,        [pw_ang8_15]               ; reorder the first 16 reference bytes with a mask defined elsewhere
+    palignr     m3,        m1, 12                     ; [3 2 2 1 1 0] above the top dword of m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 + 10 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 + 10 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = output column 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 - 7 * 16]              ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 7 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = output column 2
+    ; Slide the window again: move m1 up one word so its next pair sits in the top dword, then shift it in.
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r5 + 8 * 16]              ; [28]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0                         ; m7 is the temporary here because m1 must survive for the later window slides
+    pmaddwd     m7,        [r5 + 8 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = output column 3
+    ; Transpose the four column vectors so that each 64-bit half holds samples 0-3 of one output row.
+    punpckhwd   m7,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; rows 0 and 1
+    punpckhdq   m4,        m6                         ; rows 2 and 3
+    punpckldq   m6,        m7, m2                     ; rows 4 and 5
+    punpckhdq   m7,        m2                         ; rows 6 and 7
+    ; Store samples 0-3 (first 8 bytes) of all eight rows.
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movh        [r0],            m5
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r3,              [r0 + r1 * 4]        ; r3 is the row-4 pointer; r2 must stay intact for the pinsrw load below
+    movh        [r3],            m6
+    movhps      [r3 + r1],       m6
+    movh        [r3 + r1 * 2],   m7
+    movhps      [r3 + r4],       m7
+    ; Second half: column 4 still uses the current (twice-shifted) window.
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 - 9 * 16]              ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 - 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+    ; Third window slide, used for columns 5 and 6.
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 + 6 * 16]              ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 + 6 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 - 11 * 16]             ; [9]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 11 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+    ; Fourth window slide, for column 7.
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    pinsrw      m3,        [r2 + 16], 0               ; lowest word comes from byte offset 16, just past the 16 bytes that were loaded into m1
+    ; Last use of the pair vectors m3/m0, so they are overwritten in place instead of copied.
+    pmaddwd     m3,        [r5 + 4 * 16]              ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r5 + 4 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+    ; Transpose columns 4-7 (m4, m2, m6, m3) into per-row 64-bit halves.
+    punpckhwd   m5,        m4, m2
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+    ; Store samples 4-7 of every row at byte offset 8.
+    movh        [r0 + 8],            m3
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; step the output pointer to row 4
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_16_new, 3,6,8                ; 8x8 angular intra prediction, mode 16, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r5,        [ang_table + 13 * 16]      ; r5 -> ang_table row 13; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; replace word 0 with the word stored at [r2]
+    movu        m1,        [r2 + 34]                  ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r5 - 2 * 16]              ; [11]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 - 2 * 16]              ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 result words (first result vector)
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2]                       ; eight words starting at r2
+    pshufb      m1,        [pw_ang8_16]               ; reorder them with the pw_ang8_16 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 + 9 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 + 9 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = second result vector
+
+    mova        m6,        m3                         ; window is not moved between [22] and [1]
+    pmaddwd     m6,        [r5 - 12 * 16]             ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 12 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = third result vector
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r5 - 1 * 16]              ; [12]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r5 - 1 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = fourth result vector
+
+    punpckhwd   m7,        m4, m2                     ; transpose the four 8-word vectors (m4, m2, m6, m5): word interleave first
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; then dword interleave: each 8-byte half now holds element i of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m7, m2
+    punpckhdq   m7,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movh        [r0],            m5                   ; rows 0..7, first four samples of each row
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r3,              [r0 + r1 * 4]        ; r3 = dst + 4 rows
+    movh        [r3],            m6
+    movhps      [r3 + r1],       m6
+    movh        [r3 + r1 * 2],   m7
+    movhps      [r3 + r4],       m7
+
+    pslldq      m1,        2                          ; second group of four result vectors: keep sliding the window
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 10 * 16]             ; [23]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 10 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3                         ; window is not moved between [23] and [2]
+    pmaddwd     m2,        [r5 - 11 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 11 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5]                       ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    pinsrw      m3,        [r2 + 16], 0               ; word 0 comes from [r2 + 16], which lies beyond the eight words loaded into m1
+
+    pmaddwd     m3,        [r5 + 11 * 16]             ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r5 + 11 * 16]             ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+
+    punpckhwd   m5,        m4, m2                     ; transpose the second group (m4, m2, m6, m3)
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+
+    movh        [r0 + 8],            m3               ; rows 0..7, samples 4..7 (byte offset 8)
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst by 4 rows
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_17_new, 3,6,8                ; 8x8 angular intra prediction, mode 17, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r5,        [ang_table + 17 * 16]      ; r5 -> ang_table row 17; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pinsrw      m0,        [r2], 0                    ; replace word 0 with the word stored at [r2]
+    movu        m1,        [r2 + 34]                  ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r5 - 11 * 16]             ; [6]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 - 11 * 16]             ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 result words (first result vector)
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2]                       ; eight words starting at r2
+    pshufb      m1,        [pw_ang8_17]               ; reorder them with the pw_ang8_17 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r5 - 5 * 16]              ; [12]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 5 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = second result vector
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 + 1 * 16]              ; [18]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 + 1 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = third result vector
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r5 + 7 * 16]              ; [24]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r5 + 7 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = fourth result vector
+
+    punpckhwd   m7,        m4, m2                     ; transpose the four 8-word vectors (m4, m2, m6, m5): word interleave first
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m5
+    punpcklwd   m6,        m5
+
+    punpckldq   m5,        m4, m6                     ; then dword interleave: each 8-byte half now holds element i of all four vectors
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m7, m2
+    punpckhdq   m7,        m2
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movh        [r0],            m5                   ; rows 0..7, first four samples of each row
+    movhps      [r0 + r1],       m5
+    movh        [r0 + r1 * 2],   m4
+    movhps      [r0 + r4],       m4
+    lea         r3,              [r0 + r1 * 4]        ; r3 = dst + 4 rows
+    movh        [r3],            m6
+    movhps      [r3 + r1],       m6
+    movh        [r3 + r1 * 2],   m7
+    movhps      [r3 + r4],       m7
+
+    pslldq      m1,        2                          ; second group of four result vectors: keep sliding the window
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r5 + 13 * 16]             ; [30]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r5 + 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m3                         ; window is not moved between [30] and [4]
+    pmaddwd     m2,        [r5 - 13 * 16]             ; [4]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r5 - 7 * 16]              ; [10]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r5 - 7 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    pmaddwd     m3,        [r5 - 1 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r5 - 1 * 16]              ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+
+    punpckhwd   m5,        m4, m2                     ; transpose the second group (m4, m2, m6, m3)
+    punpcklwd   m4,        m2
+    punpckhwd   m2,        m6, m3
+    punpcklwd   m6,        m3
+
+    punpckldq   m3,        m4, m6
+    punpckhdq   m4,        m6
+    punpckldq   m6,        m5, m2
+    punpckhdq   m5,        m2
+
+    movh        [r0 + 8],            m3               ; rows 0..7, samples 4..7 (byte offset 8)
+    movhps      [r0 + r1 + 8],       m3
+    movh        [r0 + r1 * 2 + 8],   m4
+    movhps      [r0 + r4 + 8],       m4
+    lea         r0,                  [r0 + r1 * 4]    ; advance dst by 4 rows
+    movh        [r0 + 8],            m6
+    movhps      [r0 + r1 + 8],       m6
+    movh        [r0 + r1 * 2 + 8],   m5
+    movhps      [r0 + r4 + 8],       m5
+    RET
+
+cglobal intra_pred_ang8_18_new, 3,4,3                ; 8x8 angular intra prediction, mode 18, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    add         r1,              r1                   ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+    lea         r3,              [r1 * 3]             ; r3 = 3 * stride
+    movu        m1,              [r2]                 ; eight words starting at r2
+    movu        m0,              [r2 + 34]            ; eight words starting at r2 + 34
+    pshufb      m0,              [pw_swap16]          ; reorder m0 with pw_swap16 (presumably reverses the word order; mask not visible here)
+    movu        [r0],            m1                   ; row 0 is m1 unchanged
+    palignr     m2,              m1, m0, 14           ; row 1: bytes 14..29 of m1:m0, i.e. one word of m0 enters at the low end
+    movu        [r0 + r1],       m2
+    palignr     m2,              m1, m0, 12           ; each following row takes one more word from m0
+    movu        [r0 + r1 * 2],   m2
+    palignr     m2,              m1, m0, 10
+    movu        [r0 + r3],       m2
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    palignr     m2,              m1, m0, 8
+    movu        [r0],            m2
+    palignr     m2,              m1, m0, 6
+    movu        [r0 + r1],       m2
+    palignr     m2,              m1, m0, 4
+    movu        [r0 + r1 * 2],   m2
+    palignr     m1,              m0, 2                ; last row: m1 is no longer needed, so shift it in place
+    movu        [r0 + r3],       m1
+    RET
+
+cglobal intra_pred_ang8_19_new, 3,5,8                ; 8x8 angular intra prediction, mode 19, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 17 * 16]      ; r3 -> ang_table row 17; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r3 - 11 * 16]             ; [6]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 11 * 16]             ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 0 (8 words)
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2 + 32]                  ; eight words starting at r2 + 32 ...
+    pinsrw      m1,        [r2], 0                    ; ... with word 0 replaced by the word stored at [r2]
+    pshufb      m1,        [pw_ang8_17]               ; reorder with the pw_ang8_17 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 5 * 16]              ; [12]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 5 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 1
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 1 * 16]              ; [18]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 1 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 2
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 7 * 16]              ; [24]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 + 7 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movu        [r0],            m4                   ; store rows 0..3, one full 8-sample row each
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    pslldq      m1,        2                          ; rows 4..7: keep sliding the window
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 13 * 16]             ; [30]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 4
+
+    mova        m2,        m3                         ; window is not moved between [30] and [4]
+    pmaddwd     m2,        [r3 - 13 * 16]             ; [4]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 5
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 7 * 16]              ; [10]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 7 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 6
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    pmaddwd     m3,        [r3 - 1 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 - 1 * 16]              ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; m3 = row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    movu        [r0],            m4                   ; store rows 4..7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_20_new, 3,5,8                ; 8x8 angular intra prediction, mode 20, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 13 * 16]      ; r3 -> ang_table row 13; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r3 - 2 * 16]              ; [11]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 2 * 16]              ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 0 (8 words)
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2 + 32]                  ; eight words starting at r2 + 32 ...
+    pinsrw      m1,        [r2], 0                    ; ... with word 0 replaced by the word stored at [r2]
+    pshufb      m1,        [pw_ang8_16]               ; reorder with the pw_ang8_16 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 9 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 9 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 1
+
+    mova        m6,        m3                         ; window is not moved between [22] and [1]
+    pmaddwd     m6,        [r3 - 12 * 16]             ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 12 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 2
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 - 1 * 16]              ; [12]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 - 1 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movu        [r0],            m4                   ; store rows 0..3, one full 8-sample row each
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    pslldq      m1,        2                          ; rows 4..7: keep sliding the window
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 10 * 16]             ; [23]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 10 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 4
+
+    mova        m2,        m3                         ; window is not moved between [23] and [2]
+    pmaddwd     m2,        [r3 - 11 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 11 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 5
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3]                       ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 6
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    pinsrw      m3,        [r2 + 16 + 32], 0          ; word 0 comes from [r2 + 48], which lies beyond the eight words loaded into m1
+
+    pmaddwd     m3,        [r3 + 11 * 16]             ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 11 * 16]             ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; m3 = row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    movu        [r0],            m4                   ; store rows 4..7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_21_new, 3,5,8                ; 8x8 angular intra prediction, mode 21, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 20 * 16]      ; r3 -> ang_table row 20; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r3 - 5 * 16]              ; [15]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 5 * 16]              ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 0 (8 words)
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2 + 32]                  ; eight words starting at r2 + 32 ...
+    pinsrw      m1,        [r2], 0                    ; ... with word 0 replaced by the word stored at [r2]
+    pshufb      m1,        [pw_ang8_15]               ; reorder with the pw_ang8_15 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 10 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 10 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 1
+
+    mova        m6,        m3                         ; window is not moved between [30] and [13]
+    pmaddwd     m6,        [r3 - 7 * 16]              ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 7 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 2
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 8 * 16]              ; [28]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 + 8 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movu        [r0],            m4                   ; store rows 0..3, one full 8-sample row each
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    mova        m4,        m3                         ; row 4 reuses the window position of row 3
+    pmaddwd     m4,        [r3 - 9 * 16]              ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 4
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 6 * 16]              ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 6 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 5
+
+    mova        m6,        m3                         ; window is not moved between [26] and [9]
+    pmaddwd     m6,        [r3 - 11 * 16]             ; [9]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 11 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 6
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+    pinsrw      m3,        [r2 + 16 + 32], 0          ; word 0 comes from [r2 + 48], which lies beyond the eight words loaded into m1
+
+    pmaddwd     m3,        [r3 + 4 * 16]              ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 4 * 16]              ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; m3 = row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    movu        [r0],            m4                   ; store rows 4..7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_22_new, 3,5,8                ; 8x8 angular intra prediction, mode 22, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 18 * 16]      ; r3 -> ang_table row 18; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r3 + 1 * 16]              ; [19]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 1 * 16]              ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 0 (8 words)
+
+    mova        m2,        m3                         ; window is not moved between [19] and [6]
+    pmaddwd     m2,        [r3 - 12 * 16]             ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0                         ; m1 is free as scratch until it is reloaded below
+    pmaddwd     m1,        [r3 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = row 1
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2 + 32]                  ; eight words starting at r2 + 32 ...
+    pinsrw      m1,        [r2], 0                    ; ... with word 0 replaced by the word stored at [r2]
+    pshufb      m1,        [pw_ang8_14]               ; reorder with the pw_ang8_14 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 7 * 16]              ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 7 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 2
+
+    mova        m5,        m3                         ; window is not moved between [25] and [12]
+    pmaddwd     m5,        [r3 - 6 * 16]              ; [12]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 - 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movu        [r0],            m4                   ; store rows 0..3, one full 8-sample row each
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 13 * 16]             ; [31]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 4
+
+    mova        m2,        m3                         ; rows 5 and 6 reuse the window position of row 4
+    pmaddwd     m2,        [r3]                       ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 13 * 16]             ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 6
+
+    pslldq      m1,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m1, 12
+
+    pmaddwd     m3,        [r3 + 6 * 16]              ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 6 * 16]              ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; m3 = row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    movu        [r0],            m4                   ; store rows 4..7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_23_new, 3,5,8                ; 8x8 angular intra prediction, mode 23, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 14 * 16]      ; r3 -> ang_table row 14; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the sliding pair window
+    pmaddwd     m4,        [r3 + 9 * 16]              ; [23]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 9 * 16]              ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 0 (8 words)
+
+    mova        m2,        m3                         ; rows 1 and 2 reuse the window position of row 0
+    pmaddwd     m2,        [r3]                       ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0                         ; m1 is free as scratch until it is reloaded below
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = row 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 9 * 16]              ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = row 2
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2 + 32]                  ; eight words starting at r2 + 32 ...
+    pinsrw      m1,        [r2], 0                    ; ... with word 0 replaced by the word stored at [r2]
+    pshufb      m1,        [pw_ang8_13]               ; reorder with the pw_ang8_13 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 14 * 16]             ; [28]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 + 14 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m5,        m7                         ; m5 = row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movu        [r0],            m4                   ; store rows 0..3, one full 8-sample row each
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    mova        m4,        m3                         ; rows 4..6 reuse the window position of row 3
+    pmaddwd     m4,        [r3 + 5 * 16]              ; [19]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 5 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 4
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 4 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 4 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m2,        m5                         ; m2 = row 5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 13 * 16]             ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 6
+
+    pslldq      m1,        2                          ; move m1 up one word so its top dword holds the next pair to shift in
+    palignr     m0,        m3, 12                     ; slide the m0:m3 window by one dword ...
+    palignr     m3,        m1, 12                     ; ... pulling the new pair in from m1
+
+    pmaddwd     m3,        [r3 + 10 * 16]             ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 10 * 16]             ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; m3 = row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    movu        [r0],            m4                   ; store rows 4..7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_24_new, 3,5,7                ; 8x8 angular intra prediction, mode 24, 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 16 * 16]      ; r3 -> ang_table row 16; the other rows are reached with signed offsets from here
+    add         r1,        r1                         ; r1 *= 2 (presumably sample stride -> byte stride; confirm against caller)
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; work on copies: m3/m0 stay live as the pair window
+    pmaddwd     m4,        [r3 + 11 * 16]             ; [27]
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds packed dword 16)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 11 * 16]             ; same table row applied to the upper four pairs
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 0 (8 words)
+
+    mova        m2,        m3                         ; rows 1..5 reuse the window position of row 0
+    pmaddwd     m2,        [r3 + 6 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0                         ; m1 is free as scratch until it is reloaded below
+    pmaddwd     m1,        [r3 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = row 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 1 * 16]              ; [17]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; m6 = row 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 - 4 * 16]              ; [12]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; m5 = row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * stride
+    movu        [r0],            m4                   ; store rows 0..3, one full 8-sample row each
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 9 * 16]              ; [7]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = row 4
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 14 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; m2 = row 5
+
+    palignr     m0,        m3, 12                     ; m0 = top dword of m3 followed by the low three dwords of m0
+    movu        m1,        [r2 + 32]                  ; eight words starting at r2 + 32 ...
+    pinsrw      m1,        [r2], 0                    ; ... with word 0 replaced by the word stored at [r2]
+    pshufb      m1,        [pw_ang8_12]               ; reorder with the pw_ang8_12 byte-shuffle mask (mask contents not visible here)
+    palignr     m3,        m1, 12                     ; m3 = top dword of m1 followed by the low three dwords of m3
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [29]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 13 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; m6 = row 6
+
+    pmaddwd     m3,        [r3 + 8 * 16]              ; [24]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 8 * 16]              ; last use of the window, so m3/m0 are consumed in place
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; m3 = row 7 (same window position as row 6)
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst by 4 rows
+    movu        [r0],            m4                   ; store rows 4..7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_25_new, 3,5,7                ; 8x8 angular intra prediction (mode 25 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 23 * 16]      ; r3 -> ang_table entry 23 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3                         ; every row reuses the same neighbour pairs (m3 = low half, m0 = high half)
+    pmaddwd     m4,        [r3 + 7 * 16]              ; [30]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 7 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 5 * 16]              ; [28]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 3 * 16]              ; [26]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; row 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 1 * 16]              ; [24]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 1 * 16]              ; [22]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 3 * 16]              ; [20]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 5 * 16]              ; [18]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 - 5 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; row 6
+
+    pmaddwd     m3,        [r3 - 7 * 16]              ; [16] (last use of m3/m0, so they are consumed in place)
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 - 7 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_26_new, 3,6,3                ; 8x8 mode 26 (per the name): replicate one reference row into all 8 rows, then optionally filter column 0
+    movu        m0,             [r2 + 2]            ; [8 7 6 5 4 3 2 1]
+    add         r1,             r1                  ; r1 *= 2: byte step between rows of 16-bit samples
+    lea         r5,             [r1 * 3]            ; r5 = 3 * row step
+
+    movu        [r0],           m0                  ; rows 0-3 are identical copies of m0
+    movu        [r0 + r1],      m0
+    movu        [r0 + r1 * 2],  m0
+    movu        [r0 + r5],      m0
+
+    lea         r3,             [r0 + r1 *4]        ; r3 -> row 4
+    movu        [r3],           m0                  ; rows 4-7 are identical copies of m0
+    movu        [r3 + r1],      m0
+    movu        [r3 + r1 * 2],  m0
+    movu        [r3 + r5],      m0
+
+    cmp         r4m,            byte 0              ; 5th argument == 0 -> no filtering, done
+    jz         .quit
+
+    ; filter: overwrite the first sample of every row
+    pshufb      m0,             [pw_unpackwdq]      ; presumably broadcasts word 0 (sample 1) to all lanes - confirm against pw_unpackwdq
+    pinsrw      m1,             [r2], 0             ; only word 0 is written (sample 0 at [r2]); other lanes of m1 are unused
+    pshufb      m2,             m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
+    movu        m1,             [r2 + 2 + 32]       ; 8 samples from byte offset 34 - a different run than m0 (presumably the other neighbour edge; TODO confirm layout)
+    psubw       m1,             m2                  ; m1[i] = edge[i] - sample 0
+    psraw       m1,             1                   ; arithmetic >> 1 keeps the sign of the difference
+    paddw       m0,             m1                  ; m0[i] = sample 1 + ((edge[i] - sample 0) >> 1)
+    pxor        m1,             m1
+    pmaxsw      m0,             m1                  ; clamp below at 0
+    pminsw      m0,             [pw_1023]           ; clamp above at pw_1023 (presumably 1023, the 10-bit maximum)
+    pextrw      [r0],          m0, 0                ; word i replaces the first sample of row i
+    pextrw      [r0 + r1],     m0, 1
+    pextrw      [r0 + r1 * 2], m0, 2
+    pextrw      [r0 + r5],     m0, 3
+    pextrw      [r3],          m0, 4
+    pextrw      [r3 + r1],     m0, 5
+    pextrw      [r3 + r1 * 2], m0, 6
+    pextrw      [r3 + r5],     m0, 7
+.quit:
+    RET
+
+cglobal intra_pred_ang8_27_new, 3,5,7                ; 8x8 angular intra prediction (mode 27 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 9 * 16]       ; r3 -> ang_table entry 9 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
+
+    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
+
+    mova        m4,        m3                         ; every row reuses the same neighbour pairs (m3 = low half, m0 = high half)
+    pmaddwd     m4,        [r3 - 7 * 16]              ; [2]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 7 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 5 * 16]              ; [4]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 3 * 16]              ; [6]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; row 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 - 1 * 16]              ; [8]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 1 * 16]              ; [10]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 3 * 16]              ; [12]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 5 * 16]              ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m5,        m0
+    pmaddwd     m5,        [r3 + 5 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m6,        m5                         ; row 6
+
+    pmaddwd     m3,        [r3 + 7 * 16]              ; [16] (last use of m3/m0, so they are consumed in place)
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 7 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m3
+    RET
+
+cglobal intra_pred_ang8_28_new, 3,5,7                ; 8x8 angular intra prediction (mode 28 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 17 * 16]      ; r3 -> ang_table entry 17 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
+
+    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
+
+    mova        m4,        m3                         ; rows 0-5 share these neighbour pairs (m3 = low half, m0 = high half)
+    pmaddwd     m4,        [r3 - 12 * 16]             ; [5]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 12 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 7 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 2 * 16]              ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; row 2
+
+    mova        m5,        m3
+    pmaddwd     m5,        [r3 + 3 * 16]              ; [20]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 8 * 16]              ; [25]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 8 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 13 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 5
+
+    movh        m1,        [r2 + 18]                  ; 8-byte load: [12 11 10 9]; only samples 9 and 10 are consumed below
+
+    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m5,        m6                         ; keep the pairs shifted by one sample for row 7
+    pmaddwd     m6,        [r3 - 14 * 16]             ; [3]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m1,        m0, 4                      ; [10 9 9 8 8 7 7 6]
+    mova        m3,        m1                         ; m3 is free now; reuse it as the high-half copy for row 7
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; row 6
+
+    pmaddwd     m5,        [r3 - 9 * 16]              ; [8]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    pmaddwd     m3,        [r3 - 9 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m5,        m3                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m5
+    RET
+
+cglobal intra_pred_ang8_29_new, 3,5,8                ; 8x8 angular intra prediction (mode 29 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 18 * 16]      ; r3 -> ang_table entry 18 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; m3/m0/m5 hold consecutive neighbour pairs; palignr below slides the window
+    pmaddwd     m4,        [r3 - 9 * 16]              ; [9]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3]                       ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 9 * 16]              ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; row 2
+
+    palignr     m7,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m7,        [r3 - 14 * 16]             ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    palignr     m4,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m4                         ; m6/m7 keep the shifted pairs for rows 5 and 6
+    pmaddwd     m4,        [r3 - 5 * 16]              ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m2
+    pmaddwd     m2,        [r3 - 5 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    pmaddwd     m2,        m6, [r3 + 4 * 16]          ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m1,        m7, [r3 + 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 5
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 6
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 - 10 * 16]             ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m5,        m0, 8                      ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m5,        [r3 - 10 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m7,        m5                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+    RET
+
+cglobal intra_pred_ang8_30_new, 3,5,8                ; 8x8 angular intra prediction (mode 30 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 14 * 16]      ; r3 -> ang_table entry 14 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; m3/m0/m5 hold consecutive neighbour pairs; palignr below slides the window
+    pmaddwd     m4,        [r3 - 1 * 16]              ; [13]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 12 * 16]             ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m7,        m6                         ; keep the shifted pairs for row 3
+    pmaddwd     m6,        [r3 - 7 * 16]              ; [7]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1                         ; row 2
+
+    pmaddwd     m7,        [r3 + 6 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    mova        m6,        m4                         ; m6/m7 keep the shifted pairs for rows 5 and 6
+    pmaddwd     m4,        [r3 - 13 * 16]             ; [1]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    mova        m7,        m2
+    pmaddwd     m2,        [r3 - 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    pmaddwd     m2,        m6, [r3]                   ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m1,        m7, [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 5
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 6
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 6 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m5,        m0, 12                     ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m5,        [r3 - 6 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m7,        m5                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+    RET
+
+cglobal intra_pred_ang8_31_new, 3,5,8                ; 8x8 angular intra prediction (mode 31 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 13 * 16]      ; r3 -> ang_table entry 13 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; m3/m0/m5 hold consecutive neighbour pairs; palignr below slides the window
+    pmaddwd     m4,        [r3 + 4 * 16]              ; [17]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 4 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m2                         ; m6/m7 keep the shifted pairs for row 2
+    pmaddwd     m2,        [r3 - 11 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1
+    pmaddwd     m1,        [r3 - 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    pmaddwd     m6,        [r3 + 6 * 16]              ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 2
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 - 9 * 16]              ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m4,        [r3 + 8 * 16]              ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m2,        [r3 + 8 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    palignr     m2,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m6,        m2                         ; m6/m7 keep the shifted pairs for row 6
+    pmaddwd     m2,        [r3 - 7 * 16]              ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m7,        m1
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 5
+
+    pmaddwd     m6,        [r3 + 10 * 16]             ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 10 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 6
+
+    mova        m7,        m0                         ; row 7 starts four samples in: m0 = low half, m5 = high half
+    pmaddwd     m7,        [r3 - 5 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m5
+    pmaddwd     m1,        [r3 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+    RET
+
+cglobal intra_pred_ang8_32_new, 3,5,8                ; 8x8 angular intra prediction (mode 32 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 19 * 16]      ; r3 -> ang_table entry 19 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; m3/m0/m5 hold consecutive neighbour pairs; palignr below slides the window
+    pmaddwd     m4,        [r3 + 2 * 16]              ; [21]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 2 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m2                         ; m6/m7 keep the shifted pairs for row 2
+    pmaddwd     m2,        [r3 - 9 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1
+    pmaddwd     m1,        [r3 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1                         ; row 1
+
+    pmaddwd     m6,        [r3 + 12 * 16]             ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 12 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 2
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 + 1 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 3
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m2,        m4                         ; m2/m6 keep the shifted pairs for row 5
+    pmaddwd     m4,        [r3 - 10 * 16]             ; [ 9]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m6,        m3
+    pmaddwd     m3,        [r3 - 10 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m4,        m3                         ; row 4
+
+    pmaddwd     m2,        [r3 + 11 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m6,        [r3 + 11 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6                         ; row 5
+
+    mova        m6,        m0                         ; row 6 starts four samples in: m0 = low half, m5 = high half
+    pmaddwd     m6,        [r3]                       ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m5
+    pmaddwd     m7,        [r3]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 6
+
+    movh        m1,        [r2 + 26]                  ; [16 15 14 13]
+    palignr     m7,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m7,        [r3 - 11 * 16]             ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, 4                      ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m1,        [r3 - 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+    RET
+
+cglobal intra_pred_ang8_33_new, 3,5,8                ; 8x8 angular intra prediction (mode 33 per the name), 16-bit samples: r0 = dst, r1 = stride, r2 = reference samples
+    lea         r3,        [ang_table + 14 * 16]      ; r3 -> ang_table entry 14 (16-byte entries); the [n] notes below give the absolute entry used
+    add         r1,        r1                         ; r1 *= 2: byte step between rows of 16-bit samples
+
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+    punpckhwd   m1,        m4                         ; [x 16 16 15 15 14 14 13]
+
+    mova        m4,        m3                         ; m3/m0/m5/m1 hold consecutive neighbour pairs; palignr below slides the window
+    pmaddwd     m4,        [r3 + 12 * 16]             ; [26]
+    paddd       m4,        [pd_16]                    ; add rounding constant pd_16
+    psrld       m4,        5                          ; >> 5 completes the weighted sum of each neighbour pair
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 12 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 0: pack 2x4 dwords into 8 words
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m2,        [r3 + 6 * 16]              ; [20]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m6,        [r3 + 6 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6                         ; row 1
+
+    palignr     m6,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m6,        [r3]                       ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 2
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 6 * 16]              ; [ 8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m3,        [r3 - 6 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3                         ; row 3 (m3 is not needed after this point)
+
+    lea         r4,              [r1 * 3]             ; r4 = 3 * row step
+    movu        [r0],            m4                   ; store rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    mova        m4,        m0                         ; rows 4-5 start four samples in: m0 = low half, m5 = high half
+    pmaddwd     m4,        [r3 - 12 * 16]             ; [ 2]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m5
+    pmaddwd     m2,        [r3 - 12 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; row 4
+
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 14 * 16]             ; [28]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m5
+    pmaddwd     m6,        [r3 + 14 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6                         ; row 5
+
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m6,        [r3 + 8 * 16]              ; [22]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m1, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m7,        [r3 + 8 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7                         ; row 6
+
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3 + 2 * 16]              ; [16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, 8                      ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m1,        [r3 + 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1                         ; row 7
+
+    lea         r0,              [r0 + r1 * 4]        ; advance dst to row 4
+    movu        [r0],            m4                   ; store rows 4-7
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------
@@ -8617,6 +12254,964 @@
 .quit:
     RET
 
+;------------------------------------------------------------------------------------------
+; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang16_2_new, 3,5,5          ; modes 2/34: every row is the source run moved on by one 16-bit word
+    mov         r4, r2                          ; r4 = src exactly as passed in
+    add         r2, 64                          ; default source: src + 64 bytes
+    cmp         r3m, byte 34                    ; dirMode (4th argument) == 34?
+    cmove       r2, r4                          ; if so, read from src itself instead
+    movu        m0, [r2 + 4]                    ; m0 = words 2..9 of the chosen source
+    movu        m1, [r2 + 20]                   ; m1 = words 10..17
+    movu        m2, [r2 + 36]                   ; m2 = words 18..25
+    add         r1, r1                          ; r1 = 2 * dstStride (rows are written as 16-bit words)
+    lea         r3, [r1 * 3]                    ; r3 = 3 * r1
+
+    movu        [r0], m0                        ; row 0 = words 2..17
+    movu        [r0 + 16], m1
+    palignr     m3, m1, m0, 2
+    movu        [r0 + r1], m3                   ; row 1 = words 3..18
+    palignr     m4, m2, m1, 2
+    movu        [r0 + r1 + 16], m4
+    palignr     m3, m1, m0, 4
+    movu        [r0 + r1 * 2], m3               ; row 2
+    palignr     m4, m2, m1, 4
+    movu        [r0 + r1 * 2 + 16], m4
+    palignr     m3, m1, m0, 6
+    movu        [r0 + r3], m3                   ; row 3
+    palignr     m4, m2, m1, 6
+    movu        [r0 + r3 + 16], m4
+
+    lea         r0, [r0 + r1 * 4]               ; dst += 4 rows
+    palignr     m3, m1, m0, 8
+    movu        [r0], m3                        ; row 4
+    palignr     m4, m2, m1, 8
+    movu        [r0 + 16], m4
+    palignr     m3, m1, m0, 10
+    movu        [r0 + r1], m3                   ; row 5
+    palignr     m4, m2, m1, 10
+    movu        [r0 + r1 + 16], m4
+    palignr     m3, m1, m0, 12
+    movu        [r0 + r1 * 2], m3               ; row 6
+    palignr     m4, m2, m1, 12
+    movu        [r0 + r1 * 2 + 16], m4
+    palignr     m3, m1, m0, 14
+    movu        [r0 + r3], m3                   ; row 7
+    palignr     m4, m2, m1, 14
+    movu        [r0 + r3 + 16], m4
+
+    lea         r0, [r0 + r1 * 4]               ; dst += 4 rows
+    movu        m0, [r2 + 52]                   ; m0 = words 26..33, the tail needed by rows 9..15
+    movu        [r0], m1                        ; row 8 = words 10..25
+    movu        [r0 + 16], m2
+    palignr     m3, m2, m1, 2
+    movu        [r0 + r1], m3                   ; row 9
+    palignr     m4, m0, m2, 2
+    movu        [r0 + r1 + 16], m4
+    palignr     m3, m2, m1, 4
+    movu        [r0 + r1 * 2], m3               ; row 10
+    palignr     m4, m0, m2, 4
+    movu        [r0 + r1 * 2 + 16], m4
+    palignr     m3, m2, m1, 6
+    movu        [r0 + r3], m3                   ; row 11
+    palignr     m4, m0, m2, 6
+    movu        [r0 + r3 + 16], m4
+
+    lea         r0, [r0 + r1 * 4]               ; dst += 4 rows
+    palignr     m3, m2, m1, 8
+    movu        [r0], m3                        ; row 12
+    palignr     m4, m0, m2, 8
+    movu        [r0 + 16], m4
+    palignr     m3, m2, m1, 10
+    movu        [r0 + r1], m3                   ; row 13
+    palignr     m4, m0, m2, 10
+    movu        [r0 + r1 + 16], m4
+    palignr     m3, m2, m1, 12
+    movu        [r0 + r1 * 2], m3               ; row 14
+    palignr     m4, m0, m2, 12
+    movu        [r0 + r1 * 2 + 16], m4
+    palignr     m3, m2, m1, 14
+    movu        [r0 + r3], m3                   ; row 15 = words 17..32
+    palignr     m4, m0, m2, 14
+    movu        [r0 + r3 + 16], m4
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang16_3_new, 3,7,8           ; mode 3 entry: two calls to the shared mode-3/33 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-33 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_3_33
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_3_33
+    RET
+
+cglobal intra_pred_ang16_33_new, 3,7,8          ; mode 33 entry: two calls to the shared mode-3/33 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-3 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_3_33
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_3_33
+    RET
+
+cglobal intra_pred_ang16_4_new, 3,7,8           ; mode 4 entry: two calls to the shared mode-4/32 helper
+    lea         r3, [ang_table + 18 * 16]       ; r3 = ang_table + 18 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-32 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_4_32
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_4_32
+    RET
+
+cglobal intra_pred_ang16_32_new, 3,7,8          ; mode 32 entry: two calls to the shared mode-4/32 helper
+    lea         r3, [ang_table + 18 * 16]       ; r3 = ang_table + 18 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-4 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_4_32
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_4_32
+    RET
+
+cglobal intra_pred_ang16_5_new, 3,7,8           ; mode 5 entry: two calls to the shared mode-5/31 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-31 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_5_31
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_5_31
+    RET
+
+cglobal intra_pred_ang16_31_new, 3,7,8          ; mode 31 entry: two calls to the shared mode-5/31 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-5 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_5_31
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_5_31
+    RET
+
+cglobal intra_pred_ang16_6_new, 3,7,8           ; mode 6 entry: two calls to the shared mode-6/30 helper
+    lea         r3, [ang_table + 15 * 16]       ; r3 = ang_table + 15 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-30 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_6_30
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_6_30
+    RET
+
+cglobal intra_pred_ang16_30_new, 3,7,8          ; mode 30 entry: two calls to the shared mode-6/30 helper
+    lea         r3, [ang_table + 15 * 16]       ; r3 = ang_table + 15 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-6 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_6_30
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_6_30
+    RET
+
+cglobal intra_pred_ang16_7_new, 3,7,8           ; mode 7 entry: two calls to the shared mode-7/29 helper
+    lea         r3, [ang_table + 17 * 16]       ; r3 = ang_table + 17 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-29 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_7_29
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_7_29
+    RET
+
+cglobal intra_pred_ang16_29_new, 3,7,8          ; mode 29 entry: two calls to the shared mode-7/29 helper
+    lea         r3, [ang_table + 17 * 16]       ; r3 = ang_table + 17 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-7 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_7_29
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_7_29
+    RET
+
+cglobal intra_pred_ang16_8_new, 3,7,8           ; mode 8 entry: two calls to the shared mode-8/28 helper
+    lea         r3, [ang_table + 15 * 16]       ; r3 = ang_table + 15 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-28 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_8_28
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_8_28
+    RET
+
+cglobal intra_pred_ang16_28_new, 3,7,8          ; mode 28 entry: two calls to the shared mode-8/28 helper
+    lea         r3, [ang_table + 15 * 16]       ; r3 = ang_table + 15 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-8 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_8_28
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_8_28
+    RET
+
+cglobal intra_pred_ang16_9_new, 3,7,8           ; mode 9 entry: two calls to the shared mode-9/27 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    add         r2, 64                          ; start reading at src + 64 bytes
+    xor         r6d, r6d                        ; r6d = 0 (the mode-27 entry passes 1)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_9_27
+
+    lea         r0, [r0 + r1 * 8]               ; dst += 8 * r1 for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_9_27
+    RET
+
+cglobal intra_pred_ang16_27_new, 3,7,8          ; mode 27 entry: two calls to the shared mode-9/27 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-9 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_9_27
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_9_27
+    RET
+
+cglobal intra_pred_ang16_11_new, 3,7,8, 0-4    ; mode 11 entry; the "0-4" argument requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; the caller's buffer stays patched until the restore below
+
+    add         r2,        64                  ; read from src + 64, whose first word is now the copy of src[0]
+    xor         r6d,       r6d                 ; r6d = 0 (the mode-25 entry passes 1 to the same helper)
+    lea         r3,        [ang_table + 16 * 16] ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+
+    call        ang16_mode_11_25
+
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes for the second call
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+
+    call        ang16_mode_11_25
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_25_new, 3,7,8          ; mode 25 entry: two calls to the shared mode-11/25 helper
+    lea         r3, [ang_table + 16 * 16]       ; r3 = ang_table + 16 * 16 bytes, handed to the helper
+    xor         r6d, r6d
+    inc         r6d                             ; r6d = 1 (the mode-11 entry passes 0)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    call        ang16_mode_11_25
+
+    lea         r0, [r0 + 16]                   ; dst += 16 bytes for the second call
+    lea         r2, [r2 + 16]                   ; src += 16 bytes for the second call
+
+    call        ang16_mode_11_25
+    RET
+
+cglobal intra_pred_ang16_12_new, 3,7,8, 0-4    ; mode 12 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 16 * 16] ; r6 = ang_table + 16 * 16 bytes, handed to the helper
+    movu        m5,        [r2]                ; m5 = first 8 words of src
+    pshufb      m5,        [pw_ang8_12]        ; rearrange them with the pw_ang8_12 byte mask (defined elsewhere in the file)
+    pinsrw      m5,        [r2 + 26], 5        ; word 5 of m5 = 16-bit word at src + 26
+    xor         r3d,       r3d                 ; r3d = 0 (the mode-24 entry passes 1)
+    add         r2,        64                  ; read from src + 64 for the calls
+
+    call        ang16_mode_12_24
+
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+    movu        m5,        [r2 + 2]            ; m5 for the second call: 8 words starting at r2 + 2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_12_24
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_24_new, 3,7,8, 0-4    ; mode 24 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; must precede the loads from src+64 below
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 16 * 16] ; r6 = ang_table + 16 * 16 bytes, handed to the helper
+    movu        m5,        [r2 + 64]           ; m5 = 8 words from src + 64 (first word is the patched copy of src[0])
+    pshufb      m5,        [pw_ang8_12]        ; rearrange them with the pw_ang8_12 byte mask (defined elsewhere in the file)
+    pinsrw      m5,        [r2 + 26 + 64], 5   ; word 5 of m5 = 16-bit word at src + 64 + 26
+    xor         r3d,       r3d
+    inc         r3d                            ; r3d = 1 (the mode-12 entry passes 0)
+
+    call        ang16_mode_12_24
+
+    lea         r0,        [r0 + 16]           ; dst += 16 bytes for the second call
+    movu        m5,        [r2 + 2]            ; m5 for the second call: 8 words starting at r2 + 2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_12_24
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 + 48], r6w                 ; restore src+64: r2 is src + 16 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_13_new, 3,7,8, 0-4    ; mode 13 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 15 * 16] ; r6 = ang_table + 15 * 16 bytes, handed to the helper
+    movu        m5,        [r2]                ; m5 = first 8 words of src
+    pshufb      m5,        [pw_ang16_13]       ; rearrange with the pw_ang16_13 byte mask (defined elsewhere in the file)
+    movu        m6,        [r2 + 14]           ; m6 = 8 words starting at src + 14
+    pshufb      m6,        [pw_ang8_13]        ; rearrange with the pw_ang8_13 byte mask (defined elsewhere in the file)
+    pslldq      m6,        2                   ; shift m6 up by one word
+    palignr     m5,        m6, 6               ; m5 = bytes 6..21 of the 32-byte pair m5:m6 (m6 low)
+    xor         r3d,       r3d                 ; r3d = 0 (the mode-23 entry passes 1)
+    add         r2,        64                  ; read from src + 64 for the calls
+
+    call        ang16_mode_13_23
+
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+    movu        m5,        [r2 + 2]            ; m5 for the second call: 8 words starting at r2 + 2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_13_23
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_23_new, 3,7,8, 0-4    ; mode 23 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; must precede the loads from src+64 below
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 15 * 16] ; r6 = ang_table + 15 * 16 bytes, handed to the helper
+    movu        m5,        [r2 + 64]           ; m5 = 8 words from src + 64 (first word is the patched copy of src[0])
+    pshufb      m5,        [pw_ang16_13]       ; rearrange with the pw_ang16_13 byte mask (defined elsewhere in the file)
+    movu        m6,        [r2 + 14 + 64]      ; m6 = 8 words starting at src + 64 + 14
+    pshufb      m6,        [pw_ang8_13]        ; rearrange with the pw_ang8_13 byte mask (defined elsewhere in the file)
+    pslldq      m6,        2                   ; shift m6 up by one word
+    palignr     m5,        m6, 6               ; m5 = bytes 6..21 of the 32-byte pair m5:m6 (m6 low)
+    xor         r3d,       r3d
+    inc         r3d                            ; r3d = 1 (the mode-13 entry passes 0)
+
+    call        ang16_mode_13_23
+
+    lea         r0,        [r0 + 16]           ; dst += 16 bytes for the second call
+    movu        m5,        [r2 + 2]            ; m5 for the second call: 8 words starting at r2 + 2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_13_23
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 + 48], r6w                 ; restore src+64: r2 is src + 16 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_14_new, 3,7,8, 0-4    ; mode 14 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 18 * 16] ; r6 = ang_table + 18 * 16 bytes, handed to the helper
+    movu        m6,        [r2]                ; m6 = first 8 words of src
+    pshufb      m6,        [pw_ang8_14]        ; rearrange with the pw_ang8_14 byte mask (defined elsewhere in the file)
+    movu        m5,        [r2 + 20]           ; m5 = 8 words starting at src + 20
+    pshufb      m5,        [pw_ang8_14]        ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    xor         r3d,       r3d                 ; r3d = 0 (the mode-22 entry passes 1)
+    add         r2,        64                  ; read from src + 64 for the calls
+
+    call        ang16_mode_14_22
+
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+    movu        m5,        [r2 + 2]            ; m5 for the second call: 8 words starting at r2 + 2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_14_22
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_22_new, 3,7,8, 0-4    ; mode 22 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; must precede the loads from src+64 below
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 18 * 16] ; r6 = ang_table + 18 * 16 bytes, handed to the helper
+    movu        m6,        [r2 + 64]           ; m6 = 8 words from src + 64 (first word is the patched copy of src[0])
+    pshufb      m6,        [pw_ang8_14]        ; rearrange with the pw_ang8_14 byte mask (defined elsewhere in the file)
+    movu        m5,        [r2 + 20 + 64]      ; m5 = 8 words starting at src + 64 + 20
+    pshufb      m5,        [pw_ang8_14]        ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    xor         r3d,       r3d
+    inc         r3d                            ; r3d = 1 (the mode-14 entry passes 0)
+
+    call        ang16_mode_14_22
+
+    lea         r0,        [r0 + 16]           ; dst += 16 bytes for the second call
+    movu        m5,        [r2 + 2]            ; m5 for the second call: 8 words starting at r2 + 2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_14_22
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 + 48], r6w                 ; restore src+64: r2 is src + 16 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_15_new, 3,7,8, 0-4    ; mode 15 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 15 * 16] ; r6 = ang_table + 15 * 16 bytes, handed to the helper
+    movu        m6,        [r2 + 4]            ; m6 = 8 words starting at src + 4
+    pshufb      m6,        [pw_ang8_15]        ; rearrange with the pw_ang8_15 byte mask (defined elsewhere in the file)
+    movu        m5,        [r2 + 18]           ; m5 = 8 words starting at src + 18
+    pshufb      m5,        [pw_ang8_15]        ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    xor         r3d,       r3d                 ; r3d = 0 (the mode-21 entry passes 1)
+    add         r2,        64                  ; read from src + 64 for the calls
+
+    call        ang16_mode_15_21
+
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+    movu        m5,        [r2]                ; m5 for the second call: 8 words starting at r2 (no +2 offset, unlike modes 12-14)
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_15_21
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_21_new, 3,7,8, 0-4    ; mode 21 entry; "0-4" requests 4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp],     r5w                 ; park the original src+64 word on the stack
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r4,        [r1 * 3]            ; r4 = 3 * r1
+    lea         r6,        [ang_table + 15 * 16] ; r6 = ang_table + 15 * 16 bytes, handed to the helper
+    movu        m6,        [r2 + 4 + 64]       ; m6 = 8 words starting at src + 64 + 4
+    pshufb      m6,        [pw_ang8_15]        ; rearrange with the pw_ang8_15 byte mask (defined elsewhere in the file)
+    movu        m5,        [r2 + 18 + 64]      ; m5 = 8 words starting at src + 64 + 18
+    pshufb      m5,        [pw_ang8_15]        ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    xor         r3d,       r3d
+    inc         r3d                            ; r3d = 1 (the mode-15 entry passes 0)
+
+    call        ang16_mode_15_21
+
+    lea         r0,        [r0 + 16]           ; dst += 16 bytes for the second call
+    movu        m5,        [r2]                ; m5 for the second call: 8 words starting at r2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+
+    call        ang16_mode_15_21
+
+    mov         r6d,       [rsp]               ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 + 48], r6w                 ; restore src+64: r2 is src + 16 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_16_new, 3,7,8,0-(1*mmsize+4) ; mode 16 entry; requests mmsize+4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp + 16], r5w                ; park the original src+64 word at rsp + 16
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r6,        [ang_table + 13 * 16] ; r6 = ang_table + 13 * 16 bytes, handed to the helper
+    movu        m6,        [r2 + 4]            ; m6 = 8 words starting at src + 4
+    pshufb      m6,        [pw_ang16_16]       ; rearrange with the pw_ang16_16 byte mask (defined elsewhere in the file)
+    movu        m5,        [r2 + 16]           ; m5 = 8 words starting at src + 16
+    pshufb      m5,        [pw_ang16_16]       ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    mov         [rsp],     r2                  ; keep the original src pointer at rsp for the second call
+    lea         r3,        [r2 + 24]           ; first call: r3 = src + 24
+    add         r2,        64                  ; read from src + 64 for the calls
+    xor         r4,        r4                  ; r4 = 0 (the mode-20 entry passes 1)
+
+    call        ang16_mode_16_20
+
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+    mov         r3,        [rsp]               ; second call: r3 = original src pointer
+    movu        m5,        [r2]                ; m5 for the second call: 8 words starting at r2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+    xor         r4,        r4                  ; r4 = 0 again before the second call
+
+    call        ang16_mode_16_20
+
+    mov         r6d,       [rsp + 16]          ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_20_new, 3,7,8,0-(1*mmsize+4) ; mode 20 entry; requests mmsize+4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp + 16], r5w                ; park the original src+64 word at rsp + 16
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; must precede the loads through r3 below
+
+    lea         r3,        [r2 + 64]           ; r3 = src + 64
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r6,        [ang_table + 13 * 16] ; r6 = ang_table + 13 * 16 bytes, handed to the helper
+    movu        m6,        [r3 + 4]            ; m6 = 8 words starting at src + 64 + 4
+    pshufb      m6,        [pw_ang16_16]       ; rearrange with the pw_ang16_16 byte mask (defined elsewhere in the file)
+    movu        m5,        [r3 + 16]           ; m5 = 8 words starting at src + 64 + 16
+    pshufb      m5,        [pw_ang16_16]       ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    mov         [rsp],     r3                  ; keep src + 64 at rsp for the second call and the restore
+    lea         r3,        [r3 + 24]           ; first call: r3 = src + 64 + 24
+    xor         r4,        r4
+    inc         r4                             ; r4 = 1 (the mode-16 entry passes 0)
+
+    call        ang16_mode_16_20
+
+    lea         r0,        [r0 + 16]           ; dst += 16 bytes for the second call
+    mov         r3,        [rsp]               ; second call: r3 = src + 64
+    movu        m5,        [r2]                ; m5 for the second call: 8 words starting at r2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+    xor         r4,        r4
+    inc         r4                             ; r4 = 1 again before the second call
+
+    call        ang16_mode_16_20
+    mov         r6d,       [rsp + 16]          ; reload the parked word (only the low 16 bits were written)
+    mov         [r3],      r6w                 ; restore src+64 through r3 -- relies on the helper (not shown) leaving r3 = src + 64; TODO confirm
+    RET
+
+cglobal intra_pred_ang16_17_new, 3,7,8,0-(1*mmsize+4) ; mode 17 entry; requests mmsize+4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp + 16], r5w                ; park the original src+64 word at rsp + 16
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; restored before RET
+
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r6,        [ang_table + 16 * 16] ; r6 = ang_table + 16 * 16 bytes, handed to the helper
+    movu        m6,        [r2 + 2]            ; m6 = 8 words starting at src + 2
+    pshufb      m6,        [pw_ang16_16]       ; rearrange with the pw_ang16_16 byte mask (same mask the mode-16/20 entries use)
+    movu        m5,        [r2 + 12]           ; m5 = 8 words starting at src + 12
+    pshufb      m5,        [pw_ang16_16]       ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    mov         [rsp],     r2                  ; keep the original src pointer at rsp for the second call
+    lea         r3,        [r2 + 20]           ; first call: r3 = src + 20
+    add         r2,        64                  ; read from src + 64 for the calls
+    xor         r4,        r4                  ; r4 = 0 (the mode-19 entry passes 1)
+
+    call        ang16_mode_17_19
+
+    lea         r0,        [r0 + r1 * 8]       ; dst += 8 * r1 for the second call
+    mov         r3,        [rsp]               ; second call: r3 = original src pointer
+    movu        m5,        [r2]                ; m5 for the second call: 8 words starting at r2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+    xor         r4,        r4                  ; r4 = 0 again before the second call
+
+    call        ang16_mode_17_19
+
+    mov         r6d,       [rsp + 16]          ; reload the parked word (only the low 16 bits were written)
+    mov         [r2 - 16], r6w                 ; restore src+64: r2 is src + 80 here, assuming the helper (not shown) leaves r2 unchanged
+    RET
+
+cglobal intra_pred_ang16_19_new, 3,7,8,0-(1*mmsize+4) ; mode 19 entry; requests mmsize+4 bytes of stack scratch (x86inc negative stack size)
+    movzx       r5d,       word [r2 + 64]      ; r5d = 16-bit word currently stored at src + 64
+    movzx       r6d,       word [r2]           ; r6d = 16-bit word at src[0]
+    mov         [rsp + 16], r5w                ; park the original src+64 word at rsp + 16
+    mov         [r2 + 64], r6w                 ; temporarily overwrite src+64 with src[0]; must precede the loads through r3 below
+
+    lea         r3,        [r2 + 64]           ; r3 = src + 64
+    add         r1,        r1                  ; r1 = 2 * dstStride
+    lea         r6,        [ang_table + 16 * 16] ; r6 = ang_table + 16 * 16 bytes, handed to the helper
+    movu        m6,        [r3 + 2]            ; m6 = 8 words starting at src + 64 + 2
+    pshufb      m6,        [pw_ang16_16]       ; rearrange with the pw_ang16_16 byte mask (same mask the mode-16/20 entries use)
+    movu        m5,        [r3 + 12]           ; m5 = 8 words starting at src + 64 + 12
+    pshufb      m5,        [pw_ang16_16]       ; same mask applied to the second load
+    punpckhqdq  m5,        m6                  ; m5 = { high qword of m5 (low half), high qword of m6 (high half) }
+    mov         [rsp],     r3                  ; keep src + 64 at rsp for the second call and the restore
+    lea         r3,        [r3 + 20]           ; first call: r3 = src + 64 + 20
+    xor         r4,        r4
+    inc         r4                             ; r4 = 1 (the mode-17 entry passes 0)
+
+    call        ang16_mode_17_19
+
+    lea         r0,        [r0 + 16]           ; dst += 16 bytes for the second call
+    mov         r3,        [rsp]               ; second call: r3 = src + 64
+    movu        m5,        [r2]                ; m5 for the second call: 8 words starting at r2
+    lea         r2,        [r2 + 16]           ; src pointer += 16 bytes
+    xor         r4,        r4
+    inc         r4                             ; r4 = 1 again before the second call
+
+    call        ang16_mode_17_19
+
+    mov         r6d,       [rsp + 16]          ; reload the parked word (only the low 16 bits were written)
+    mov         [r3],      r6w                 ; restore src+64 through r3 -- relies on the helper (not shown) leaving r3 = src + 64; TODO confirm
+    RET
+
+cglobal intra_pred_ang16_18_new, 3,5,4          ; mode 18: each row is the previous one slid right by one word, filled from m0/m3
+    movu        m0, [r2 + 64 + 2]               ; 8 words starting at src + 66 ...
+    pshufb      m0, [pw_swap16]                 ; ... permuted by pw_swap16 (mask defined elsewhere; presumably a word reversal)
+    movu        m1, [r2]                        ; m1 = src words 0..7
+    movu        m3, [r2 + 16]                   ; m3 = src words 8..15
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+    movu        [r0], m1                        ; row 0 = src words 0..15
+    movu        [r0 + 16], m3
+    palignr     m2, m1, m0, 14                  ; last word of m0, then m1 words 0..6
+    movu        [r0 + r1], m2                   ; row 1, left half
+    palignr     m2, m3, m1, 14
+    movu        [r0 + r1 + 16], m2              ; row 1, right half
+    palignr     m2, m1, m0, 12
+    movu        [r0 + r1 * 2], m2               ; row 2
+    palignr     m2, m3, m1, 12
+    movu        [r0 + r1 * 2 + 16], m2
+    palignr     m2, m1, m0, 10
+    movu        [r0 + r4], m2                   ; row 3
+    palignr     m2, m3, m1, 10
+    movu        [r0 + r4 + 16], m2
+
+    lea         r0, [r0 + r1 * 4]               ; dst += 4 rows
+    palignr     m2, m1, m0, 8
+    movu        [r0], m2                        ; row 4
+    palignr     m2, m3, m1, 8
+    movu        [r0 + 16], m2
+    palignr     m2, m1, m0, 6
+    movu        [r0 + r1], m2                   ; row 5
+    palignr     m2, m3, m1, 6
+    movu        [r0 + r1 + 16], m2
+    palignr     m2, m1, m0, 4
+    movu        [r0 + r1 * 2], m2               ; row 6
+    palignr     m2, m3, m1, 4
+    movu        [r0 + r1 * 2 + 16], m2
+    palignr     m2, m1, m0, 2
+    movu        [r0 + r4], m2                   ; row 7
+    palignr     m3, m1, 2                       ; last use of the old m3; it is reloaded below
+    movu        [r0 + r4 + 16], m3
+
+    lea         r0, [r0 + r1 * 4]               ; dst += 4 rows
+    movu        [r0], m0                        ; row 8 = m0 followed by m1
+    movu        [r0 + 16], m1
+    movu        m3, [r2 + 64 + 18]              ; next 8 words, starting at src + 82 ...
+    pshufb      m3, [pw_swap16]                 ; ... permuted the same way as m0
+    palignr     m2, m0, m3, 14
+    movu        [r0 + r1], m2                   ; row 9
+    palignr     m2, m1, m0, 14
+    movu        [r0 + r1 + 16], m2
+    palignr     m2, m0, m3, 12
+    movu        [r0 + r1 * 2], m2               ; row 10
+    palignr     m2, m1, m0, 12
+    movu        [r0 + r1 * 2 + 16], m2
+    palignr     m2, m0, m3, 10
+    movu        [r0 + r4], m2                   ; row 11
+    palignr     m2, m1, m0, 10
+    movu        [r0 + r4 + 16], m2
+
+    lea         r0, [r0 + r1 * 4]               ; dst += 4 rows
+    palignr     m2, m0, m3, 8
+    movu        [r0], m2                        ; row 12
+    palignr     m2, m1, m0, 8
+    movu        [r0 + 16], m2
+    palignr     m2, m0, m3, 6
+    movu        [r0 + r1], m2                   ; row 13
+    palignr     m2, m1, m0, 6
+    movu        [r0 + r1 + 16], m2
+    palignr     m2, m0, m3, 4
+    movu        [r0 + r1 * 2], m2               ; row 14
+    palignr     m2, m1, m0, 4
+    movu        [r0 + r1 * 2 + 16], m2
+    palignr     m2, m0, m3, 2
+    movu        [r0 + r4], m2                   ; row 15
+    palignr     m1, m0, 2                       ; m1 is not needed afterwards, so shift it in place
+    movu        [r0 + r4 + 16], m1
+    RET
+
+cglobal intra_pred_ang16_10_new, 3,6,4          ; mode 10: row k is filled with word k+1 read from src + 64; row 0 optionally edge-filtered
+    mov         r5d, r4m                        ; r5d = bFilter (5th argument); saved because r4 is reused below
+    movu        m1, [r2 + 2 + 64]               ; [8 7 6 5 4 3 2 1]
+    movu        m3, [r2 + 18 + 64]              ; [16 15 14 13 12 11 10 9]
+    pshufb      m0, m1, [pw_unpackwdq]          ; [1 1 1 1 1 1 1 1]  (row 0 value, stored last)
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    psrldq      m1, 2                           ; bring the next word down to position 0
+    pshufb      m2, m1, [pw_unpackwdq]          ; [2 2 2 2 2 2 2 2]
+    movu        [r0 + r1], m2                   ; row 1, both halves
+    movu        [r0 + r1 + 16], m2
+    psrldq      m1, 2
+    pshufb      m2, m1, [pw_unpackwdq]          ; [3 3 3 3 3 3 3 3]
+    movu        [r0 + r1 * 2], m2               ; row 2
+    movu        [r0 + r1 * 2 + 16], m2
+    psrldq      m1, 2
+    pshufb      m2, m1, [pw_unpackwdq]          ; [4 4 4 4 4 4 4 4]
+    movu        [r0 + r4], m2                   ; row 3
+    movu        [r0 + r4 + 16], m2
+
+    lea         r3, [r0 + r1 * 4]               ; r3 = row 4; r0 stays at row 0 for the final store
+    psrldq      m1, 2
+    pshufb      m2, m1, [pw_unpackwdq]          ; [5 5 5 5 5 5 5 5]
+    movu        [r3], m2                        ; row 4
+    movu        [r3 + 16], m2
+    psrldq      m1, 2
+    pshufb      m2, m1, [pw_unpackwdq]          ; [6 6 6 6 6 6 6 6]
+    movu        [r3 + r1], m2                   ; row 5
+    movu        [r3 + r1 + 16], m2
+    psrldq      m1, 2
+    pshufb      m2, m1, [pw_unpackwdq]          ; [7 7 7 7 7 7 7 7]
+    movu        [r3 + r1 * 2], m2               ; row 6
+    movu        [r3 + r1 * 2 + 16], m2
+    psrldq      m1, 2
+    pshufb      m2, m1, [pw_unpackwdq]          ; [8 8 8 8 8 8 8 8]
+    movu        [r3 + r4], m2                   ; row 7
+    movu        [r3 + r4 + 16], m2
+
+    lea         r3, [r3 + r1 * 4]               ; r3 = row 8
+    pshufb      m2, m3, [pw_unpackwdq]          ; [9 9 9 9 9 9 9 9]
+    movu        [r3], m2                        ; row 8
+    movu        [r3 + 16], m2
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [10 10 10 10 10 10 10 10]
+    movu        [r3 + r1], m2                   ; row 9
+    movu        [r3 + r1 + 16], m2
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [11 11 11 11 11 11 11 11]
+    movu        [r3 + r1 * 2], m2               ; row 10
+    movu        [r3 + r1 * 2 + 16], m2
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [12 12 12 12 12 12 12 12]
+    movu        [r3 + r4], m2                   ; row 11
+    movu        [r3 + r4 + 16], m2
+
+    lea         r3, [r3 + r1 * 4]               ; r3 = row 12
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [13 13 13 13 13 13 13 13]
+    movu        [r3], m2                        ; row 12
+    movu        [r3 + 16], m2
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [14 14 14 14 14 14 14 14]
+    movu        [r3 + r1], m2                   ; row 13
+    movu        [r3 + r1 + 16], m2
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [15 15 15 15 15 15 15 15]
+    movu        [r3 + r1 * 2], m2               ; row 14
+    movu        [r3 + r1 * 2 + 16], m2
+    psrldq      m3, 2
+    pshufb      m2, m3, [pw_unpackwdq]          ; [16 16 16 16 16 16 16 16]
+    movu        [r3 + r4], m2                   ; row 15
+    movu        [r3 + r4 + 16], m2
+    mova        m3, m0                          ; unfiltered row 0: right half equals left half
+
+    test        r5d, r5d                        ; bFilter == 0?  (idiomatic zero test; same ZF as the former cmp with 0)
+    jz          .quit                           ; no filtering: store row 0 as-is
+
+    ; filter: row0[x] = clip(row0 + ((src[1 + x] - src[0]) >> 1)), clipped to 0..pw_1023
+    pinsrw      m1, [r2], 0                     ; [3 2 1 0]
+    pshufb      m2, m1, [pw_unpackwdq]          ; [0 0 0 0 0 0 0 0]
+    movu        m1, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
+    movu        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
+    psubw       m1, m2                          ; src[1..8]  - src[0]
+    psubw       m3, m2                          ; src[9..16] - src[0]
+    psraw       m1, 1                           ; arithmetic >> 1 keeps the sign of the difference
+    psraw       m3, 1
+    paddw       m3, m0                          ; right half first, while m0 still holds the plain row value
+    paddw       m0, m1                          ; left half
+    pxor        m1, m1                          ; m1 = 0, the lower clip bound
+    pmaxsw      m0, m1
+    pminsw      m0, [pw_1023]                   ; upper clip bound is the pw_1023 constant (defined elsewhere)
+    pmaxsw      m3, m1
+    pminsw      m3, [pw_1023]
+.quit:
+    movu        [r0], m0                        ; row 0, left half
+    movu        [r0 + 16], m3                   ; row 0, right half
+    RET
+
+cglobal intra_pred_ang16_26_new, 3,6,4          ; mode 26: every row is a copy of src words 1..16; column 0 optionally edge-filtered
+    mov         r5d, r4m                        ; r5d = bFilter (5th argument); saved because r4 is reused below
+    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
+    movu        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
+    add         r1, r1                          ; r1 = 2 * dstStride
+    lea         r4, [r1 * 3]                    ; r4 = 3 * r1
+
+    movu        [r0], m0                        ; rows 0..3
+    movu        [r0 + 16], m3
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + 16], m3
+    movu        [r0 + r1 * 2], m0
+    movu        [r0 + r1 * 2 + 16], m3
+    movu        [r0 + r4], m0
+    movu        [r0 + r4 + 16], m3
+
+    lea         r3, [r0 + r1 * 4]               ; r3 = row 4; r0 stays at row 0 for the filter stores
+    movu        [r3], m0                        ; rows 4..7
+    movu        [r3 + 16], m3
+    movu        [r3 + r1], m0
+    movu        [r3 + r1 + 16], m3
+    movu        [r3 + r1 * 2], m0
+    movu        [r3 + r1 * 2 + 16], m3
+    movu        [r3 + r4], m0
+    movu        [r3 + r4 + 16], m3
+
+    lea         r3, [r3 + r1 * 4]               ; r3 = row 8
+    movu        [r3], m0                        ; rows 8..11
+    movu        [r3 + 16], m3
+    movu        [r3 + r1], m0
+    movu        [r3 + r1 + 16], m3
+    movu        [r3 + r1 * 2], m0
+    movu        [r3 + r1 * 2 + 16], m3
+    movu        [r3 + r4], m0
+    movu        [r3 + r4 + 16], m3
+
+    lea         r3, [r3 + r1 * 4]               ; r3 = row 12 (still used by the last filter stores)
+    movu        [r3], m0                        ; rows 12..15
+    movu        [r3 + 16], m3
+    movu        [r3 + r1], m0
+    movu        [r3 + r1 + 16], m3
+    movu        [r3 + r1 * 2], m0
+    movu        [r3 + r1 * 2 + 16], m3
+    movu        [r3 + r4], m0
+    movu        [r3 + r4 + 16], m3
+
+    test        r5d, r5d                        ; bFilter == 0?  (idiomatic zero test; same ZF as the former cmp with 0)
+    jz          .quit                           ; no filtering: the block is complete
+
+    ; filter: col0[y] = clip(src[1] + ((word y+1 at src+64 - src[0]) >> 1)), clipped to 0..pw_1023
+
+    pshufb      m0, [pw_unpackwdq]              ; [1 1 1 1 1 1 1 1]
+    pinsrw      m1, [r2], 0                     ; [3 2 1 0]
+    pshufb      m2, m1, [pw_unpackwdq]          ; [0 0 0 0 0 0 0 0]
+    movu        m1, [r2 + 2 + 64]               ; [8 7 6 5 4 3 2 1]
+    movu        m3, [r2 + 18 + 64]              ; [16 15 14 13 12 11 10 9]
+    psubw       m1, m2                          ; words 1..8  at src+64, minus src[0]
+    psubw       m3, m2                          ; words 9..16 at src+64, minus src[0]
+    psraw       m1, 1                           ; arithmetic >> 1 keeps the sign of the difference
+    psraw       m3, 1
+    paddw       m3, m0                          ; rows 8..15 first, while m0 still holds the broadcast
+    paddw       m0, m1                          ; rows 0..7
+    pxor        m1, m1                          ; m1 = 0, the lower clip bound
+    pmaxsw      m0, m1
+    pminsw      m0, [pw_1023]                   ; upper clip bound is the pw_1023 constant (defined elsewhere)
+    pmaxsw      m3, m1
+    pminsw      m3, [pw_1023]
+    pextrw      [r0], m0, 0                     ; overwrite only the first word of each row
+    pextrw      [r0 + r1], m0, 1
+    pextrw      [r0 + r1 * 2], m0, 2
+    pextrw      [r0 + r4], m0, 3
+    lea         r0, [r0 + r1 * 4]               ; r0 = row 4
+    pextrw      [r0], m0, 4
+    pextrw      [r0 + r1], m0, 5
+    pextrw      [r0 + r1 * 2], m0, 6
+    pextrw      [r0 + r4], m0, 7
+    lea         r0, [r0 + r1 * 4]               ; r0 = row 8
+    pextrw      [r0], m3, 0
+    pextrw      [r0 + r1], m3, 1
+    pextrw      [r0 + r1 * 2], m3, 2
+    pextrw      [r0 + r4], m3, 3
+    pextrw      [r3], m3, 4                     ; r3 was left at row 12 by the copy loop above
+    pextrw      [r3 + r1], m3, 5
+    pextrw      [r3 + r1 * 2], m3, 6
+    pextrw      [r3 + r4], m3, 7
+.quit:
+    RET
+
 %macro MODE_2_34 0
     movu            m0, [r2 + 4]
     movu            m1, [r2 + 20]
@@ -12699,3 +17294,1294 @@
     dec    r4
     jnz    .loop
     RET
+
+;------------------------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang32_2_new, 3,6,6   ; 32x32 angular prediction shared by dirMode 2 and dirMode 34
+    lea             r4, [r2]            ; r4 = original src pointer
+    add             r2, 128             ; r2 = src + 128 bytes
+    cmp             r3m, byte 34        ; dirMode == 34 ?
+    cmove           r2, r4              ; mode 34 reads from src + 0; any other mode keeps src + 128
+
+    add             r1, r1              ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea             r3, [r1 * 2]        ; r3 = 2 * r1
+    lea             r4, [r1 * 3]        ; r4 = 3 * r1
+    mov             r5, 2               ; loop counter: 2 passes
+
+.loop:                                  ; each pass expands MODE_2_34 (macro body defined elsewhere in the file)
+    MODE_2_34
+    add             r2, 32              ; reference pointer += 32 bytes
+    dec             r5
+    jnz             .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_3_new, 3,6,8   ; 32x32 angular prediction, mode 3 (MODE_3_33 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 33 routine passes 0 instead of 1
+    MODE_3_33 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_4_new, 3,6,8   ; 32x32 angular prediction, mode 4 (MODE_4_32 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 32 routine passes 0 instead of 1
+    MODE_4_32 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_5_new, 3,6,8   ; 32x32 angular prediction, mode 5 (MODE_5_31 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 31 routine passes 0 instead of 1
+    MODE_5_31 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_6_new, 3,6,8   ; 32x32 angular prediction, mode 6 (MODE_6_30 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 30 routine passes 0 instead of 1
+    MODE_6_30 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_7_new, 3,6,8   ; 32x32 angular prediction, mode 7 (MODE_7_29 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 29 routine passes 0 instead of 1
+    MODE_7_29 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_8_new, 3,6,8   ; 32x32 angular prediction, mode 8 (MODE_8_28 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 28 routine passes 0 instead of 1
+    MODE_8_28 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_9_new, 3,6,8   ; 32x32 angular prediction, mode 9 (MODE_9_27 with argument 1)
+    add         r2, 128                 ; r2 = src + 128 bytes
+    lea         r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov         r4d, 8                  ; loop counter: 8 passes
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+
+.loop:                                  ; macro body is defined elsewhere; the mode 27 routine passes 0 instead of 1
+    MODE_9_27 1
+    lea         r0, [r0 + r1 * 4 ]      ; dst += 4 * r1
+    add         r2, 8                   ; reference pointer += 8 bytes
+    dec         r4
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_10_new, 3,7,3  ; only m0-m2 are live, so no xmm6/xmm7 save is needed on WIN64
+    add         r2, 128                 ; r2 = src + 128 bytes
+    mov         r6d, 4                  ; loop counter: 4 passes of 8 rows
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r5, [r1 * 3]            ; r5 = 3 * r1
+    lea         r4, [r1 * 2]            ; r4 = 2 * r1
+    lea         r3, [r1 * 4]            ; r3 = 4 * r1 (advance of four rows)
+    mova        m2, [c_mode32_10_0]     ; pshufb mask (table defined elsewhere; presumably replicates word 0)
+
+.loop:
+    movu        m0, [r2 + 2]            ; 8 reference words starting one word past r2
+    pshufb      m1, m0, m2              ; row 0 of this pass: shuffle of word 0
+    movu        [r0], m1                ; the same 16 bytes fill all 64 bytes of the row
+    movu        [r0 + 16], m1
+    movu        [r0 + 32], m1
+    movu        [r0 + 48], m1
+
+    palignr     m1, m0, 2               ; low word of m1 = word 1 of m0
+    pshufb      m1, m2
+    movu        [r0 + r1], m1
+    movu        [r0 + r1 + 16], m1
+    movu        [r0 + r1 + 32], m1
+    movu        [r0 + r1 + 48], m1
+
+    palignr     m1, m0, 4               ; low word of m1 = word 2 of m0
+    pshufb      m1, m2
+    movu        [r0 + r4], m1
+    movu        [r0 + r4 + 16], m1
+    movu        [r0 + r4 + 32], m1
+    movu        [r0 + r4 + 48], m1
+
+    palignr     m1, m0, 6               ; low word of m1 = word 3 of m0
+    pshufb      m1, m2
+    movu        [r0 + r5], m1
+    movu        [r0 + r5 + 16], m1
+    movu        [r0 + r5 + 32], m1
+    movu        [r0 + r5 + 48], m1
+
+    add         r0, r3                  ; dst += 4 rows
+
+    palignr     m1, m0, 8               ; low word of m1 = word 4 of m0
+    pshufb      m1, m2
+    movu        [r0], m1
+    movu        [r0 + 16], m1
+    movu        [r0 + 32], m1
+    movu        [r0 + 48], m1
+
+    palignr     m1, m0, 10              ; low word of m1 = word 5 of m0
+    pshufb      m1, m2
+    movu        [r0 + r1], m1
+    movu        [r0 + r1 + 16], m1
+    movu        [r0 + r1 + 32], m1
+    movu        [r0 + r1 + 48], m1
+
+    palignr     m1, m0, 12              ; low word of m1 = word 6 of m0
+    pshufb      m1, m2
+    movu        [r0 + r4], m1
+    movu        [r0 + r4 + 16], m1
+    movu        [r0 + r4 + 32], m1
+    movu        [r0 + r4 + 48], m1
+
+    palignr     m1, m0, 14              ; low word of m1 = word 7 of m0
+    pshufb      m1, m2
+    movu        [r0 + r5], m1
+    movu        [r0 + r5 + 16], m1
+    movu        [r0 + r5 + 32], m1
+    movu        [r0 + r5 + 48], m1
+
+    add         r0, r3                  ; dst += 4 rows
+    add         r2, 16                  ; reference pointer += 16 bytes (the 8 words just consumed)
+    dec         r6d
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_11_new, 3,6,7,0-(4*mmsize+4) ; mode 11; builds a 68-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 0*mmsize + 2], m0    ; 64 bytes copied to rsp + 2 .. rsp + 65
+    movu     [rsp + 1*mmsize + 2], m1
+    movu     [rsp + 2*mmsize + 2], m2
+    movu     [rsp + 3*mmsize + 2], m3
+    mov      r4w, [r3+32]                ; word 16 of src ...
+    mov      [rsp], r4w                  ; ... goes in front of the copy
+    mov      r4w, [r2+64]                ; word at src + 128 + 64 ...
+    mov      [rsp+66], r4w               ; ... goes right after the copy
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+
+.loop:                                   ; macro body is defined elsewhere; the mode 25 routine passes 0 instead of 1
+    MODE_11_25 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz      .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_12_new, 3,6,7,0-(4*mmsize+10) ; mode 12; builds a 74-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 0*mmsize + 8], m0    ; 64 bytes copied to rsp + 8 .. rsp + 71
+    movu     [rsp + 1*mmsize + 8], m1
+    movu     [rsp + 2*mmsize + 8], m2
+    movu     [rsp + 3*mmsize + 8], m3
+
+    mov      r4w, [r2+64]                ; word at src + 128 + 64 ...
+    mov      [rsp+72], r4w               ; ... goes right after the copy
+    mov      r4w, [r3+12]                ; words 6, 13, 19 and 26 of src are placed
+    mov      [rsp+6], r4w                ; in front of the copy, in descending address order
+    mov      r4w, [r3+26]
+    mov      [rsp+4], r4w
+    mov      r4w, [r3+38]
+    mov      [rsp+2], r4w
+    mov      r4w, [r3+52]
+    mov      [rsp], r4w
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 24 routine passes 0 instead of 1
+    MODE_12_24 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz      .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_13_new, 3,6,7,0-(5*mmsize+2) ; mode 13; builds an 82-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 1*mmsize], m0        ; 64 bytes copied to rsp + 16 .. rsp + 79
+    movu     [rsp + 2*mmsize], m1
+    movu     [rsp + 3*mmsize], m2
+    movu     [rsp + 4*mmsize], m3
+
+    mov      r4w, [r2+64]                ; word at src + 128 + 64 ...
+    mov      [rsp+80], r4w               ; ... goes right after the copy
+    movu     m0, [r3 + 8]
+    movu     m1, [r3 + 36]
+    pshufb   m0, [shuf_mode_13_23]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_13_23]
+    movh     [rsp + 8], m0               ; low 8 bytes of each shuffle result fill rsp .. rsp + 15
+    movh     [rsp], m1
+    mov      r4w, [r3+28]                ; word 14 of src overwrites the word at rsp + 8
+    mov      [rsp+8], r4w
+    mov      r4w, [r3+56]                ; word 28 of src overwrites the word at rsp
+    mov      [rsp], r4w
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 23 routine passes 0 instead of 1
+    MODE_13_23 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_14_new, 3,6,7,0-(5*mmsize+10) ; mode 14; builds a 90-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 1*mmsize + 8], m0    ; 64 bytes copied to rsp + 24 .. rsp + 87
+    movu     [rsp + 2*mmsize + 8], m1
+    movu     [rsp + 3*mmsize + 8], m2
+    movu     [rsp + 4*mmsize + 8], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 128 + 64 ...
+    mov      [rsp + 88], r4w             ; ... goes right after the copy
+    mov      r4w, [r3+4]                 ; word 2 of src goes directly in front of the copy
+    mov      [rsp+22], r4w
+    movu     m0, [r3 + 10]
+    movu     m1, [r3 + 30]
+    movu     m2, [r3 + 50]
+    pshufb   m0, [shuf_mode_14_22]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_14_22]
+    pshufb   m2, [shuf_mode_14_22]
+    movh     [rsp + 14], m0              ; 8-byte stores covering rsp + 14 .. rsp + 21
+    movh     [rsp + 6], m1               ; rsp + 6 .. rsp + 13
+    movh     [rsp - 2], m2               ; NOTE(review): covers rsp - 2 .. rsp + 5, i.e. 2 bytes below rsp; confirm this is acceptable on ABIs without a red zone
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 22 routine passes 0 instead of 1
+    MODE_14_22 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_15_new, 3,6,7,0-(6*mmsize+2) ; mode 15; builds a 98-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 2*mmsize], m0        ; 64 bytes copied to rsp + 32 .. rsp + 95
+    movu     [rsp + 3*mmsize], m1
+    movu     [rsp + 4*mmsize], m2
+    movu     [rsp + 5*mmsize], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 128 + 64 ...
+    mov      [rsp + 96], r4w             ; ... goes right after the copy
+    movu     m0, [r3 + 4]
+    movu     m1, [r3 + 18]
+    movu     m2, [r3 + 34]
+    movu     m3, [r3 + 48]
+    pshufb   m0, [shuf_mode_15_21]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_15_21]
+    pshufb   m2, [shuf_mode_15_21]
+    pshufb   m3, [shuf_mode_15_21]
+    movh     [rsp + 24], m0              ; four 8-byte stores fill rsp .. rsp + 31
+    movh     [rsp + 16], m1
+    movh     [rsp + 8], m2
+    movh     [rsp], m3
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 21 routine passes 0 instead of 1
+    MODE_15_21 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_16_new, 3,6,7,0-(6*mmsize+10) ; mode 16; builds a 106-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 2*mmsize + 8], m0    ; 64 bytes copied to rsp + 40 .. rsp + 103
+    movu     [rsp + 3*mmsize + 8], m1
+    movu     [rsp + 4*mmsize + 8], m2
+    movu     [rsp + 5*mmsize + 8], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 128 + 64 ...
+    mov      [rsp + 104], r4w            ; ... goes right after the copy
+    movu     m0, [r3 + 4]
+    movu     m1, [r3 + 22]
+    movu     m2, [r3 + 40]
+    movd     m3, [r3 + 58]               ; only 4 bytes are loaded for the last group
+    pshufb   m0, [shuf_mode_16_20]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_16_20]
+    pshufb   m2, [shuf_mode_16_20]
+    pshufb   m3, [shuf_mode_16_20]
+    movu     [rsp + 24], m0              ; overlapping 16-byte stores, issued from high to low address,
+    movu     [rsp + 12], m1              ; so each later store overwrites the tail of the next one
+    movu     [rsp], m2
+    movd     [rsp], m3                   ; finally the lowest 4 bytes are replaced
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 20 routine passes 0 instead of 1
+    MODE_16_20 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_17_new, 3,6,7,0-(7*mmsize+4) ; mode 17; builds a 116-byte reference copy on the stack
+    mov      r3, r2mp                    ; r3 = src (unmodified base pointer)
+    add      r2, 128                     ; r2 = src + 128 bytes
+    movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0                 ; word 0 replaced by the first word at src
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 3*mmsize + 2], m0    ; 64 bytes copied to rsp + 50 .. rsp + 113
+    movu     [rsp + 4*mmsize + 2], m1
+    movu     [rsp + 5*mmsize + 2], m2
+    movu     [rsp + 6*mmsize + 2], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 128 + 64 ...
+    mov      [rsp + 114], r4w            ; ... goes right after the copy
+    movu     m0, [r3 + 8]
+    movu     m1, [r3 + 30]
+    movu     m2, [r3 + 50]
+    movd     m3, [r3 + 2]                ; only 4 bytes are loaded for the group nearest the copy
+    pshufb   m0, [shuf_mode_17_19]       ; shuffle tables defined elsewhere (select the projected words)
+    pshufb   m1, [shuf_mode_17_19]
+    pshufb   m2, [shuf_mode_17_19]
+    pshufb   m3, [shuf_mode_16_20]       ; NOTE(review): uses the 16_20 table here, presumably on purpose - confirm
+    movd     [rsp + 46], m3              ; rsp + 46 .. rsp + 49
+    movu     [rsp + 30], m0              ; rsp + 30 .. rsp + 45
+    movu     [rsp + 12], m1              ; rsp + 12 .. rsp + 27
+    movu     [rsp - 4], m2               ; NOTE(review): covers rsp - 4 .. rsp + 11, i.e. 4 bytes below rsp; confirm this is acceptable on ABIs without a red zone
+    mov      r4w, [r3 + 24]              ; three single words are then patched in:
+    mov      [rsp + 30], r4w             ; word 12 of src -> rsp + 30
+    mov      r4w, [r3 + 28]
+    mov      [rsp + 28], r4w             ; word 14 of src -> rsp + 28
+    mov      r4w, [r3 + 46]
+    mov      [rsp + 12], r4w             ; word 23 of src -> rsp + 12
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 19 routine passes 0 instead of 1
+    MODE_17_19 1
+    lea      r0, [r0 + r1 * 4 ]          ; dst += 4 * r1
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_18_new, 3,7,7 ; only m0-m6 are used, so xmm7 need not be saved on WIN64
+    mov      r3, r2mp                  ; r3 = src (unmodified base pointer)
+    add      r2, 128                   ; r2 = src + 128 bytes
+    movu        m0, [r3]               ; [7 6 5 4 3 2 1 0]
+    movu        m1, [r3 + 16]          ; [15 14 13 12 11 10 9 8]
+    movu        m2, [r3 + 32]          ; [23 22 21 20 19 18 17 16]
+    movu        m3, [r3 + 48]          ; [31 30 29 28 27 26 25 24]
+    movu        m4, [r2 + 2]           ; [8 7 6 5 4 3 2 1]
+    movu        m5, [r2 + 18]          ; [16 15 14 13 12 11 10 9]
+
+    add         r1, r1                 ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r6, [r1 * 2]           ; r6 = 2 * r1
+    lea         r3, [r1 * 3]           ; r3 = 3 * r1 (the src copy in r3 is no longer needed)
+    lea         r4, [r1 * 4]           ; r4 = 4 * r1 (advance of four rows)
+
+    movu        [r0], m0               ; row 0: the 32 words loaded from src, unchanged
+    movu        [r0 + 16], m1
+    movu        [r0 + 32], m2
+    movu        [r0 + 48], m3
+
+    pshufb      m4, [shuf_mode32_18]   ; [1 2 3 4 5 6 7 8]
+    pshufb      m5, [shuf_mode32_18]   ; [9 10 11 12 13 14 15 16]
+
+    palignr     m6, m0, m4, 14         ; row 1: every later row shifts the window by one more word into m4
+    movu        [r0 + r1], m6
+    palignr     m6, m1, m0, 14
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m2, m1, 14
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m3, m2, 14
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m0, m4, 12         ; row 2
+    movu        [r0 + r6], m6
+    palignr     m6, m1, m0, 12
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m2, m1, 12
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m3, m2, 12
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m0, m4, 10         ; row 3
+    movu        [r0 + r3], m6
+    palignr     m6, m1, m0, 10
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m2, m1, 10
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m3, m2, 10
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    palignr     m6, m0, m4, 8          ; row 4
+    movu        [r0], m6
+    palignr     m6, m1, m0, 8
+    movu        [r0 + 16], m6
+    palignr     m6, m2, m1, 8
+    movu        [r0 + 32], m6
+    palignr     m6, m3, m2, 8
+    movu        [r0 + 48], m6
+
+    palignr     m6, m0, m4, 6          ; row 5
+    movu        [r0 + r1], m6
+    palignr     m6, m1, m0, 6
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m2, m1, 6
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m3, m2, 6
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m0, m4, 4          ; row 6
+    movu        [r0 + r6], m6
+    palignr     m6, m1, m0, 4
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m2, m1, 4
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m3, m2, 4
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m0, m4, 2          ; row 7
+    movu        [r0 + r3], m6
+    palignr     m6, m1, m0, 2
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m2, m1, 2
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m3, m2, 2
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    movu        [r0], m4               ; row 8: window has moved a full register; m3 drops out
+    movu        [r0 + 16], m0
+    movu        [r0 + 32], m1
+    movu        [r0 + 48], m2
+
+    palignr     m6, m4, m5, 14         ; row 9: now shifting into m5
+    movu        [r0 + r1], m6
+    palignr     m6, m0, m4, 14
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m1, m0, 14
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m2, m1, 14
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m4, m5, 12         ; row 10
+    movu        [r0 + r6], m6
+    palignr     m6, m0, m4, 12
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m1, m0, 12
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m2, m1, 12
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m4, m5, 10         ; row 11
+    movu        [r0 + r3], m6
+    palignr     m6, m0, m4, 10
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m1, m0, 10
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m2, m1, 10
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    palignr     m6, m4, m5, 8          ; row 12
+    movu        [r0], m6
+    palignr     m6, m0, m4, 8
+    movu        [r0 + 16], m6
+    palignr     m6, m1, m0, 8
+    movu        [r0 + 32], m6
+    palignr     m6, m2, m1, 8
+    movu        [r0 + 48], m6
+
+    palignr     m6, m4, m5, 6          ; row 13
+    movu        [r0 + r1], m6
+    palignr     m6, m0, m4, 6
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m1, m0, 6
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m2, m1, 6
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m4, m5, 4          ; row 14
+    movu        [r0 + r6], m6
+    palignr     m6, m0, m4, 4
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m1, m0, 4
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m2, m1, 4
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m4, m5, 2          ; row 15
+    movu        [r0 + r3], m6
+    palignr     m6, m0, m4, 2
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m1, m0, 2
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m2, m1, 2
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    movu        m2, [r2 + 34]          ; m2/m3 are free now: load the next 16 words after those in m5
+    movu        m3, [r2 + 50]
+    pshufb      m2, [shuf_mode32_18]   ; same word shuffle as applied to m4/m5
+    pshufb      m3, [shuf_mode32_18]
+
+    movu        [r0], m5               ; row 16
+    movu        [r0 + 16], m4
+    movu        [r0 + 32], m0
+    movu        [r0 + 48], m1
+
+    palignr     m6, m5, m2, 14         ; row 17: now shifting into the reloaded m2
+    movu        [r0 + r1], m6
+    palignr     m6, m4, m5, 14
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m0, m4, 14
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m1, m0, 14
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m5, m2, 12         ; row 18
+    movu        [r0 + r6], m6
+    palignr     m6, m4, m5, 12
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m0, m4, 12
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m1, m0, 12
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m5, m2, 10         ; row 19
+    movu        [r0 + r3], m6
+    palignr     m6, m4, m5, 10
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m0, m4, 10
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m1, m0, 10
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    palignr     m6, m5, m2, 8          ; row 20
+    movu        [r0], m6
+    palignr     m6, m4, m5, 8
+    movu        [r0 + 16], m6
+    palignr     m6, m0, m4, 8
+    movu        [r0 + 32], m6
+    palignr     m6, m1, m0, 8
+    movu        [r0 + 48], m6
+
+    palignr     m6, m5, m2, 6          ; row 21
+    movu        [r0 + r1], m6
+    palignr     m6, m4, m5, 6
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m0, m4, 6
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m1, m0, 6
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m5, m2, 4          ; row 22
+    movu        [r0 + r6], m6
+    palignr     m6, m4, m5, 4
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m0, m4, 4
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m1, m0, 4
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m5, m2, 2          ; row 23
+    movu        [r0 + r3], m6
+    palignr     m6, m4, m5, 2
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m0, m4, 2
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m1, m0, 2
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    movu        [r0], m2               ; row 24
+    movu        [r0 + 16], m5
+    movu        [r0 + 32], m4
+    movu        [r0 + 48], m0
+
+    palignr     m6, m2, m3, 14         ; row 25: now shifting into the reloaded m3
+    movu        [r0 + r1], m6
+    palignr     m6, m5, m2, 14
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m4, m5, 14
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m0, m4, 14
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m2, m3, 12         ; row 26
+    movu        [r0 + r6], m6
+    palignr     m6, m5, m2, 12
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m4, m5, 12
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m0, m4, 12
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m2, m3, 10         ; row 27
+    movu        [r0 + r3], m6
+    palignr     m6, m5, m2, 10
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m4, m5, 10
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m0, m4, 10
+    movu        [r0 + r3 + 48], m6
+
+    add         r0, r4                 ; dst += 4 rows
+
+    palignr     m6, m2, m3, 8          ; row 28
+    movu        [r0], m6
+    palignr     m6, m5, m2, 8
+    movu        [r0 + 16], m6
+    palignr     m6, m4, m5, 8
+    movu        [r0 + 32], m6
+    palignr     m6, m0, m4, 8
+    movu        [r0 + 48], m6
+
+    palignr     m6, m2, m3, 6          ; row 29
+    movu        [r0 + r1], m6
+    palignr     m6, m5, m2, 6
+    movu        [r0 + r1 + 16], m6
+    palignr     m6, m4, m5, 6
+    movu        [r0 + r1 + 32], m6
+    palignr     m6, m0, m4, 6
+    movu        [r0 + r1 + 48], m6
+
+    palignr     m6, m2, m3, 4          ; row 30
+    movu        [r0 + r6], m6
+    palignr     m6, m5, m2, 4
+    movu        [r0 + r6 + 16], m6
+    palignr     m6, m4, m5, 4
+    movu        [r0 + r6 + 32], m6
+    palignr     m6, m0, m4, 4
+    movu        [r0 + r6 + 48], m6
+
+    palignr     m6, m2, m3, 2          ; row 31 (last)
+    movu        [r0 + r3], m6
+    palignr     m6, m5, m2, 2
+    movu        [r0 + r3 + 16], m6
+    palignr     m6, m4, m5, 2
+    movu        [r0 + r3 + 32], m6
+    palignr     m6, m0, m4, 2
+    movu        [r0 + r3 + 48], m6
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_19_new, 3,7,7,0-(7*mmsize+4) ; mode 19; builds a 116-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 3*mmsize + 2], m0    ; 64 bytes copied to rsp + 50 .. rsp + 113
+    movu     [rsp + 4*mmsize + 2], m1
+    movu     [rsp + 5*mmsize + 2], m2
+    movu     [rsp + 6*mmsize + 2], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 64 ...
+    mov      [rsp + 114], r4w            ; ... goes right after the copy
+    movu     m0, [r3 + 8]
+    movu     m1, [r3 + 30]
+    movu     m2, [r3 + 50]
+    movd     m3, [r3 + 2]                ; only 4 bytes are loaded for the group nearest the copy
+    pshufb   m0, [shuf_mode_17_19]       ; shuffle tables defined elsewhere (select the projected words)
+    pshufb   m1, [shuf_mode_17_19]
+    pshufb   m2, [shuf_mode_17_19]
+    pshufb   m3, [shuf_mode_16_20]       ; NOTE(review): uses the 16_20 table here, presumably on purpose - confirm
+    movd     [rsp + 46], m3              ; rsp + 46 .. rsp + 49
+    movu     [rsp + 30], m0              ; rsp + 30 .. rsp + 45
+    movu     [rsp + 12], m1              ; rsp + 12 .. rsp + 27
+    movu     [rsp - 4], m2               ; NOTE(review): covers rsp - 4 .. rsp + 11, i.e. 4 bytes below rsp; confirm this is acceptable on ABIs without a red zone
+    mov      r4w, [r3 + 24]              ; three single words are then patched in:
+    mov      [rsp + 30], r4w             ; word at r3 + 24 -> rsp + 30
+    mov      r4w, [r3 + 28]
+    mov      [rsp + 28], r4w             ; word at r3 + 28 -> rsp + 28
+    mov      r4w, [r3 + 46]
+    mov      [rsp + 12], r4w             ; word at r3 + 46 -> rsp + 12
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+    mov      r6, r0                      ; r6 remembers the dst position across passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 17 routine passes 1 instead of 0
+    MODE_17_19 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_20_new, 3,7,7,0-(6*mmsize+10) ; mode 20; builds a 106-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 2*mmsize + 8], m0    ; 64 bytes copied to rsp + 40 .. rsp + 103
+    movu     [rsp + 3*mmsize + 8], m1
+    movu     [rsp + 4*mmsize + 8], m2
+    movu     [rsp + 5*mmsize + 8], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 64 ...
+    mov      [rsp + 104], r4w            ; ... goes right after the copy
+    movu     m0, [r3 + 4]
+    movu     m1, [r3 + 22]
+    movu     m2, [r3 + 40]
+    movd     m3, [r3 + 58]               ; only 4 bytes are loaded for the last group
+    pshufb   m0, [shuf_mode_16_20]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_16_20]
+    pshufb   m2, [shuf_mode_16_20]
+    pshufb   m3, [shuf_mode_16_20]
+    movu     [rsp + 24], m0              ; overlapping 16-byte stores, issued from high to low address,
+    movu     [rsp + 12], m1              ; so each later store overwrites the tail of the next one
+    movu     [rsp], m2
+    movd     [rsp], m3                   ; finally the lowest 4 bytes are replaced
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+    mov      r6, r0                      ; r6 remembers the dst position across passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 16 routine passes 1 instead of 0
+    MODE_16_20 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_21_new, 3,7,7,0-(6*mmsize+2) ; mode 21; builds a 98-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 2*mmsize], m0        ; 64 bytes copied to rsp + 32 .. rsp + 95
+    movu     [rsp + 3*mmsize], m1
+    movu     [rsp + 4*mmsize], m2
+    movu     [rsp + 5*mmsize], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 64 ...
+    mov      [rsp + 96], r4w             ; ... goes right after the copy
+    movu     m0, [r3 + 4]
+    movu     m1, [r3 + 18]
+    movu     m2, [r3 + 34]
+    movu     m3, [r3 + 48]
+    pshufb   m0, [shuf_mode_15_21]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_15_21]
+    pshufb   m2, [shuf_mode_15_21]
+    pshufb   m3, [shuf_mode_15_21]
+    movh     [rsp + 24], m0              ; four 8-byte stores fill rsp .. rsp + 31
+    movh     [rsp + 16], m1
+    movh     [rsp + 8], m2
+    movh     [rsp], m3
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+    mov      r6, r0                      ; r6 remembers the dst position across passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 15 routine passes 1 instead of 0
+    MODE_15_21 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_22_new, 3,7,7,0-(5*mmsize+10) ; mode 22; builds a 90-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 1*mmsize + 8], m0    ; 64 bytes copied to rsp + 24 .. rsp + 87
+    movu     [rsp + 2*mmsize + 8], m1
+    movu     [rsp + 3*mmsize + 8], m2
+    movu     [rsp + 4*mmsize + 8], m3
+
+    mov      r4w, [r2 + 64]              ; word at src + 64 ...
+    mov      [rsp + 88], r4w             ; ... goes right after the copy
+    mov      r4w, [r3+4]                 ; word at r3 + 4 goes directly in front of the copy
+    mov      [rsp+22], r4w
+    movu     m0, [r3 + 10]
+    movu     m1, [r3 + 30]
+    movu     m2, [r3 + 50]
+    pshufb   m0, [shuf_mode_14_22]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_14_22]
+    pshufb   m2, [shuf_mode_14_22]
+    movh     [rsp + 14], m0              ; 8-byte stores covering rsp + 14 .. rsp + 21
+    movh     [rsp + 6], m1               ; rsp + 6 .. rsp + 13
+    movh     [rsp - 2], m2               ; NOTE(review): covers rsp - 2 .. rsp + 5, i.e. 2 bytes below rsp; confirm this is acceptable on ABIs without a red zone
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+    mov      r6, r0                      ; r6 remembers the dst position across passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 14 routine passes 1 instead of 0
+    MODE_14_22 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_23_new, 3,7,7,0-(5*mmsize+2) ; mode 23; builds an 82-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 1*mmsize], m0        ; 64 bytes copied to rsp + 16 .. rsp + 79
+    movu     [rsp + 2*mmsize], m1
+    movu     [rsp + 3*mmsize], m2
+    movu     [rsp + 4*mmsize], m3
+
+    mov      r4w, [r2+64]                ; word at src + 64 ...
+    mov      [rsp+80], r4w               ; ... goes right after the copy
+    movu     m0, [r3 + 8]
+    movu     m1, [r3 + 36]
+    pshufb   m0, [shuf_mode_13_23]       ; shuffle table defined elsewhere (selects the projected words)
+    pshufb   m1, [shuf_mode_13_23]
+    movh     [rsp + 8], m0               ; low 8 bytes of each shuffle result fill rsp .. rsp + 15
+    movh     [rsp], m1
+    mov      r4w, [r3+28]                ; word at r3 + 28 overwrites the word at rsp + 8
+    mov      [rsp+8], r4w
+    mov      r4w, [r3+56]                ; word at r3 + 56 overwrites the word at rsp
+    mov      [rsp], r4w
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+    mov      r6, r0                      ; r6 remembers the dst position across passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 13 routine passes 1 instead of 0
+    MODE_13_23 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz     .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_24_new, 3,7,7,0-(4*mmsize+10) ; mode 24; builds a 74-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+
+    movu     [rsp + 0*mmsize + 8], m0    ; 64 bytes copied to rsp + 8 .. rsp + 71
+    movu     [rsp + 1*mmsize + 8], m1
+    movu     [rsp + 2*mmsize + 8], m2
+    movu     [rsp + 3*mmsize + 8], m3
+
+    mov      r4w, [r2+64]                ; word at src + 64 ...
+    mov      [rsp+72], r4w               ; ... goes right after the copy
+    mov      r4w, [r3+12]                ; the words at r3 + 12, + 26, + 38 and + 52 are placed
+    mov      [rsp+6], r4w                ; in front of the copy, in descending address order
+    mov      r4w, [r3+26]
+    mov      [rsp+4], r4w
+    mov      r4w, [r3+38]
+    mov      [rsp+2], r4w
+    mov      r4w, [r3+52]
+    mov      [rsp], r4w
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mov     r6, r0                       ; r6 remembers the dst position across passes
+    mova     m2, [pw_punpcklwd]          ; constant defined elsewhere; presumably consumed by the macro below
+
+.loop:                                   ; macro body is defined elsewhere; the mode 12 routine passes 1 instead of 0
+    MODE_12_24 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz      .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_25_new, 3,7,7,0-(4*mmsize+4) ; mode 25; builds a 68-byte reference copy on the stack
+    lea      r3, [r2 + 128]              ; r3 = src + 128 bytes; r2 keeps pointing at src
+    movu     m0, [r2 + 0*mmsize]
+    movu     m1, [r2 + 1*mmsize]
+    movu     m2, [r2 + 2*mmsize]
+    movu     m3, [r2 + 3*mmsize]
+    movu     [rsp + 0*mmsize + 2], m0    ; 64 bytes copied to rsp + 2 .. rsp + 65
+    movu     [rsp + 1*mmsize + 2], m1
+    movu     [rsp + 2*mmsize + 2], m2
+    movu     [rsp + 3*mmsize + 2], m3
+    mov      r4w, [r3+32]                ; word at src + 128 + 32 ...
+    mov      [rsp], r4w                  ; ... goes in front of the copy
+    mov      r4w, [r2+64]                ; word at src + 64 ...
+    mov      [rsp+66], r4w               ; ... goes right after the copy
+
+    lea      r3, [ang_table + 16 * 16]   ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    mov      r4d, 8                      ; loop counter: 8 passes
+    mov      r2, rsp                     ; the reference is now read from the stack buffer
+    add      r1, r1                      ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea      r5, [r1 * 3]                ; r5 = 3 * r1
+    mov      r6, r0                      ; r6 remembers the dst position across passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 11 routine passes 1 instead of 0
+    MODE_11_25 0
+    add      r6, 8                       ; next pass starts 8 bytes further right
+    mov      r0, r6                      ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add      r2, 8                       ; reference pointer += 8 bytes
+    dec      r4
+    jnz      .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_26_new, 3,7,4  ; mode 26: every output row is a copy of the same 32 reference words
+    mov         r6d, 4                  ; loop counter: 4 passes of 8 rows
+    add         r1, r1                  ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea         r3, [r1 * 2]            ; r3 = 2 * r1
+    lea         r4, [r1 * 3]            ; r4 = 3 * r1
+    lea         r5, [r1 * 4]            ; r5 = 4 * r1 (advance of four rows)
+    ; the former load of c_mode32_10_0 into m4 was dead (m4 was never read) and has been dropped
+
+    movu        m0, [r2 + 2 ]           ; 64 bytes starting one word past src, loaded once
+    movu        m1, [r2 + 18]
+    movu        m2, [r2 + 34]
+    movu        m3, [r2 + 50]
+
+.loop:
+    movu        [r0], m0                ; row 0 of this pass
+    movu        [r0 + 16], m1
+    movu        [r0 + 32], m2
+    movu        [r0 + 48], m3
+
+    movu        [r0 + r1], m0           ; row 1
+    movu        [r0 + r1 + 16], m1
+    movu        [r0 + r1 + 32], m2
+    movu        [r0 + r1 + 48], m3
+
+    movu        [r0 + r3], m0           ; row 2
+    movu        [r0 + r3 + 16], m1
+    movu        [r0 + r3 + 32], m2
+    movu        [r0 + r3 + 48], m3
+
+    movu        [r0 + r4], m0           ; row 3
+    movu        [r0 + r4 + 16], m1
+    movu        [r0 + r4 + 32], m2
+    movu        [r0 + r4 + 48], m3
+
+    add         r0, r5                  ; dst += 4 rows
+
+    movu        [r0], m0                ; row 4
+    movu        [r0 + 16], m1
+    movu        [r0 + 32], m2
+    movu        [r0 + 48], m3
+
+    movu        [r0 + r1], m0           ; row 5
+    movu        [r0 + r1 + 16], m1
+    movu        [r0 + r1 + 32], m2
+    movu        [r0 + r1 + 48], m3
+
+    movu        [r0 + r3], m0           ; row 6
+    movu        [r0 + r3 + 16], m1
+    movu        [r0 + r3 + 32], m2
+    movu        [r0 + r3 + 48], m3
+
+    movu        [r0 + r4], m0           ; row 7
+    movu        [r0 + r4 + 16], m1
+    movu        [r0 + r4 + 32], m2
+    movu        [r0 + r4 + 48], m3
+
+    add         r0, r5                  ; dst += 4 rows
+    dec         r6d
+    jnz         .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_27_new, 3,7,8  ; 32x32 angular prediction, mode 27 (MODE_9_27 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 9 routine passes 1 instead of 0
+    MODE_9_27 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_28_new, 3,7,8  ; 32x32 angular prediction, mode 28 (MODE_8_28 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 8 routine passes 1 instead of 0
+    MODE_8_28 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_29_new, 3,7,8  ; 32x32 angular prediction, mode 29 (MODE_7_29 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 7 routine passes 1 instead of 0
+    MODE_7_29 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_30_new, 3,7,8  ; 32x32 angular prediction, mode 30 (MODE_6_30 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 6 routine passes 1 instead of 0
+    MODE_6_30 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_31_new, 3,7,8  ; 32x32 angular prediction, mode 31 (MODE_5_31 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 5 routine passes 1 instead of 0
+    MODE_5_31 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_32_new, 3,7,8  ; 32x32 angular prediction, mode 32 (MODE_4_32 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+
+.loop:                                   ; macro body is defined elsewhere; the mode 4 routine passes 1 instead of 0
+    MODE_4_32 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang32_33_new, 3,7,8  ; 32x32 angular prediction, mode 33 (MODE_3_33 with argument 0)
+    lea    r3, [ang_table + 16 * 16]     ; r3 = ang_table + 256 bytes (table defined elsewhere)
+    add    r1, r1                        ; r1 = 2 * dstStride (16bpp build: presumably samples -> bytes)
+    lea    r5, [r1 * 3]                  ; r5 = 3 * r1
+    mov    r6, r0                        ; r6 remembers the dst position across passes
+    mov    r4d, 8                        ; loop counter: 8 passes
+.loop:                                   ; macro body is defined elsewhere; the mode 3 routine passes 1 instead of 0
+    MODE_3_33 0
+    add    r6, 8                         ; next pass starts 8 bytes further right
+    mov    r0, r6                        ; dst is reset from r6 - presumably the macro moves r0 (body not visible)
+    add    r2, 8                         ; reference pointer += 8 bytes (r2 starts at src, no +128 offset here)
+    dec    r4
+    jnz    .loop
+    RET
diff -r 38f8903130fd -r 10b8acec46ea source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Fri Jan 02 18:16:26 2015 +0530
+++ b/source/test/intrapredharness.cpp	Tue Jan 13 11:38:04 2015 +0530
@@ -164,7 +164,10 @@
                 for (int k = 0; k < width; k++)
                 {
                     if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
+                    {
+                        printf("ang_%dx%d, Mode = %d, Row = %d failed !!\n", width, width, pmode, k);
                         return false;
+                    }
                 }
 
                 reportfail();



More information about the x265-devel mailing list