[x265] [PATCH 20 of 29] 16bpp: cleanup intra_ang8x8, intra_ang16x16 and intra_ang32x32 older asm code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:28 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1420528688 -19800
#      Tue Jan 06 12:48:08 2015 +0530
# Node ID 063d9417105a345b169129c3bb2a259d6c7e8b06
# Parent  10b8acec46ea4cdc43023b95e379588e86438601
16bpp: cleanup intra_ang8x8, intra_ang16x16 and intra_ang32x32 older asm code

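For reference, every deleted ang8/ang16/ang32 kernel performs the same two-tap angular weighting that the surviving "_new_" asm versions also implement: the pmaddwd coefficients are loaded from ang_table, and each "paddd [pd_16]; psrld 5" pair is the rounding shift. A rough scalar sketch of that per-pixel blend (function and parameter names here are purely illustrative, not x265 API):

#include <stdint.h>

/* frac is the fractional sample position in 1/32 units (0..31), matching the
 * bracketed constants ([26], [20], [14], ...) in the deleted code's comments.
 * The "+ 16) >> 5" matches the paddd [pd_16] / psrld 5 rounding in the asm. */
static inline uint16_t angular_blend(uint16_t ref0, uint16_t ref1, int frac)
{
    return (uint16_t)(((32 - frac) * ref0 + frac * ref1 + 16) >> 5);
}
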
diff -r 10b8acec46ea -r 063d9417105a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jan 13 11:38:04 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jan 06 12:48:08 2015 +0530
@@ -953,11 +953,6 @@
     p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
 
-#define SETUP_INTRA_ANG(mode, fno, cpu) \
-    p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
-    p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
-    p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
-
 #define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
     p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
@@ -1021,21 +1016,6 @@
     SETUP_INTRA_ANG_COMMON_NEW(18, 18, cpu);
 
 #define INTRA_ANG_SSE4_HIGH(cpu) \
-    SETUP_INTRA_ANG(19, 19, cpu); \
-    SETUP_INTRA_ANG(20, 20, cpu); \
-    SETUP_INTRA_ANG(21, 21, cpu); \
-    SETUP_INTRA_ANG(22, 22, cpu); \
-    SETUP_INTRA_ANG(23, 23, cpu); \
-    SETUP_INTRA_ANG(24, 24, cpu); \
-    SETUP_INTRA_ANG(25, 25, cpu); \
-    SETUP_INTRA_ANG(26, 26, cpu); \
-    SETUP_INTRA_ANG(27, 27, cpu); \
-    SETUP_INTRA_ANG(28, 28, cpu); \
-    SETUP_INTRA_ANG(29, 29, cpu); \
-    SETUP_INTRA_ANG(30, 30, cpu); \
-    SETUP_INTRA_ANG(31, 31, cpu); \
-    SETUP_INTRA_ANG(32, 32, cpu); \
-    SETUP_INTRA_ANG(33, 33, cpu); \
     SETUP_INTRA_ANG4(19, 17, cpu); \
     SETUP_INTRA_ANG4(20, 16, cpu); \
     SETUP_INTRA_ANG4(21, 15, cpu); \
diff -r 10b8acec46ea -r 063d9417105a source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Tue Jan 13 11:38:04 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Jan 06 12:48:08 2015 +0530
@@ -1277,3660 +1277,6 @@
     movh        [r0], m0
     RET
 
-;-----------------------------------------------------------------------------
-; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang8_2, 3,4,3
-    cmp         r4m,           byte 34
-    cmove       r2,            r3mp
-    add         r1,            r1
-    lea         r3,            [r1 * 3]
-    movu        m0,            [r2 + 4]
-    movu        m1,            [r2 + 20]
-    movu        [r0],          m0
-    palignr     m2,            m1, m0, 2
-    movu        [r0 + r1],     m2
-    palignr     m2,            m1, m0, 4
-    movu        [r0 + r1 * 2], m2
-    palignr     m2,            m1, m0, 6
-    movu        [r0 + r3],     m2
-    lea         r0,            [r0 + r1 * 4]
-    palignr     m2,            m1, m0, 8
-    movu        [r0],          m2
-    palignr     m2,            m1, m0, 10
-    movu        [r0 + r1],     m2
-    palignr     m2,            m1, m0, 12
-    movu        [r0 + r1 * 2], m2
-    palignr     m1,            m0, 14
-    movu        [r0 + r3],     m1
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang8_3, 3,5,8
-    lea         r3,        [ang_table + 14 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-    punpckhwd   m1,        m4                         ; [x 16 16 15 15 14 14 13]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 12 * 16]             ; [26]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 12 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    pmaddwd     m2,        [r3 + 6 * 16]              ; [20]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m6,        [r3 + 6 * 16]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m2,        m6
-
-    palignr     m6,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m6,        [r3]                       ; [14]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m7,        [r3]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    pmaddwd     m7,        [r3 - 6 * 16]              ; [ 8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
-    pmaddwd     m3,        [r3 - 6 * 16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    packusdw    m7,        m3
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m7
-    movhps      [r0 + r1],       m7
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m3
-    movhps      [r2 + r4],       m3
-
-    mova        m4,        m0
-    pmaddwd     m4,        [r3 - 12 * 16]             ; [ 2]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m5
-    pmaddwd     m2,        [r3 - 12 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 14 * 16]             ; [28]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m6,        m5
-    pmaddwd     m6,        [r3 + 14 * 16]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m2,        m6
-
-    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m6,        [r3 + 8 * 16]              ; [22]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m7,        m1, m5, 4                  ; [14 13 13 12 12 11 11 10]
-    pmaddwd     m7,        [r3 + 8 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m7,        [r3 + 2 * 16]              ; [16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, 8                      ; [15 14 14 13 13 12 12 11]
-    pmaddwd     m1,        [r3 + 2 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    movh        [r0 + 8],            m7
-    movhps      [r0 + r1 + 8],       m7
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m3
-    movhps      [r0 + r4 + 8],       m3
-
-    RET
-
-cglobal intra_pred_ang8_4, 3,6,8
-    lea         r3,        [ang_table + 19 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 2 * 16]              ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 2 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m6,        m2
-    pmaddwd     m2,        [r3 - 9 * 16]              ; [10]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    mova        m7,        m1
-    pmaddwd     m1,        [r3 - 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 12 * 16]             ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 12 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m7,        [r3 + 1 * 16]              ; [20]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m1,        [r3 + 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m7
-    movhps      [r0 + r1],       m7
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r5,              [r0 + r1 * 4]
-    movh        [r5],            m6
-    movhps      [r5 + r1],       m6
-    movh        [r5 + r1 * 2],   m1
-    movhps      [r5 + r4],       m1
-
-    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    mova        m2,        m4
-    pmaddwd     m4,        [r3 - 10 * 16]             ; [ 9]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
-    mova        m6,        m3
-    pmaddwd     m3,        [r3 - 10 * 16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    packusdw    m4,        m3
-
-    pmaddwd     m2,        [r3 + 11 * 16]             ; [30]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    pmaddwd     m6,        [r3 + 11 * 16]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m2,        m6
-
-    mova        m6,        m0
-    pmaddwd     m6,        [r3]                       ; [19]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m7,        m5
-    pmaddwd     m7,        [r3]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    movh        m1,        [r2 + 26]                  ; [16 15 14 13]
-    palignr     m7,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m7,        [r3 - 11 * 16]             ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, 4                      ; [14 13 13 12 12 11 11 10]
-    pmaddwd     m1,        [r3 - 11 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    movh        [r0 + 8],            m7
-    movhps      [r0 + r1 + 8],       m7
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m3
-    movhps      [r0 + r4 + 8],       m3
-
-    RET
-
-cglobal intra_pred_ang8_5, 3,5,8
-    lea         r3,        [ang_table + 13 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 4 * 16]              ; [17]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 4 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m6,        m2
-    pmaddwd     m2,        [r3 - 11 * 16]             ; [2]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    mova        m7,        m1
-    pmaddwd     m1,        [r3 - 11 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 6 * 16]              ; [19]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 6 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m7,        [r3 - 9 * 16]              ; [4]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m1,        [r3 - 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m7
-    movhps      [r0 + r1],       m7
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m1
-    movhps      [r2 + r4],       m1
-
-    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m4,        [r3 + 8 * 16]              ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m2,        [r3 + 8 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    mova        m6,        m2
-    pmaddwd     m2,        [r3 - 7 * 16]              ; [6]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
-    mova        m7,        m1
-    pmaddwd     m1,        [r3 - 7 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 10 * 16]             ; [23]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 10 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    mova        m7,        m0
-    pmaddwd     m7,        [r3 - 5 * 16]              ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    mova        m1,        m5
-    pmaddwd     m1,        [r3 - 5 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    movh        [r0 + 8],            m7
-    movhps      [r0 + r1 + 8],       m7
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m3
-    movhps      [r0 + r4 + 8],       m3
-
-    RET
-
-cglobal intra_pred_ang8_6, 3,5,8
-    lea         r3,        [ang_table + 14 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 1 * 16]              ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 12 * 16]             ; [26]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 12 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m7,        m6
-    pmaddwd     m6,        [r3 - 7 * 16]              ; [7]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m1,        [r3 - 7 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m7,        [r3 + 6 * 16]              ; [20]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m1,        [r3 + 6 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m7
-    movhps      [r0 + r1],       m7
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m1
-    movhps      [r2 + r4],       m1
-
-    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    mova        m6,        m4
-    pmaddwd     m4,        [r3 - 13 * 16]             ; [1]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    mova        m7,        m2
-    pmaddwd     m2,        [r3 - 13 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    pmaddwd     m2,        m6, [r3]                   ; [14]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    pmaddwd     m1,        m7, [r3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 13 * 16]             ; [27]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 13 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    pmaddwd     m7,        [r3 - 6 * 16]              ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m5,        m0, 12                     ; [12 11 11 10 10 9 9 8]
-    pmaddwd     m5,        [r3 - 6 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m7,        m5
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    movh        [r0 + 8],            m7
-    movhps      [r0 + r1 + 8],       m7
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m3
-    movhps      [r0 + r4 + 8],       m3
-
-    RET
-
-cglobal intra_pred_ang8_7, 3,5,8
-    lea         r3,        [ang_table + 18 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 9 * 16]              ; [9]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3]                       ; [18]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 + 9 * 16]              ; [27]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m7,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    pmaddwd     m7,        [r3 - 14 * 16]             ; [4]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m1,        [r3 - 14 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m7
-    movhps      [r0 + r1],       m7
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m1
-    movhps      [r2 + r4],       m1
-
-    palignr     m4,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m6,        m4
-    pmaddwd     m4,        [r3 - 5 * 16]              ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m2,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    mova        m7,        m2
-    pmaddwd     m2,        [r3 - 5 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    pmaddwd     m2,        m6, [r3 + 4 * 16]          ; [22]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    pmaddwd     m1,        m7, [r3 + 4 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 13 * 16]             ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 13 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m7,        [r3 - 10 * 16]             ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m5,        m0, 8                      ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m5,        [r3 - 10 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m7,        m5
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m7
-    punpcklwd   m6,        m7
-
-    punpckldq   m7,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    movh        [r0 + 8],            m7
-    movhps      [r0 + r1 + 8],       m7
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m3
-    movhps      [r0 + r4 + 8],       m3
-
-    RET
-
-cglobal intra_pred_ang8_8, 3,6,7
-    lea         r3,        [ang_table + 17 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
-
-    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 12 * 16]             ; [5]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 12 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 - 7 * 16]              ; [10]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 7 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 - 2 * 16]              ; [15]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 2 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r3 + 3 * 16]              ; [20]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r5,              [r0 + r1 * 4]
-    movh        [r5],            m6
-    movhps      [r5 + r1],       m6
-    movh        [r5 + r1 * 2],   m1
-    movhps      [r5 + r4],       m1
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 8 * 16]              ; [25]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 8 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 13 * 16]             ; [30]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 13 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    movh        m1,        [r2 + 18]                  ; [12 11 10 9]
-
-    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m5,        m6
-    pmaddwd     m6,        [r3 - 14 * 16]             ; [3]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m1,        m0, 4                      ; [10 9 9 8 8 7 7 6]
-    mova        m3,        m1
-    pmaddwd     m1,        [r3 - 14 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m5,        [r3 - 9 * 16]              ; [8]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    pmaddwd     m3,        [r3 - 9 * 16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    packusdw    m5,        m3
-
-    punpckhwd   m3,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m3, m2
-    punpckhdq   m3,        m2
-
-    movh        [r0 + 8],            m5
-    movhps      [r0 + r1 + 8],       m5
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m3
-    movhps      [r0 + r4 + 8],       m3
-
-    RET
-
-cglobal intra_pred_ang8_9, 3,5,7
-    lea         r3,        [ang_table + 9 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
-
-    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 7 * 16]              ; [2]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 7 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 - 5 * 16]              ; [4]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 5 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 - 3 * 16]              ; [6]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r3 - 1 * 16]              ; [8]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m1
-    movhps      [r2 + r4],       m1
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 1 * 16]              ; [10]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 3 * 16]              ; [12]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 + 5 * 16]              ; [14]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r3 + 5 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pmaddwd     m3,        [r3 + 7 * 16]              ; [16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r3 + 7 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_10, 4,5,3
-    movu        m1,             [r2 + 2]            ; [8 7 6 5 4 3 2 1]
-    pshufb      m0,             m1, [pw_unpackwdq]  ; [1 1 1 1 1 1 1 1]
-    add         r1,             r1
-    lea         r4,             [r1 * 3]
-
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [2 2 2 2 2 2 2 2]
-    movu        [r0 + r1],      m2
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [3 3 3 3 3 3 3 3]
-    movu        [r0 + r1 * 2],  m2
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [4 4 4 4 4 4 4 4]
-    movu        [r0 + r4],      m2
-
-    lea         r2,             [r0 + r1 *4]
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [5 5 5 5 5 5 5 5]
-    movu        [r2],           m2
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [6 6 6 6 6 6 6 6]
-    movu        [r2 + r1],      m2
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [7 7 7 7 7 7 7 7]
-    movu        [r2 + r1 * 2],  m2
-    psrldq      m1,             2
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [8 8 8 8 8 8 8 8]
-    movu        [r2 + r4],      m2
-
-    cmp         r5m,            byte 0
-    jz         .quit
-
-    ; filter
-
-    movh        m1,             [r3]                ; [3 2 1 0]
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
-    movu        m1,             [r3 + 2]            ; [8 7 6 5 4 3 2 1]
-    psubw       m1,             m2
-    psraw       m1,             1
-    paddw       m0,             m1
-    pxor        m1,             m1
-    pmaxsw      m0,             m1
-    pminsw      m0,             [pw_1023]
-
-.quit:
-    movu        [r0],           m0
-    RET
-
-cglobal intra_pred_ang8_11, 3,5,7
-    lea         r3,        [ang_table + 23 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 7 * 16]              ; [30]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 7 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 5 * 16]              ; [28]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 5 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 + 3 * 16]              ; [26]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r3 + 1 * 16]              ; [24]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m1
-    movhps      [r2 + r4],       m1
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 1 * 16]              ; [22]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 - 3 * 16]              ; [20]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 - 5 * 16]              ; [18]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r3 - 5 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pmaddwd     m3,        [r3 - 7 * 16]              ; [16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r3 - 7 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_12, 4,6,7
-    lea         r5,        [ang_table + 16 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 11 * 16]             ; [27]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 11 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 6 * 16]              ; [22]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 + 6 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 1 * 16]              ; [17]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 + 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 - 4 * 16]              ; [12]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 4 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    punpckhwd   m1,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m1, m2
-    punpckhdq   m1,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m1
-    movhps      [r2 + r4],       m1
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 9 * 16]              ; [7]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 14 * 16]             ; [2]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 14 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r3]
-    pshufb      m1,        [pw_ang8_12]
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 13 * 16]             ; [29]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pmaddwd     m3,        [r5 + 8 * 16]              ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 8 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_13, 4,6,8
-    lea         r5,        [ang_table + 14 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 9 * 16]              ; [23]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5]                       ; [14]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 9 * 16]              ; [5]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r3]
-    pshufb      m1,        [pw_ang8_13]
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 + 14 * 16]             ; [28]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 + 14 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    punpckhwd   m7,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m7, m2
-    punpckhdq   m7,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m7
-    movhps      [r2 + r4],       m7
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 5 * 16]              ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 5 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 4 * 16]              ; [10]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 4 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 13 * 16]             ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    pmaddwd     m3,        [r5 + 10 * 16]             ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 10 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_14, 4,6,8
-    lea         r5,        [ang_table + 18 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 1 * 16]              ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 12 * 16]             ; [6]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 12 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r3]
-    pshufb      m1,        [pw_ang8_14]
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 7 * 16]              ; [25]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 7 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 - 6 * 16]              ; [12]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 - 6 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    punpckhwd   m7,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m7, m2
-    punpckhdq   m7,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m7
-    movhps      [r2 + r4],       m7
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 13 * 16]             ; [31]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 13 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5]                       ; [18]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 13 * 16]             ; [5]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    pmaddwd     m3,        [r5 + 6 * 16]              ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 6 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_15, 4,6,8
-    lea         r5,        [ang_table + 20 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 5 * 16]              ; [15]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 5 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r3]
-    pshufb      m1,        [pw_ang8_15]
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 10 * 16]             ; [30]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 10 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 7 * 16]              ; [13]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 7 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 + 8 * 16]              ; [28]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 + 8 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    punpckhwd   m7,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m7, m2
-    punpckhdq   m7,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m7
-    movhps      [r2 + r4],       m7
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 9 * 16]              ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 6 * 16]              ; [26]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 6 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 11 * 16]             ; [9]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 11 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-    pinsrw      m3,        [r3 + 16], 0
-
-    pmaddwd     m3,        [r5 + 4 * 16]              ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 4 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_16, 4,6,8
-    lea         r5,        [ang_table + 13 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 2 * 16]              ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 2 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r3]
-    pshufb      m1,        [pw_ang8_16]
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 9 * 16]              ; [22]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 9 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 12 * 16]             ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 12 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 - 1 * 16]              ; [12]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 - 1 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    punpckhwd   m7,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m7, m2
-    punpckhdq   m7,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m7
-    movhps      [r2 + r4],       m7
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 10 * 16]             ; [23]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 10 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 11 * 16]             ; [2]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 11 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5]                       ; [13]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-    pinsrw      m3,        [r3 + 16], 0
-
-    pmaddwd     m3,        [r5 + 11 * 16]             ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 11 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_17, 4,6,8
-    lea         r5,        [ang_table + 17 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 11 * 16]             ; [6]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 11 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r3]
-    pshufb      m1,        [pw_ang8_17]
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 5 * 16]              ; [12]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 5 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 1 * 16]              ; [18]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 1 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 + 7 * 16]              ; [24]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 + 7 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    punpckhwd   m7,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m5
-    punpcklwd   m6,        m5
-
-    punpckldq   m5,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m7, m2
-    punpckhdq   m7,        m2
-
-    lea         r4,              [r1 * 3]
-    movh        [r0],            m5
-    movhps      [r0 + r1],       m5
-    movh        [r0 + r1 * 2],   m4
-    movhps      [r0 + r4],       m4
-    lea         r2,              [r0 + r1 * 4]
-    movh        [r2],            m6
-    movhps      [r2 + r1],       m6
-    movh        [r2 + r1 * 2],   m7
-    movhps      [r2 + r4],       m7
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 13 * 16]             ; [30]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 13 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 13 * 16]             ; [4]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 7 * 16]              ; [10]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 7 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    pmaddwd     m3,        [r5 - 1 * 16]              ; [16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 - 1 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    punpckhwd   m5,        m4, m2
-    punpcklwd   m4,        m2
-    punpckhwd   m2,        m6, m3
-    punpcklwd   m6,        m3
-
-    punpckldq   m3,        m4, m6
-    punpckhdq   m4,        m6
-    punpckldq   m6,        m5, m2
-    punpckhdq   m5,        m2
-
-    movh        [r0 + 8],            m3
-    movhps      [r0 + r1 + 8],       m3
-    movh        [r0 + r1 * 2 + 8],   m4
-    movhps      [r0 + r4 + 8],       m4
-    lea         r0,                  [r0 + r1 * 4]
-    movh        [r0 + 8],            m6
-    movhps      [r0 + r1 + 8],       m6
-    movh        [r0 + r1 * 2 + 8],   m5
-    movhps      [r0 + r4 + 8],       m5
-
-    RET
-
-cglobal intra_pred_ang8_18, 4,5,3
-    add         r1,              r1
-    lea         r4,              [r1 * 3]
-    movu        m1,              [r3]
-    movu        m0,              [r2 + 2]
-    pshufb      m0,              [pw_swap16]
-    movu        [r0],            m1
-    palignr     m2,              m1, m0, 14
-    movu        [r0 + r1],       m2
-    palignr     m2,              m1, m0, 12
-    movu        [r0 + r1 * 2],   m2
-    palignr     m2,              m1, m0, 10
-    movu        [r0 + r4],       m2
-    lea         r0,              [r0 + r1 * 4]
-    palignr     m2,              m1, m0, 8
-    movu        [r0],            m2
-    palignr     m2,              m1, m0, 6
-    movu        [r0 + r1],       m2
-    palignr     m2,              m1, m0, 4
-    movu        [r0 + r1 * 2],   m2
-    palignr     m1,              m0, 2
-    movu        [r0 + r4],       m1
-    RET
-
-cglobal intra_pred_ang8_19, 4,6,8
-    lea         r5,        [ang_table + 17 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 11 * 16]             ; [6]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 11 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r2]
-    pshufb      m1,        [pw_ang8_17]
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 5 * 16]              ; [12]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 5 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 1 * 16]              ; [18]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 1 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 + 7 * 16]              ; [24]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 + 7 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 13 * 16]             ; [30]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 13 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 13 * 16]             ; [4]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 7 * 16]              ; [10]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 7 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    pmaddwd     m3,        [r5 - 1 * 16]              ; [16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 - 1 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_20, 4,6,8
-    lea         r5,        [ang_table + 13 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 2 * 16]              ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 2 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r2]
-    pshufb      m1,        [pw_ang8_16]
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 9 * 16]              ; [22]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 9 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 12 * 16]             ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 12 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 - 1 * 16]              ; [12]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 - 1 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 10 * 16]             ; [23]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 10 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 11 * 16]             ; [2]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 11 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5]                       ; [13]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-    pinsrw      m3,        [r2 + 16], 0
-
-    pmaddwd     m3,        [r5 + 11 * 16]             ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 11 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_21, 4,6,8
-    lea         r5,        [ang_table + 20 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 5 * 16]              ; [15]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 5 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r2]
-    pshufb      m1,        [pw_ang8_15]
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 10 * 16]             ; [30]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 10 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 7 * 16]              ; [13]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 7 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 + 8 * 16]              ; [28]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 + 8 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 9 * 16]              ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 6 * 16]              ; [26]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 6 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 11 * 16]             ; [9]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 11 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-    pinsrw      m3,        [r2 + 16], 0
-
-    pmaddwd     m3,        [r5 + 4 * 16]              ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 4 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_22, 4,6,8
-    lea         r5,        [ang_table + 18 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 1 * 16]              ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 12 * 16]             ; [6]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 12 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r2]
-    pshufb      m1,        [pw_ang8_14]
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 7 * 16]              ; [25]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 7 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 - 6 * 16]              ; [12]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 - 6 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 13 * 16]             ; [31]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 13 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5]                       ; [18]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 13 * 16]             ; [5]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    pmaddwd     m3,        [r5 + 6 * 16]              ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 6 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_23, 4,6,8
-    lea         r5,        [ang_table + 14 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 9 * 16]              ; [23]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5]                       ; [14]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 9 * 16]              ; [5]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r2]
-    pshufb      m1,        [pw_ang8_13]
-    palignr     m3,        m1, 12
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 + 14 * 16]             ; [28]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m7,        m0
-    pmaddwd     m7,        [r5 + 14 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m5,        m7
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 5 * 16]              ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 5 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 4 * 16]              ; [10]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 4 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m2,        m5
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 - 13 * 16]             ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 - 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pslldq      m1,        2
-    palignr     m0,        m3, 12
-    palignr     m3,        m1, 12
-
-    pmaddwd     m3,        [r5 + 10 * 16]             ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 10 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_24, 4,6,7
-    lea         r5,        [ang_table + 16 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 + 11 * 16]             ; [27]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 + 11 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 + 6 * 16]              ; [22]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 + 6 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 1 * 16]              ; [17]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 + 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r5 - 4 * 16]              ; [12]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 4 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r5 - 9 * 16]              ; [7]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r5 - 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r5 - 14 * 16]             ; [2]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r5 - 14 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    palignr     m0,        m3, 12
-    movu        m1,        [r2]
-    pshufb      m1,        [pw_ang8_12]
-    palignr     m3,        m1, 12
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r5 + 13 * 16]             ; [29]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r5 + 13 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pmaddwd     m3,        [r5 + 8 * 16]              ; [24]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r5 + 8 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_25, 3,5,7
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 23 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
-    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-
-    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
-    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 7 * 16]              ; [30]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 7 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 5 * 16]              ; [28]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 5 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 + 3 * 16]              ; [26]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r3 + 1 * 16]              ; [24]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 1 * 16]              ; [22]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 - 3 * 16]              ; [20]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 - 5 * 16]              ; [18]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r3 - 5 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pmaddwd     m3,        [r3 - 7 * 16]              ; [16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r3 - 7 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_26, 4,5,3
-    movu        m0,             [r3 + 2]            ; [8 7 6 5 4 3 2 1]
-    add         r1,             r1
-    lea         r4,             [r1 * 3]
-
-    movu        [r0],           m0
-    movu        [r0 + r1],      m0
-    movu        [r0 + r1 * 2],  m0
-    movu        [r0 + r4],      m0
-
-    lea         r3,             [r0 + r1 *4]
-    movu        [r3],           m0
-    movu        [r3 + r1],      m0
-    movu        [r3 + r1 * 2],  m0
-    movu        [r3 + r4],      m0
-
-    cmp         r5m,            byte 0
-    jz         .quit
-
-    ; filter
-
-    pshufb      m0,             [pw_unpackwdq]
-    movh        m1,             [r2]                ; [3 2 1 0]
-    pshufb      m2,             m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
-    movu        m1,             [r2 + 2]            ; [8 7 6 5 4 3 2 1]
-    psubw       m1,             m2
-    psraw       m1,             1
-    paddw       m0,             m1
-    pxor        m1,             m1
-    pmaxsw      m0,             m1
-    pminsw      m0,             [pw_1023]
-    pextrw      [r0],          m0, 0
-    pextrw      [r0 + r1],     m0, 1
-    pextrw      [r0 + r1 * 2], m0, 2
-    pextrw      [r0 + r4],     m0, 3
-    pextrw      [r3],          m0, 4
-    pextrw      [r3 + r1],     m0, 5
-    pextrw      [r3 + r1 * 2], m0, 6
-    pextrw      [r3 + r4],     m0, 7
-
-.quit:
-    RET
-
-cglobal intra_pred_ang8_27, 3,5,7
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 9 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
-
-    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 7 * 16]              ; [2]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 7 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 - 5 * 16]              ; [4]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 5 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 - 3 * 16]              ; [6]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r3 - 1 * 16]              ; [8]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 1 * 16]              ; [10]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 3 * 16]              ; [12]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 + 5 * 16]              ; [14]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m5,        m0
-    pmaddwd     m5,        [r3 + 5 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m6,        m5
-
-    pmaddwd     m3,        [r3 + 7 * 16]              ; [16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    pmaddwd     m0,        [r3 + 7 * 16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m3,        m0
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m3
-
-    RET
-
-cglobal intra_pred_ang8_28, 3,5,7
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 17 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
-
-    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 12 * 16]             ; [5]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 12 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 - 7 * 16]              ; [10]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 7 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 - 2 * 16]              ; [15]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 - 2 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    mova        m5,        m3
-    pmaddwd     m5,        [r3 + 3 * 16]              ; [20]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 3 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 8 * 16]              ; [25]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 8 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 13 * 16]             ; [30]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 13 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    movh        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-
-    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m5,        m6
-    pmaddwd     m6,        [r3 - 14 * 16]             ; [3]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m1,        m0, 4                      ; [10 9 9 8 8 7 7 6]
-    mova        m3,        m1
-    pmaddwd     m1,        [r3 - 14 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m5,        [r3 - 9 * 16]              ; [8]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    pmaddwd     m3,        [r3 - 9 * 16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    packusdw    m5,        m3
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m5
-
-    RET
-
-cglobal intra_pred_ang8_29, 3,5,8
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 18 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 9 * 16]              ; [9]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 9 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3]                       ; [18]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    mova        m6,        m3
-    pmaddwd     m6,        [r3 + 9 * 16]              ; [27]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m7,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    pmaddwd     m7,        [r3 - 14 * 16]             ; [4]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m1,        [r3 - 14 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    palignr     m4,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m6,        m4
-    pmaddwd     m4,        [r3 - 5 * 16]              ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m2,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    mova        m7,        m2
-    pmaddwd     m2,        [r3 - 5 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    pmaddwd     m2,        m6, [r3 + 4 * 16]          ; [22]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    pmaddwd     m1,        m7, [r3 + 4 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 13 * 16]             ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 13 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m7,        [r3 - 10 * 16]             ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m5,        m0, 8                      ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m5,        [r3 - 10 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m7,        m5
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    RET
-
-cglobal intra_pred_ang8_30, 3,5,8
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 14 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 - 1 * 16]              ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 - 1 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m3
-    pmaddwd     m2,        [r3 + 12 * 16]             ; [26]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m1,        m0
-    pmaddwd     m1,        [r3 + 12 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m7,        m6
-    pmaddwd     m6,        [r3 - 7 * 16]              ; [7]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m1,        [r3 - 7 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m7,        [r3 + 6 * 16]              ; [20]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m1,        [r3 + 6 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    mova        m6,        m4
-    pmaddwd     m4,        [r3 - 13 * 16]             ; [1]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    mova        m7,        m2
-    pmaddwd     m2,        [r3 - 13 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    pmaddwd     m2,        m6, [r3]                   ; [14]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    pmaddwd     m1,        m7, [r3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 13 * 16]             ; [27]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 13 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    pmaddwd     m7,        [r3 - 6 * 16]              ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m5,        m0, 12                     ; [12 11 11 10 10 9 9 8]
-    pmaddwd     m5,        [r3 - 6 * 16]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-    packusdw    m7,        m5
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    RET
-
-cglobal intra_pred_ang8_31, 3,5,8
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 13 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 4 * 16]              ; [17]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 4 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m6,        m2
-    pmaddwd     m2,        [r3 - 11 * 16]             ; [2]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    mova        m7,        m1
-    pmaddwd     m1,        [r3 - 11 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 6 * 16]              ; [19]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 6 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m7,        [r3 - 9 * 16]              ; [4]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m1,        [r3 - 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m4,        [r3 + 8 * 16]              ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m2,        [r3 + 8 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    mova        m6,        m2
-    pmaddwd     m2,        [r3 - 7 * 16]              ; [6]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
-    mova        m7,        m1
-    pmaddwd     m1,        [r3 - 7 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 10 * 16]             ; [23]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 10 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    mova        m7,        m0
-    pmaddwd     m7,        [r3 - 5 * 16]              ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    mova        m1,        m5
-    pmaddwd     m1,        [r3 - 5 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    RET
-
-cglobal intra_pred_ang8_32, 3,6,8
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 19 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 2 * 16]              ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 2 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    mova        m6,        m2
-    pmaddwd     m2,        [r3 - 9 * 16]              ; [10]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    mova        m7,        m1
-    pmaddwd     m1,        [r3 - 9 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m2,        m1
-
-    pmaddwd     m6,        [r3 + 12 * 16]             ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    pmaddwd     m7,        [r3 + 12 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m7,        [r3 + 1 * 16]              ; [20]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m1,        [r3 + 1 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    mova        m2,        m4
-    pmaddwd     m4,        [r3 - 10 * 16]             ; [ 9]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
-    mova        m6,        m3
-    pmaddwd     m3,        [r3 - 10 * 16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    packusdw    m4,        m3
-
-    pmaddwd     m2,        [r3 + 11 * 16]             ; [30]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    pmaddwd     m6,        [r3 + 11 * 16]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m2,        m6
-
-    mova        m6,        m0
-    pmaddwd     m6,        [r3]                       ; [19]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    mova        m7,        m5
-    pmaddwd     m7,        [r3]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    movh        m1,        [r2 + 26]                  ; [16 15 14 13]
-    palignr     m7,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m7,        [r3 - 11 * 16]             ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, 4                      ; [14 13 13 12 12 11 11 10]
-    pmaddwd     m1,        [r3 - 11 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    RET
-
-cglobal intra_pred_ang8_33, 3,5,8
-    mov         r2,        r3mp
-    lea         r3,        [ang_table + 14 * 16]
-    add         r1,        r1
-
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
-
-    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
-    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
-    punpckhwd   m1,        m4                         ; [x 16 16 15 15 14 14 13]
-
-    mova        m4,        m3
-    pmaddwd     m4,        [r3 + 12 * 16]             ; [26]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 12 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
-    pmaddwd     m2,        [r3 + 6 * 16]              ; [20]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m6,        [r3 + 6 * 16]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m2,        m6
-
-    palignr     m6,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
-    pmaddwd     m6,        [r3]                       ; [14]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m7,        [r3]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
-    pmaddwd     m7,        [r3 - 6 * 16]              ; [ 8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
-    pmaddwd     m3,        [r3 - 6 * 16]
-    paddd       m3,        [pd_16]
-    psrld       m3,        5
-    packusdw    m7,        m3
-
-    lea         r4,              [r1 * 3]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    mova        m4,        m0
-    pmaddwd     m4,        [r3 - 12 * 16]             ; [ 2]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-    mova        m2,        m5
-    pmaddwd     m2,        [r3 - 12 * 16]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    packusdw    m4,        m2
-
-    mova        m2,        m0
-    pmaddwd     m2,        [r3 + 14 * 16]             ; [28]
-    paddd       m2,        [pd_16]
-    psrld       m2,        5
-    mova        m6,        m5
-    pmaddwd     m6,        [r3 + 14 * 16]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m2,        m6
-
-    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
-    pmaddwd     m6,        [r3 + 8 * 16]              ; [22]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    palignr     m7,        m1, m5, 4                  ; [14 13 13 12 12 11 11 10]
-    pmaddwd     m7,        [r3 + 8 * 16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
-    pmaddwd     m7,        [r3 + 2 * 16]              ; [16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    palignr     m1,        m5, 8                      ; [15 14 14 13 13 12 12 11]
-    pmaddwd     m1,        [r3 + 2 * 16]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m7,        m1
-
-    lea         r0,              [r0 + r1 * 4]
-    movu        [r0],            m4
-    movu        [r0 + r1],       m2
-    movu        [r0 + r1 * 2],   m6
-    movu        [r0 + r4],       m7
-
-    RET
-
 ;-----------------------------------------------------------------------------------------
 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
@@ -8568,89 +4914,6 @@
     movu        [r0 + r4],       m7
     RET
 
-;-----------------------------------------------------------------------------
-; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang16_2, 3,4,5
-    cmp         r4m,                byte 34
-    cmove       r2,                 r3mp
-    add         r1,                 r1
-    lea         r3,                 [r1 * 3]
-    movu        m0,                 [r2 + 4]
-    movu        m1,                 [r2 + 20]
-    movu        m2,                 [r2 + 36]
-
-    movu        [r0],               m0
-    movu        [r0 + 16],          m1
-    palignr     m3,                 m1, m0, 2
-    palignr     m4,                 m2, m1, 2
-    movu        [r0 + r1],          m3
-    movu        [r0 + r1 + 16],     m4
-    palignr     m3,                 m1, m0, 4
-    palignr     m4,                 m2, m1, 4
-    movu        [r0 + r1 * 2],      m3
-    movu        [r0 + r1 * 2 + 16], m4
-    palignr     m3,                 m1, m0, 6
-    palignr     m4,                 m2, m1, 6
-    movu        [r0 + r3],          m3
-    movu        [r0 + r3 + 16],     m4
-
-    lea         r0,                 [r0 + r1 * 4]
-    palignr     m3,                 m1, m0, 8
-    palignr     m4,                 m2, m1, 8
-    movu        [r0],               m3
-    movu        [r0 + 16],          m4
-    palignr     m3,                 m1, m0, 10
-    palignr     m4,                 m2, m1, 10
-    movu        [r0 + r1],          m3
-    movu        [r0 + r1 + 16],     m4
-    palignr     m3,                 m1, m0, 12
-    palignr     m4,                 m2, m1, 12
-    movu        [r0 + r1 * 2],      m3
-    movu        [r0 + r1 * 2 + 16], m4
-    palignr     m3,                 m1, m0, 14
-    palignr     m4,                 m2, m1, 14
-    movu        [r0 + r3],          m3
-    movu        [r0 + r3 + 16],     m4
-
-    movu        m0,                 [r2 + 52]
-    lea         r0,                 [r0 + r1 * 4]
-    movu        [r0],               m1
-    movu        [r0 + 16],          m2
-    palignr     m3,                 m2, m1, 2
-    palignr     m4,                 m0, m2, 2
-    movu        [r0 + r1],          m3
-    movu        [r0 + r1 + 16],     m4
-    palignr     m3,                 m2, m1, 4
-    palignr     m4,                 m0, m2, 4
-    movu        [r0 + r1 * 2],      m3
-    movu        [r0 + r1 * 2 + 16], m4
-    palignr     m3,                 m2, m1, 6
-    palignr     m4,                 m0, m2, 6
-    movu        [r0 + r3],          m3
-    movu        [r0 + r3 + 16],     m4
-
-    lea         r0,                 [r0 + r1 * 4]
-    palignr     m3,                 m2, m1, 8
-    palignr     m4,                 m0, m2, 8
-    movu        [r0],               m3
-    movu        [r0 + 16],          m4
-    palignr     m3,                 m2, m1, 10
-    palignr     m4,                 m0, m2, 10
-    movu        [r0 + r1],          m3
-    movu        [r0 + r1 + 16],     m4
-    palignr     m3,                 m2, m1, 12
-    palignr     m4,                 m0, m2, 12
-    movu        [r0 + r1 * 2],      m3
-    movu        [r0 + r1 * 2 + 16], m4
-    palignr     m3,                 m2, m1, 14
-    palignr     m4,                 m0, m2, 14
-    movu        [r0 + r3],          m3
-    movu        [r0 + r3 + 16],     m4
-
-    RET
-
 %macro TRANSPOSE_STORE 6
     jnz         .skip%6
     punpckhwd   %5,        %1, %2
@@ -8870,38 +5133,6 @@
 
     ret
 
-cglobal intra_pred_ang16_3, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_3_33
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_3_33
-
-    RET
-
-cglobal intra_pred_ang16_33, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_3_33
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_3_33
-
-    RET
-
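
All the 16x16 wrappers deleted from here on follow one shape: mode N and its mirrored mode 36 - N share a single body (ang16_mode_3_33 above, ang16_mode_4_32 below, and so on); the wrapper only sets a flag in r6d that tells the shared code whether to transpose before storing, then calls it twice for the two 16x8 halves, bumping r2 by 16 bytes (eight reference samples) and moving r0 either eight rows down or 16 bytes to the right. A hedged C++ sketch of that dispatch shape, with a hypothetical 'half' callback standing in for the shared asm body:

    #include <cstdint>

    typedef uint16_t pixel;
    typedef void (*Ang16Half)(pixel* dst, intptr_t stride, const pixel* ref,
                              bool mirrored);

    // 'half' is purely illustrative; it plays the role of ang16_mode_N_(36-N).
    static void intra_pred_ang16_pair(Ang16Half half, pixel* dst, intptr_t stride,
                                      const pixel* ref, bool mirrored)
    {
        half(dst, stride, ref, mirrored);               // first 16x8 half
        pixel* dst2 = mirrored ? dst + 8                // 8 columns right (mode 36 - N)
                               : dst + 8 * stride;      // 8 rows down (mode N)
        half(dst2, stride, ref + 8, mirrored);          // next 8 reference samples
    }
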
 cglobal ang16_mode_4_32
     test        r6d,       r6d
     movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
@@ -9098,38 +5329,6 @@
 
     ret
 
-cglobal intra_pred_ang16_4, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 18 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_4_32
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_4_32
-
-    RET
-
-cglobal intra_pred_ang16_32, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 18 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_4_32
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_4_32
-
-    RET
-
 cglobal ang16_mode_5_31
     test        r6d,       r6d
     movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
@@ -9322,38 +5521,6 @@
 
     ret
 
-cglobal intra_pred_ang16_5, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_5_31
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_5_31
-
-    RET
-
-cglobal intra_pred_ang16_31, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_5_31
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_5_31
-
-    RET
-
 cglobal ang16_mode_6_30
     test        r6d,       r6d
     movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
@@ -9547,38 +5714,6 @@
 
     ret
 
-cglobal intra_pred_ang16_6, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 15 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_6_30
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_6_30
-
-    RET
-
-cglobal intra_pred_ang16_30, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 15 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_6_30
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_6_30
-
-    RET
-
 cglobal ang16_mode_7_29
     test        r6d,       r6d
     movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
@@ -9766,38 +5901,6 @@
 
     ret
 
-cglobal intra_pred_ang16_7, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 17 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_7_29
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_7_29
-
-    RET
-
-cglobal intra_pred_ang16_29, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 17 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_7_29
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_7_29
-
-    RET
-
 cglobal ang16_mode_8_28
     test        r6d,       r6d
     movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
@@ -9987,38 +6090,6 @@
 
     ret
 
-cglobal intra_pred_ang16_8, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 15 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_8_28
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_8_28
-
-    RET
-
-cglobal intra_pred_ang16_28, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 15 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_8_28
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_8_28
-
-    RET
-
 cglobal ang16_mode_9_27
     test        r6d,       r6d
     movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
@@ -10195,38 +6266,6 @@
 
     ret
 
-cglobal intra_pred_ang16_9, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_9_27
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_9_27
-
-    RET
-
-cglobal intra_pred_ang16_27, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_9_27
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_9_27
-
-    RET
-
 cglobal ang16_mode_11_25
     test        r6d,       r6d
     movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
@@ -10405,38 +6444,6 @@
 
     ret
 
-cglobal intra_pred_ang16_11, 3,7,8
-    xor         r6d,       r6d
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_11_25
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + r1 * 8]
-
-    call        ang16_mode_11_25
-
-    RET
-
-cglobal intra_pred_ang16_25, 4,7,8
-    xor         r6d,       r6d
-    inc         r6d
-    mov         r2,        r3
-    lea         r3,        [ang_table + 16 * 16]
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_11_25
-
-    lea         r2,        [r2 + 16]
-    lea         r0,        [r0 + 16]
-
-    call        ang16_mode_11_25
-
-    RET
-
 cglobal ang16_mode_12_24
     test        r3d,       r3d
     movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
@@ -10628,46 +6635,6 @@
 
     ret
 
-cglobal intra_pred_ang16_12, 4,7,8
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 16 * 16]
-    movu        m5,        [r3]
-    pshufb      m5,        [pw_ang8_12]
-    pinsrw      m5,        [r3 + 26], 5
-    xor         r3d,       r3d
-
-    call        ang16_mode_12_24
-
-    lea         r0,        [r0 + r1 * 8]
-    movu        m5,        [r2 + 2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_12_24
-
-    RET
-
-cglobal intra_pred_ang16_24, 4,7,8
-    xchg        r2,        r3
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 16 * 16]
-    movu        m5,        [r3]
-    pshufb      m5,        [pw_ang8_12]
-    pinsrw      m5,        [r3 + 26], 5
-    xor         r3d,       r3d
-    inc         r3d
-
-    call        ang16_mode_12_24
-
-    lea         r0,        [r0 + 16]
-    movu        m5,        [r2 + 2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_12_24
-
-    RET
-
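
Unlike the mode 3-9 pairs, the 12/24 wrappers deleted here (and the 13/23, 14/22 and 15/21 ones that follow) first assemble m5 from the opposite reference edge via pshufb: these are the negative-angle modes, which need a few side samples projected onto the main reference before the shared body can run. The shuffle masks (pw_ang8_12 and friends) presumably bake those projected indices in. For reference, the standard HEVC-style inverse-angle projection looks like this in scalar form (illustrative names, not x265 code):

    #include <cstdint>

    typedef uint16_t pixel;

    // Extend the main reference to negative indices by sampling the side edge
    // through the mode's inverse angle (HEVC uses invAngle ~= 8192 / |angle|);
    // sideRef[0] is the shared corner sample.
    static void extendMainRef(pixel* mainRef, const pixel* sideRef,
                              int count, int invAngle)
    {
        int invAngleSum = 128;
        for (int k = 1; k <= count; k++)
        {
            invAngleSum += invAngle;
            mainRef[-k] = sideRef[invAngleSum >> 8];
        }
    }
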
 cglobal ang16_mode_13_23
     test        r3d,       r3d
     movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
@@ -10867,52 +6834,6 @@
 
     ret
 
-cglobal intra_pred_ang16_13, 4,7,8
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 15 * 16]
-    movu        m5,        [r3]
-    pshufb      m5,        [pw_ang16_13]
-    movu        m6,        [r3 + 14]
-    pshufb      m6,        [pw_ang8_13]
-    pslldq      m6,        2
-    palignr     m5,        m6, 6
-    xor         r3d,       r3d
-
-    call        ang16_mode_13_23
-
-    lea         r0,        [r0 + r1 * 8]
-    movu        m5,        [r2 + 2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_13_23
-
-    RET
-
-cglobal intra_pred_ang16_23, 4,7,8
-    xchg        r2,        r3
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 15 * 16]
-    movu        m5,        [r3]
-    pshufb      m5,        [pw_ang16_13]
-    movu        m6,        [r3 + 14]
-    pshufb      m6,        [pw_ang8_13]
-    pslldq      m6,        2
-    palignr     m5,        m6, 6
-    xor         r3d,       r3d
-    inc         r3d
-
-    call        ang16_mode_13_23
-
-    lea         r0,        [r0 + 16]
-    movu        m5,        [r2 + 2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_13_23
-
-    RET
-
 cglobal ang16_mode_14_22
     test        r3d,       r3d
     movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
@@ -11120,50 +7041,6 @@
 
     ret
 
-cglobal intra_pred_ang16_14, 4,7,8
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 18 * 16]
-    movu        m6,        [r3]
-    pshufb      m6,        [pw_ang8_14]
-    movu        m5,        [r3 + 20]
-    pshufb      m5,        [pw_ang8_14]
-    punpckhqdq  m5,        m6
-    xor         r3d,       r3d
-
-    call        ang16_mode_14_22
-
-    lea         r0,        [r0 + r1 * 8]
-    movu        m5,        [r2 + 2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_14_22
-
-    RET
-
-cglobal intra_pred_ang16_22, 4,7,8
-    xchg        r2,        r3
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 18 * 16]
-    movu        m6,        [r3]
-    pshufb      m6,        [pw_ang8_14]
-    movu        m5,        [r3 + 20]
-    pshufb      m5,        [pw_ang8_14]
-    punpckhqdq  m5,        m6
-    xor         r3d,       r3d
-    inc         r3d
-
-    call        ang16_mode_14_22
-
-    lea         r0,        [r0 + 16]
-    movu        m5,        [r2 + 2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_14_22
-
-    RET
-
 cglobal ang16_mode_15_21
     test        r3d,       r3d
     movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
@@ -11380,50 +7257,6 @@
 
     ret
 
-cglobal intra_pred_ang16_15, 4,7,8
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 15 * 16]
-    movu        m6,        [r3 + 4]
-    pshufb      m6,        [pw_ang8_15]
-    movu        m5,        [r3 + 18]
-    pshufb      m5,        [pw_ang8_15]
-    punpckhqdq  m5,        m6
-    xor         r3d,       r3d
-
-    call        ang16_mode_15_21
-
-    lea         r0,        [r0 + r1 * 8]
-    movu        m5,        [r2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_15_21
-
-    RET
-
-cglobal intra_pred_ang16_21, 4,7,8
-    xchg        r2,        r3
-    add         r1,        r1
-    lea         r4,        [r1 * 3]
-    lea         r6,        [ang_table + 15 * 16]
-    movu        m6,        [r3 + 4]
-    pshufb      m6,        [pw_ang8_15]
-    movu        m5,        [r3 + 18]
-    pshufb      m5,        [pw_ang8_15]
-    punpckhqdq  m5,        m6
-    xor         r3d,       r3d
-    inc         r3d
-
-    call        ang16_mode_15_21
-
-    lea         r0,        [r0 + 16]
-    movu        m5,        [r2]
-    lea         r2,        [r2 + 16]
-
-    call        ang16_mode_15_21
-
-    RET
-
 cglobal ang16_mode_16_20
     test        r4d,       r4d
     lea         r4,        [r1 * 3]
@@ -11651,57 +7484,6 @@
 
     ret
 
-cglobal intra_pred_ang16_16, 4,7,8,0-(1*mmsize)
-    add         r1,        r1
-    lea         r6,        [ang_table + 13 * 16]
-    movu        m6,        [r3 + 4]
-    pshufb      m6,        [pw_ang16_16]
-    movu        m5,        [r3 + 16]
-    pshufb      m5,        [pw_ang16_16]
-    punpckhqdq  m5,        m6
-    mov         [rsp],     r3
-    lea         r3,        [r3 + 24]
-    xor         r4,        r4
-
-    call        ang16_mode_16_20
-
-    lea         r0,        [r0 + r1 * 8]
-    mov         r3,        [rsp]
-    movu        m5,        [r2]
-    lea         r2,        [r2 + 16]
-    xor         r4,        r4
-
-    call        ang16_mode_16_20
-
-    RET
-
-cglobal intra_pred_ang16_20, 4,7,8,0-(1*mmsize)
-    xchg        r2,        r3
-    add         r1,        r1
-    lea         r6,        [ang_table + 13 * 16]
-    movu        m6,        [r3 + 4]
-    pshufb      m6,        [pw_ang16_16]
-    movu        m5,        [r3 + 16]
-    pshufb      m5,        [pw_ang16_16]
-    punpckhqdq  m5,        m6
-    mov         [rsp],     r3
-    lea         r3,        [r3 + 24]
-    xor         r4,        r4
-    inc         r4
-
-    call        ang16_mode_16_20
-
-    lea         r0,        [r0 + 16]
-    mov         r3,        [rsp]
-    movu        m5,        [r2]
-    lea         r2,        [r2 + 16]
-    xor         r4,        r4
-    inc         r4
-
-    call        ang16_mode_16_20
-
-    RET
-
 cglobal ang16_mode_17_19
     test        r4d,       r4d
     lea         r4,        [r1 * 3]
@@ -11937,323 +7719,6 @@
 
     ret
 
-cglobal intra_pred_ang16_17, 4,7,8,0-(1*mmsize)
-    add         r1,        r1
-    lea         r6,        [ang_table + 16 * 16]
-    movu        m6,        [r3 + 2]
-    pshufb      m6,        [pw_ang16_16]
-    movu        m5,        [r3 + 12]
-    pshufb      m5,        [pw_ang16_16]
-    punpckhqdq  m5,        m6
-    mov         [rsp],     r3
-    lea         r3,        [r3 + 20]
-    xor         r4,        r4
-
-    call        ang16_mode_17_19
-
-    lea         r0,        [r0 + r1 * 8]
-    mov         r3,        [rsp]
-    movu        m5,        [r2]
-    lea         r2,        [r2 + 16]
-    xor         r4,        r4
-
-    call        ang16_mode_17_19
-
-    RET
-
-cglobal intra_pred_ang16_19, 4,7,8,0-(1*mmsize)
-    xchg        r2,        r3
-    add         r1,        r1
-    lea         r6,        [ang_table + 16 * 16]
-    movu        m6,        [r3 + 2]
-    pshufb      m6,        [pw_ang16_16]
-    movu        m5,        [r3 + 12]
-    pshufb      m5,        [pw_ang16_16]
-    punpckhqdq  m5,        m6
-    mov         [rsp],     r3
-    lea         r3,        [r3 + 20]
-    xor         r4,        r4
-    inc         r4
-
-    call        ang16_mode_17_19
-
-    lea         r0,        [r0 + 16]
-    mov         r3,        [rsp]
-    movu        m5,        [r2]
-    lea         r2,        [r2 + 16]
-    xor         r4,        r4
-    inc         r4
-
-    call        ang16_mode_17_19
-
-    RET
-
-cglobal intra_pred_ang16_18, 4,5,4
-    add         r1,                  r1
-    lea         r4,                  [r1 * 3]
-    movu        m1,                  [r3]
-    movu        m3,                  [r3 + 16]
-    movu        m0,                  [r2 + 2]
-    pshufb      m0,                  [pw_swap16]
-    movu        [r0],                m1
-    movu        [r0 + 16],           m3
-    palignr     m2,                  m1, m0, 14
-    movu        [r0 + r1],           m2
-    palignr     m2,                  m3, m1, 14
-    movu        [r0 + r1 + 16],      m2
-    palignr     m2,                  m1, m0, 12
-    movu        [r0 + r1 * 2],       m2
-    palignr     m2,                  m3, m1, 12
-    movu        [r0 + r1 * 2 + 16],  m2
-    palignr     m2,                  m1, m0, 10
-    movu        [r0 + r4],           m2
-    palignr     m2,                  m3, m1, 10
-    movu        [r0 + r4 + 16],      m2
-
-    lea         r0,                  [r0 + r1 * 4]
-    palignr     m2,                  m1, m0, 8
-    movu        [r0],                m2
-    palignr     m2,                  m3, m1, 8
-    movu        [r0 + 16],           m2
-    palignr     m2,                  m1, m0, 6
-    movu        [r0 + r1],           m2
-    palignr     m2,                  m3, m1, 6
-    movu        [r0 + r1 + 16],      m2
-    palignr     m2,                  m1, m0, 4
-    movu        [r0 + r1 * 2],       m2
-    palignr     m2,                  m3, m1, 4
-    movu        [r0 + r1 * 2 + 16],  m2
-    palignr     m2,                  m1, m0, 2
-    movu        [r0 + r4],           m2
-    palignr     m3,                  m1, 2
-    movu        [r0 + r4 + 16],      m3
-
-    lea         r0,                  [r0 + r1 * 4]
-    movu        [r0],                m0
-    movu        [r0 + 16],           m1
-    movu        m3,                  [r2 + 18]
-    pshufb      m3,                  [pw_swap16]
-    palignr     m2,                  m0, m3, 14
-    movu        [r0 + r1],           m2
-    palignr     m2,                  m1, m0, 14
-    movu        [r0 + r1 + 16],      m2
-    palignr     m2,                  m0, m3, 12
-    movu        [r0 + r1 * 2],       m2
-    palignr     m2,                  m1, m0, 12
-    movu        [r0 + r1 * 2 + 16],  m2
-    palignr     m2,                  m0, m3, 10
-    movu        [r0 + r4],           m2
-    palignr     m2,                  m1, m0, 10
-    movu        [r0 + r4 + 16],      m2
-
-    lea         r0,                  [r0 + r1 * 4]
-    palignr     m2,                  m0, m3, 8
-    movu        [r0],                m2
-    palignr     m2,                  m1, m0, 8
-    movu        [r0 + 16],           m2
-    palignr     m2,                  m0, m3, 6
-    movu        [r0 + r1],           m2
-    palignr     m2,                  m1, m0, 6
-    movu        [r0 + r1 + 16],      m2
-    palignr     m2,                  m0, m3, 4
-    movu        [r0 + r1 * 2],       m2
-    palignr     m2,                  m1, m0, 4
-    movu        [r0 + r1 * 2 + 16],  m2
-    palignr     m2,                  m0, m3, 2
-    movu        [r0 + r4],           m2
-    palignr     m1,                  m0, 2
-    movu        [r0 + r4 + 16],      m1
-
-    RET
-
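
intra_pred_ang16_18, removed above, handles the exact 45-degree diagonal that uses both edges: the left samples are reversed with pw_swap16 so each output row can be produced by a palignr of (above : reversed left), shifting one more sample in per row, and row 0 is simply the above reference starting at the corner. In scalar terms this amounts to the following sketch, with illustrative pointer conventions:

    #include <cstdint>

    typedef uint16_t pixel;

    // 'above' points at the top-left corner (above[1..] is the row above the
    // block), 'left' at the first sample below the corner.
    static void ang16_mode18_ref(pixel* dst, intptr_t stride,
                                 const pixel* above, const pixel* left)
    {
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                dst[y * stride + x] = (x >= y) ? above[x - y] : left[y - x - 1];
    }
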
-cglobal intra_pred_ang16_10, 4,5,4
-    movu        m1,                     [r2 + 2]            ; [8 7 6 5 4 3 2 1]
-    movu        m3,                     [r2 + 18]           ; [16 15 14 13 12 11 10 9]
-    pshufb      m0,                     m1, [pw_unpackwdq]  ; [1 1 1 1 1 1 1 1]
-    add         r1,                     r1
-    lea         r4,                     [r1 * 3]
-
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [2 2 2 2 2 2 2 2]
-    movu        [r0 + r1],              m2
-    movu        [r0 + r1 + 16],         m2
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [3 3 3 3 3 3 3 3]
-    movu        [r0 + r1 * 2],          m2
-    movu        [r0 + r1 * 2 + 16],     m2
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [4 4 4 4 4 4 4 4]
-    movu        [r0 + r4],              m2
-    movu        [r0 + r4 + 16],         m2
-
-    lea         r2,                     [r0 + r1 *4]
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [5 5 5 5 5 5 5 5]
-    movu        [r2],                   m2
-    movu        [r2 + 16],              m2
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [6 6 6 6 6 6 6 6]
-    movu        [r2 + r1],              m2
-    movu        [r2 + r1 + 16],         m2
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [7 7 7 7 7 7 7 7]
-    movu        [r2 + r1 * 2],          m2
-    movu        [r2 + r1 * 2 + 16],     m2
-    psrldq      m1,                     2
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [8 8 8 8 8 8 8 8]
-    movu        [r2 + r4],              m2
-    movu        [r2 + r4 + 16],         m2
-
-    lea         r2,                     [r2 + r1 *4]
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [9 9 9 9 9 9 9 9]
-    movu        [r2],                   m2
-    movu        [r2 + 16],              m2
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [10 10 10 10 10 10 10 10]
-    movu        [r2 + r1],              m2
-    movu        [r2 + r1 + 16],         m2
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [11 11 11 11 11 11 11 11]
-    movu        [r2 + r1 * 2],          m2
-    movu        [r2 + r1 * 2 + 16],     m2
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [12 12 12 12 12 12 12 12]
-    movu        [r2 + r4],              m2
-    movu        [r2 + r4 + 16],         m2
-
-    lea         r2,                     [r2 + r1 *4]
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [13 13 13 13 13 13 13 13]
-    movu        [r2],                   m2
-    movu        [r2 + 16],              m2
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [14 14 14 14 14 14 14 14]
-    movu        [r2 + r1],              m2
-    movu        [r2 + r1 + 16],         m2
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [15 15 15 15 15 15 15 15]
-    movu        [r2 + r1 * 2],          m2
-    movu        [r2 + r1 * 2 + 16],     m2
-    psrldq      m3,                     2
-    pshufb      m2,                     m3, [pw_unpackwdq]  ; [16 16 16 16 16 16 16 16]
-    movu        [r2 + r4],              m2
-    movu        [r2 + r4 + 16],         m2
-    mova        m3,                     m0
-
-    cmp         r5m,                    byte 0
-    jz         .quit
-
-    ; filter
-
-    movh        m1,                     [r3]                ; [3 2 1 0]
-    pshufb      m2,                     m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
-    movu        m1,                     [r3 + 2]            ; [8 7 6 5 4 3 2 1]
-    movu        m3,                     [r3 + 18]           ; [16 15 14 13 12 11 10 9]
-    psubw       m1,                     m2
-    psubw       m3,                     m2
-    psraw       m1,                     1
-    psraw       m3,                     1
-    paddw       m3,                     m0
-    paddw       m0,                     m1
-    pxor        m1,                     m1
-    pmaxsw      m0,                     m1
-    pminsw      m0,                     [pw_1023]
-    pmaxsw      m3,                     m1
-    pminsw      m3,                     [pw_1023]
-.quit:
-    movu        [r0],                   m0
-    movu        [r0 + 16],              m3
-
-    RET
-
-cglobal intra_pred_ang16_26, 4,5,4
-    movu        m0,                 [r3 + 2]            ; [8 7 6 5 4 3 2 1]
-    movu        m3,                 [r3 + 18]           ; [16 15 14 13 12 11 10 9]
-    add         r1,                 r1
-    lea         r4,                 [r1 * 3]
-
-    movu        [r0],               m0
-    movu        [r0 + 16],          m3
-    movu        [r0 + r1],          m0
-    movu        [r0 + r1 + 16],     m3
-    movu        [r0 + r1 * 2],      m0
-    movu        [r0 + r1 * 2 + 16], m3
-    movu        [r0 + r4],          m0
-    movu        [r0 + r4 + 16],     m3
-
-    lea         r3,                 [r0 + r1 *4]
-    movu        [r3],               m0
-    movu        [r3 + 16],          m3
-    movu        [r3 + r1],          m0
-    movu        [r3 + r1 + 16],     m3
-    movu        [r3 + r1 * 2],      m0
-    movu        [r3 + r1 * 2 + 16], m3
-    movu        [r3 + r4],          m0
-    movu        [r3 + r4 + 16],     m3
-
-    lea         r3,                 [r3 + r1 *4]
-    movu        [r3],               m0
-    movu        [r3 + 16],          m3
-    movu        [r3 + r1],          m0
-    movu        [r3 + r1 + 16],     m3
-    movu        [r3 + r1 * 2],      m0
-    movu        [r3 + r1 * 2 + 16], m3
-    movu        [r3 + r4],          m0
-    movu        [r3 + r4 + 16],     m3
-
-    lea         r3,                 [r3 + r1 *4]
-    movu        [r3],               m0
-    movu        [r3 + 16],          m3
-    movu        [r3 + r1],          m0
-    movu        [r3 + r1 + 16],     m3
-    movu        [r3 + r1 * 2],      m0
-    movu        [r3 + r1 * 2 + 16], m3
-    movu        [r3 + r4],          m0
-    movu        [r3 + r4 + 16],     m3
-
-    cmp         r5m,                byte 0
-    jz         .quit
-
-    ; filter
-
-    pshufb      m0,                 [pw_unpackwdq]
-    movh        m1,                 [r2]                ; [3 2 1 0]
-    pshufb      m2,                 m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
-    movu        m1,                 [r2 + 2]            ; [8 7 6 5 4 3 2 1]
-    movu        m3,                 [r2 + 18]           ; [16 15 14 13 12 11 10 9]
-    psubw       m1,                 m2
-    psubw       m3,                 m2
-    psraw       m1,                 1
-    psraw       m3,                 1
-    paddw       m3,                 m0
-    paddw       m0,                 m1
-    pxor        m1,                 m1
-    pmaxsw      m0,                 m1
-    pminsw      m0,                 [pw_1023]
-    pmaxsw      m3,                 m1
-    pminsw      m3,                 [pw_1023]
-    pextrw      [r0],               m0, 0
-    pextrw      [r0 + r1],          m0, 1
-    pextrw      [r0 + r1 * 2],      m0, 2
-    pextrw      [r0 + r4],          m0, 3
-    lea         r0,                 [r0 + r1 * 4]
-    pextrw      [r0],               m0, 4
-    pextrw      [r0 + r1],          m0, 5
-    pextrw      [r0 + r1 * 2],      m0, 6
-    pextrw      [r0 + r4],          m0, 7
-    lea         r0,                 [r0 + r1 * 4]
-    pextrw      [r0],               m3, 0
-    pextrw      [r0 + r1],          m3, 1
-    pextrw      [r0 + r1 * 2],      m3, 2
-    pextrw      [r0 + r4],          m3, 3
-    pextrw      [r3],               m3, 4
-    pextrw      [r3 + r1],          m3, 5
-    pextrw      [r3 + r1 * 2],      m3, 6
-    pextrw      [r3 + r4],          m3, 7
-
-.quit:
-    RET
-
 ;------------------------------------------------------------------------------------------
 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;------------------------------------------------------------------------------------------
@@ -13344,25 +8809,6 @@
     movu            [r0 + r4 + 48], m5
     lea             r0,    [r0 + r1 * 4]
 %endmacro
-;--------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_2_34(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;--------------------------------------------------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang32_2, 3,6,6
-    cmp             r4m, byte 34
-    cmove           r2, r3mp
-
-    add             r1, r1
-    lea             r3, [r1 * 2]
-    lea             r4, [r1 * 3]
-    mov             r5, 2
-
-.loop:
-    MODE_2_34
-    add             r2, 32
-    dec             r5
-    jnz             .loop
-    RET
 
 %macro TRANSPOSE_STORE_8x8 6
   %if %2 == 1
@@ -13600,11 +9046,2402 @@
 
     TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
 %endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
+
+%macro MODE_4_32 1
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
+
+    pmaddwd     m4,        m0, [r3 + 5 * 16]          ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    palignr     m5,        m2, m0, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m1,        m5, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        [r3 + 15 * 16]             ; [31]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m6,        m2, m0, 8
+    pmaddwd     m6,        [r3 + 4 * 16]              ; [ 20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    palignr     m1,        m2, m0, 12
+    pmaddwd     m6,        m1, [r3 - 7 * 16]          ; [ 9]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m2, [r3 + 3 * 16]          ; [19]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    palignr     m7,        m3, m2, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 + 13 * 16]         ; [29]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m0,        [r2 + 34]                  ; [24 23 22 21 20 19 18 17]
+
+    palignr     m2,        m0, m3, 2                  ; [17 16 15 14 13 12 11 10]
+    palignr     m1,        m0, m3, 4                  ; [18 17 16 15 14 13 12 11]
+    punpckhwd   m3,        m2, m1                     ; [18 17 17 16 16 15 15 14]
+    punpcklwd   m2,        m1                         ; [14 13 13 12 12 11 11 10]
+
+    palignr     m1,        m2, m7, 4                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 +  2 * 16]             ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    palignr     m5,        m2, m7, 8
+    mova        m6,        m5
+    pmaddwd     m5,        [r3 - 9 * 16]              ; [07]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        [r3 + 12 * 16]             ; [28]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    palignr     m6,        m2, m7, 12
+    pmaddwd     m6,        [r3 +      16]             ; [17]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m2, [r3 - 10 * 16]         ; [06]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m2, [r3 + 11 * 16]         ; [27]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    palignr     m7,        m3, m2, 4
+    pmaddwd     m7,        [r3]                       ; [16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m1,        m7
+    mova        m7,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    palignr     m0,        m3, m2, 8
+    pmaddwd     m4,        m0, [r3 - 11 * 16]         ; [5]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m0, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    palignr     m5,        m3, m2, 12
+    pmaddwd     m5,        [r3 - 16]                  ; [15]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    pmaddwd     m6,        m3, [r3 + 9 * 16]          ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m0,        [r2 + 50]                  ; [32 31 30 29 28 27 26 25]
+    palignr     m2,        m0, m7, 2                  ; [25 24 23 22 21 20 19 18]
+    palignr     m1,        m0, m7, 4                  ; [26 25 24 23 22 21 20 19]
+    punpckhwd   m7,        m2, m1                     ; [26 25 25 24 24 23 23 22]
+    punpcklwd   m2,        m1                         ; [22 21 21 20 20 19 19 18]
+
+    palignr     m1,        m2, m3, 4
+    pmaddwd     m1,        [r3 - 2 * 16]              ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m1,        m2, m3, 8
+    mova        m0,        m1
+    pmaddwd     m1,        [r3 - 13 * 16]             ; [3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        [r3 + 8 * 16]              ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    palignr     m4,        m2, m3, 12
+    pmaddwd     m4,        [r3 - 3 * 16]              ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m2, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m2, [r3 + 7 * 16]          ; [23]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m6,        m7, m2, 4
+    pmaddwd     m6,        [r3 - 4 * 16]              ; [12]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    palignr     m1,        m7, m2, 8
+    pmaddwd     m6,        m1, [r3 - 15 * 16]         ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        [r3 + 6 * 16]              ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m1,        m7, m2, 12
+    pmaddwd     m1,        [r3 - 5 * 16]              ; [11]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 44]                  ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_5_31 1
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
+
+    pmaddwd     m4,        m0, [r3 + 16]              ; [17]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    palignr     m1,        m2, m0, 4
+    mova        m5,        m1
+    pmaddwd     m1,        [r3 - 14 * 16]             ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        [r3 + 3 * 16]              ; [19]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m6,        m2, m0, 8
+    mova        m1,        m6
+    pmaddwd     m6,        [r3 - 12 * 16]             ; [4]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m1, [r3 + 5 * 16]          ; [21]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    palignr     m1,        m2, m0, 12
+    mova        m7,        m1
+    pmaddwd     m7,        [r3 - 10 * 16]             ; [6]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pmaddwd     m1,        [r3 + 7 * 16]              ; [23]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m7,        m2, [r3 - 8 * 16]          ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m1,        m7
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m2, [r3 + 9 * 16]          ; [25]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    palignr     m7,        m3, m2, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        m7, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m7, [r3 + 11 * 16]         ; [27]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m0,        [r2 + 34]                  ; [24 23 22 21 20 19 18 17]
+    palignr     m2,        m0, m3, 2                  ; [17 16 15 14 13 12 11 10]
+    palignr     m1,        m0, m3, 4                  ; [18 17 16 15 14 13 12 11]
+    punpckhwd   m3,        m2, m1                     ; [18 17 17 16 16 15 15 14]
+    punpcklwd   m2,        m1                         ; [14 13 13 12 12 11 11 10]
+
+    palignr     m6,        m2, m7, 4
+    pmaddwd     m1,        m6, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [29]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    palignr     m1,        m2, m7, 8
+    mova        m0,        m1
+    pmaddwd     m1,        [r3 - 2 * 16]              ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m0, [r3 + 15 * 16]         ; [31]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    palignr     m0,        m2, m7, 12
+    pmaddwd     m0,        [r3]                       ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m2, [r3 - 15 * 16]         ; [1]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m2, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    palignr     m1,        m3, m2, 4
+    pmaddwd     m5,        m1, [r3 - 13 * 16]         ; [3]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m1,        [r3 + 4 * 16]              ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    palignr     m1,        m3, m2, 8
+    pmaddwd     m6,        m1, [r3 - 11 * 16]         ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        [r3 + 6 * 16]              ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m7,        m3, m2, 12
+    pmaddwd     m1,        m7, [r3 - 9 * 16]          ; [7]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m7,        [r3 + 8 * 16]              ; [24]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m1,        m7
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 7 * 16]          ; [9]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m0,        [r2 + 36]                  ; [25 24 23 22 21 20 19 18]
+    palignr     m1,        m0, 2                      ; [x 25 24 23 22 21 20 19]
+    punpcklwd   m0,        m1                         ; [22 21 21 20 20 19 19 18]
+
+    palignr     m1,        m0, m3, 4
+    pmaddwd     m5,        m1, [r3 - 5 * 16]          ; [11]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m1,        [r3 + 12 * 16]             ; [28]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    palignr     m1,        m0, m3, 8
+    pmaddwd     m6,        m1, [r3 - 3 * 16]          ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m1,        m0, m3, 12
+    pmaddwd     m1,        [r3 - 16]                  ; [15]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 36]                  ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
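+
+; Mode pair 6/30: same layout with displacement +13; the final [00] row skips
+; the multiply and copies the whole-sample pixels ref[14..17] directly via
+; movhps from [r2 + 28].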
+
+%macro MODE_6_30 1
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
+
+    pmaddwd     m4,        m0, [r3 - 3 * 16]          ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m0, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    palignr     m1,        m2, m0, 4
+    pmaddwd     m5,        m1, [r3 - 9 * 16]          ; [7]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m1,        [r3 + 4 * 16]              ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    palignr     m1,        m2, m0, 8
+    pmaddwd     m6,        m1, [r3 - 15 * 16]         ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m7,        m1, [r3 - 2 * 16]          ; [14]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pmaddwd     m1,        [r3 + 11 * 16]             ; [27]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    palignr     m7,        m2, m0, 12
+    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 +  5 * 16]         ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m2, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m2, [r3 - 16]              ; [15]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m2, [r3 + 12 * 16]         ; [28]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    palignr     m7,        m3, m2, 4
+    pmaddwd     m6,        m7, [r3 - 7 * 16]          ; [9]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m7, [r3 + 6 * 16]          ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m0,        [r2 + 34]                  ; [24 23 22 21 20 19 18 17]
+    palignr     m2,        m0, m3, 2                  ; [17 16 15 14 13 12 11 10]
+    palignr     m1,        m0, m3, 4                  ; [18 17 16 15 14 13 12 11]
+    punpckhwd   m3,        m2, m1                     ; [18 17 17 16 16 15 15 14]
+    punpcklwd   m2,        m1                         ; [14 13 13 12 12 11 11 10]
+
+    palignr     m0,        m2, m7, 4
+    pmaddwd     m1,        m0, [r3 - 13 * 16]         ; [3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        [r3]                       ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    palignr     m4,        m2, m7, 4
+    pmaddwd     m4,        [r3 +  13 * 16]            ; [29]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    palignr     m5,        m2, m7, 8
+    pmaddwd     m1,        m5, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        [r3 + 7 * 16]              ; [23]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m1,        m2, m7, 12
+    pmaddwd     m6,        m1, [r3 - 12 * 16]         ; [4]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m1, [r3 + 16]              ; [17]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m2, [r3 - 5 * 16]          ; [11]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m2, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    palignr     m5,        m3, m2, 4
+    pmaddwd     m4,        m5, [r3 - 11 * 16]         ; [5]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m5, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        [r3 + 15 * 16]             ; [31]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m6,        m3, m2, 8
+    pmaddwd     m1,        m6, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    pmaddwd     m6,        [r3 + 9 * 16]              ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    palignr     m1,        m3, m2, 12
+    pmaddwd     m0,        m1, [r3 - 10 * 16]         ; [6]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m6,        m0
+
+    pmaddwd     m1,        [r3 + 3 * 16]              ; [19]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 28]                  ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
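+
+; Mode pair 7/29, displacement +9: the strip only ever touches ref[1..13], so
+; the initial movd fetches just ref[9] and the full row at [r2 + 18] is
+; reloaded later, once the window actually reaches it.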
+
+%macro MODE_7_29 1
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movd        m3,        [r2 + 18]                  ; [10 9] - only sample 9 is consumed by the palignr below
+    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
+
+    pmaddwd     m4,        m0, [r3 - 7 * 16]          ; [9]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m0, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m0, [r3 + 11 * 16]         ; [27]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m1,        m2, m0, 4
+    pmaddwd     m6,        m1, [r3 - 12 * 16]         ; [4]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m1, [r3 - 3 * 16]          ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m7,        m1, [r3 + 6 * 16]          ; [22]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pmaddwd     m1,        [r3 + 15 * 16]             ; [31]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    mova        m3,        m0
+    palignr     m7,        m2, m0, 8
+    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 + 16]              ; [17]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m7, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    palignr     m1,        m2, m3, 12
+    pmaddwd     m5,        m1, [r3 - 13 * 16]         ; [3]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m1, [r3 - 4 * 16]          ; [12]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m1, [r3 + 5 * 16]          ; [21]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m2, [r3 - 9 * 16]          ; [7]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m2, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m2, [r3 + 9 * 16]          ; [25]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m7,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m1,        m7, 2                      ; [x 16 15 14 13 12 11 10]
+    punpcklwd   m7,        m1                         ; [13 12 12 11 11 10 10 9]
+
+    palignr     m6,        m7, m2, 4
+    pmaddwd     m1,        m6, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m6, [r3 - 5 * 16]          ; [11]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m6, [r3 + 4 * 16]          ; [20]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [29]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    palignr     m0,        m7, m2, 8
+    pmaddwd     m1,        m0, [r3 - 10 * 16]         ; [6]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m0, [r3 - 16]              ; [15]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        [r3 + 8 * 16]              ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    palignr     m0,        m7, m2, 12
+    pmaddwd     m4,        m0, [r3 - 15 * 16]         ; [1]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m0, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m0, [r3 + 3 * 16]          ; [19]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        [r3 + 12 * 16]             ; [28]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m7, [r3 - 11 * 16]         ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m0,        m7, [r3 - 2 * 16]          ; [14]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m6,        m0
+
+    pmaddwd     m1,        m7, [r3 + 7 * 16]          ; [23]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 20]                  ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
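+
+; Mode pair 8/28, displacement +5: everything fits in ref[1..9], so no
+; further reference loads are needed after the two at the top.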
+
+%macro MODE_8_28 1
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movd        m3,        [r2 + 18]                  ; [10 9] - only sample 9 is consumed by the palignr below
+    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
+
+    pmaddwd     m4,        m0, [r3 - 11 * 16]         ; [5]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m0, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m0, [r3 - 16]              ; [15]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m0, [r3 + 4 * 16]          ; [20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m0, [r3 + 9 * 16]          ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m0, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m7,        m2, m0, 4
+    pmaddwd     m1,        m7, [r3 - 13 * 16]         ; [3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    mova        m3,        m0
+    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 - 3 * 16]          ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m7, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m7, [r3 + 7 * 16]          ; [23]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m7, [r3 + 12 * 16]         ; [28]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    palignr     m7,        m2, m3, 8
+    pmaddwd     m6,        m7, [r3 - 15 * 16]         ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m7, [r3 - 10 * 16]         ; [6]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m7, [r3 - 5 * 16]          ; [11]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m7, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 + 5 * 16]          ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m7, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m7, [r3 + 15 * 16]         ; [31]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    palignr     m7,        m2, m3, 12
+    pmaddwd     m0,        m7, [r3 - 12 * 16]         ; [4]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m7, [r3 - 7 * 16]          ; [9]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m7, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m7, [r3 + 3 * 16]          ; [19]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m7, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 + 13 * 16]         ; [29]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m2, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m2, [r3 - 9 * 16]          ; [7]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m2, [r3 - 4 * 16]          ; [12]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m2, [r3 + 16]              ; [17]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m0,        m2, [r3 + 6 * 16]          ; [22]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m6,        m0
+
+    pmaddwd     m1,        m2, [r3 + 11 * 16]         ; [27]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 12]                  ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
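+
+; Mode pair 9/27, displacement +2: the fractions simply step 2, 4, ..., 30, 0
+; twice, with the reference window advancing by one sample (palignr by 4
+; bytes) halfway through.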
+
+%macro MODE_9_27 1
+    movu        m3,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    palignr     m1,        m3, 2                      ; [9 8 7 6 5 4 3 2]
+    punpckhwd   m2,        m3, m1                     ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m3,        m1                         ; [5 4 4 3 3 2 2 1]
+
+    pmaddwd     m4,        m3, [r3 - 14 * 16]         ; [2]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 10 * 16]         ; [6]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [8]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 4]                   ; [00]
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    palignr     m7,        m2, m3, 4
+    pmaddwd     m4,        m7, [r3 - 14 * 16]         ; [2]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m7, [r3 - 12 * 16]         ; [4]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m7, [r3 - 10 * 16]         ; [6]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m7, [r3 - 6 * 16]          ; [10]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m7, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m7, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m7, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m7, [r3 + 2 * 16]          ; [18]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m7, [r3 + 4 * 16]          ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m7, [r3 + 6 * 16]          ; [22]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m7, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m7, [r3 + 10 * 16]         ; [26]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m0,        m7, [r3 + 12 * 16]         ; [28]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m6,        m0
+
+    pmaddwd     m7,        [r3 + 14 * 16]             ; [30]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m7,        m7
+    movhps      m7,        [r2 + 6]                   ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
+%endmacro
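+
+; Mode pair 11/25 is the first negative displacement (-2): rows 0-15 read
+; from [r2 + 2] and rows 16-31 drop back one sample to [r2], where the caller
+; is expected to have stored the sample projected from the opposite reference
+; array (the "16" in the load comment), as negative HEVC angles require.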
+
+%macro MODE_11_25 1
+    movu        m3,        [r2 + 2]                   ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        [pw_punpcklwd]             ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [8]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 10 * 16]         ; [6]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 2]                   ; [00]
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2]                       ; [6 5 4 3 2 1 0 16]
+    pshufb      m3,        [pw_punpcklwd]             ; [3 2 2 1 1 0 0 16]
+
+    pmaddwd     m4,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [8]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 10 * 16]         ; [6]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
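+
+; Mode pair 12/24, displacement -5. From here on the macros rely on a
+; word-duplicating shuffle mask supplied by the caller in m2 instead of
+; palignr, and the load offset walks backwards from [r2 + 8] to [r2] because
+; four projected samples now sit below refMain[0].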
+
+%macro MODE_12_24 1
+    movu        m3,        [r2 + 8]                   ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 + 11 * 16]         ; [27]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 16]              ; [17]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 9 * 16]          ; [7]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [2]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 6]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 13 * 16]         ; [29]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 + 3 * 16]          ; [19]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 7 * 16]          ; [9]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [4]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 4]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 15 * 16]         ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 + 5 * 16]          ; [21]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 5 * 16]          ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [6]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 15 * 16]         ; [1]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 2]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m3, [r3 + 7 * 16]          ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 3 * 16]          ; [13]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [8]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 13 * 16]         ; [3]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 9 * 16]          ; [25]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 16]              ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 11 * 16]         ; [5]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
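+
+; Mode pair 13/23, displacement -9: eight projected samples precede
+; refMain[0], hence the [r2 + 16] starting offset.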
+
+%macro MODE_13_23 1
+    movu        m3,        [r2 + 16]                  ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 + 7 * 16]          ; [23]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 11 * 16]         ; [05]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 14]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 + 3 * 16]          ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 15 * 16]         ; [01]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 12]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 16]              ; [15]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 10]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 13 * 16]         ; [29]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 5 * 16]          ; [11]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 8]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 9 * 16]          ; [25]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 9 * 16]          ; [07]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 6]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 5 * 16]          ; [21]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m3, [r3 - 13 * 16]         ; [03]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 4]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 + 16]              ; [17]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 2]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 15 * 16]         ; [31]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 3 * 16]          ; [13]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 11 * 16]         ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 7 * 16]          ; [09]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
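+
+; Mode pair 14/22, displacement -13 (twelve projected samples, so the walk
+; starts at [r2 + 24]).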
+
+%macro MODE_14_22 1
+    movu        m3,        [r2 + 24]                  ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 + 3 * 16]          ; [19]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 22]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 9 * 16]          ; [25]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 20]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 15 * 16]         ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 11 * 16]         ; [05]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 18]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 5 * 16]          ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 16]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 + 16]              ; [17]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 14]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 7 * 16]          ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 12]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 13 * 16]         ; [29]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 13 * 16]         ; [03]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 10]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 7 * 16]          ; [09]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 8]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m3, [r3 - 16]              ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 6]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 5 * 16]          ; [21]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 4]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 11 * 16]         ; [27]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 15 * 16]         ; [01]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 2]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 9 * 16]          ; [07]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 3 * 16]          ; [13]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
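+
+; Mode pair 15/21, displacement -17 (sixteen projected samples, start offset
+; [r2 + 32]).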
+
+%macro MODE_15_21 1
+    movu        m3,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 - 16]              ; [15]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 30]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 3 * 16]          ; [13]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 28]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 5 * 16]          ; [11]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 26]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 7 * 16]          ; [09]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 24]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 9 * 16]          ; [07]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 22]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 11 * 16]         ; [05]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 20]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    pmaddwd     m6,        m3, [r3 - 13 * 16]         ; [03]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 18]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 15 * 16]         ; [01]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 16]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 14]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 15 * 16]         ; [31]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 12]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 13 * 16]         ; [29]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m0,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    movu        m3,        [r2 + 10]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 11 * 16]         ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 8]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 9 * 16]          ; [25]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 6]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 7 * 16]          ; [23]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 4]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 5 * 16]          ; [21]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 2]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 3 * 16]          ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 16]              ; [17]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
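+
+; Mode pair 16/20, displacement -21 (twenty projected samples, start offset
+; [r2 + 40]).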
+
+%macro MODE_16_20 1
+    movu        m3,        [r2 + 40]                  ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 - 5 * 16]          ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 38]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 15 * 16]         ; [01]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 36]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 34]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 7 * 16]          ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 32]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 - 3 * 16]          ; [13]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 30]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    pmaddwd     m4,        m3, [r3 - 13 * 16]         ; [03]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 28]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 26]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 9 * 16]          ; [25]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 24]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 - 16]              ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 22]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m1,        m3, [r3 - 11 * 16]         ; [05]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 20]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 18]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 11 * 16]         ; [27]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 16]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 16]              ; [17]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 14]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    pmaddwd     m6,        m3, [r3 - 9 * 16]          ; [07]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 12]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 10]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 13 * 16]         ; [29]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 8]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 3 * 16]          ; [19]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 6]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 7 * 16]          ; [09]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 4]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 2]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 15 * 16]         ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 5 * 16]          ; [21]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
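
Each group of eight rows ends in TRANSPOSE_STORE_8x8 with byte offsets 0/16/32/48; the macro itself is defined earlier in intrapred16.asm and is not part of this hunk. The last row of a group, marked [00], has fraction zero, so the code simply packs the previous result and overwrites the upper half with the raw reference words via movhps. A hedged scalar picture of the transposed store, assuming the macro writes an 8x8 tile of 16-bit samples with rows and columns swapped:

    // Assumption: TRANSPOSE_STORE_8x8 stores the tile transposed; this is
    // an illustrative equivalent, not the macro's actual implementation.
    #include <cstdint>
    #include <cstddef>

    static void transpose_store_8x8(uint16_t* dst, ptrdiff_t dstStride,
                                    const uint16_t src[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                dst[(ptrdiff_t)j * dstStride + i] = src[i][j];
    }
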
+
+%macro MODE_17_19 1
+    movu        m3,        [r2 + 50]                  ; [7 6 5 4 3 2 1 0]
+    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
+
+    pmaddwd     m4,        m3, [r3 - 10 * 16]         ; [06]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 48]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 46]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 44]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 42]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [04]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 40]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 38]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 36]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 34]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 14 * 16]         ; [02]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 32]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [08]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 30]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 28]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 26]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2 + 26]                  ; [00]
+
+    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 24]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 - 10 * 16]         ; [06]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 22]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m3,        [r2 + 20]
+    pshufb      m3,        m2
+
+    pmaddwd     m5,        m3, [r3 + 2 * 16]          ; [18]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 18]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m5,        m0
+
+    movu        m3,        [r2 + 16]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 + 14 * 16]         ; [30]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [04]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2 + 14]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    movu        m3,        [r2 + 12]
+    pshufb      m3,        m2
+
+    pmaddwd     m0,        m3, [r3]                   ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m1,        m0
+
+    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+    movu        m3,        [r2 + 10]
+    pshufb      m3,        m2
+
+    pmaddwd     m4,        m3, [r3 + 6 * 16]          ; [22]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+
+    movu        m3,        [r2 + 8]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m5,        m3, [r3 - 14 * 16]         ; [02]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+
+    movu        m3,        [r2 + 6]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [08]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m5,        m6
+
+    movu        m3,        [r2 + 4]
+    pshufb      m3,        m2
+
+    pmaddwd     m6,        m3, [r3 - 2 * 16]          ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+
+    movu        m3,        [r2 + 2]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m3,        [r2]
+    pshufb      m3,        m2
+
+    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+
+    packusdw    m1,        m1
+    movhps      m1,        [r2]                       ; [00]
+
+    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
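
MODE_16_20 and MODE_17_19 walk the reference array backwards (the movu base drops from [r2 + 50] / [r2 + 40] down to [r2]), and every load is followed by pshufb with m2, which duplicates adjacent words into (ref[i], ref[i+1]) pairs so a single pmaddwd can weight both neighbours. A small sketch of that pairing, with make_pairs() introduced purely for illustration:

    #include <cstdint>

    // Turns [... r3 r2 r1 r0] into the word pairs [r4 r3 | r3 r2 | r2 r1 |
    // r1 r0] (low word first in each pair), matching the "[4 3 3 2 2 1 1 0]"
    // comments; the exact byte mask held in m2 is assumed, not shown here.
    static void make_pairs(const uint16_t* ref, uint16_t out[8])
    {
        for (int k = 0; k < 4; k++)
        {
            out[2 * k]     = ref[k];       // sample i
            out[2 * k + 1] = ref[k + 1];   // sample i + 1
        }
    }
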
+
+;------------------------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang32_2_new, 3,6,6
+    lea             r4, [r2]
+    add             r2, 128
+    cmp             r3m, byte 34
+    cmove           r2, r4
+
+    add             r1, r1
+    lea             r3, [r1 * 2]
+    lea             r4, [r1 * 3]
+    mov             r5, 2
+
+.loop:
+    MODE_2_34
+    add             r2, 32
+    dec             r5
+    jnz             .loop
+    RET
+
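
intra_pred_ang32_2_new and the *_new entry points below take a single src pointer instead of separate refLeft/refAbove arguments; the repeated "add r2, 128" and the cmove on mode 34 imply a combined 16bpp reference buffer in which the left-reference samples start 128 bytes (64 samples) after the above-reference samples. That layout is an assumption read off the offsets in this patch, not something stated in it; a minimal dispatch sketch under that assumption:

    #include <cstdint>

    // Assumed layout for the 32x32 16bpp case: above reference at src[0],
    // left reference at src + 64 (i.e. +128 bytes). select_ref() mirrors
    // the cmp/cmove pair in intra_pred_ang32_2_new.
    static const int REF_LEFT_OFFSET = 64;

    static const uint16_t* select_ref(const uint16_t* src, int dirMode)
    {
        // Mode 34 projects from the above row and keeps the base pointer;
        // mode 2 (and the other left-projected modes) use the left half.
        return (dirMode == 34) ? src : src + REF_LEFT_OFFSET;
    }
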
 INIT_XMM sse4
-cglobal intra_pred_ang32_3, 3,6,8
+cglobal intra_pred_ang32_3_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -13618,199 +11455,9 @@
     jnz         .loop
     RET
 
-%macro MODE_4_32 1
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
-
-    pmaddwd     m4,        m0, [r3 + 5 * 16]          ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    palignr     m5,        m2, m0, 4                  ; [6 5 5 4 4 3 3 2]
-    pmaddwd     m1,        m5, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        [r3 + 15 * 16]             ; [31]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m6,        m2, m0, 8
-    pmaddwd     m6,        [r3 + 4 * 16]              ; [ 20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    palignr     m1,        m2, m0, 12
-    pmaddwd     m6,        m1, [r3 - 7 * 16]          ; [ 9]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m2, [r3 + 3 * 16]          ; [19]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    palignr     m7,        m3, m2, 4                  ; [10 9 9 8 7 6 5 4]
-    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 + 13 * 16]         ; [29]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m0,        [r2 + 34]                  ; [24 23 22 21 20 19 18 17]
-
-    palignr     m2,        m0, m3, 2                  ; [17 16 15 14 13 12 11 10]
-    palignr     m1,        m0, m3, 4                  ; [18 17 16 15 14 13 12 11]
-    punpckhwd   m3,        m2, m1                     ; [18 17 17 16 16 15 15 14]
-    punpcklwd   m2,        m1                         ; [14 13 13 12 12 11 11 10]
-
-    palignr     m1,        m2, m7, 4                  ; [11 10 10 9 9 8 7 6]
-    pmaddwd     m1,        [r3 +  2 * 16]             ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    palignr     m5,        m2, m7, 8
-    mova        m6,        m5
-    pmaddwd     m5,        [r3 - 9 * 16]              ; [07]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        [r3 + 12 * 16]             ; [28]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    palignr     m6,        m2, m7, 12
-    pmaddwd     m6,        [r3 +      16]             ; [17]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m2, [r3 - 10 * 16]         ; [06]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m2, [r3 + 11 * 16]         ; [27]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    palignr     m7,        m3, m2, 4
-    pmaddwd     m7,        [r3]                       ; [16]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m1,        m7
-    mova        m7,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    palignr     m0,        m3, m2, 8
-    pmaddwd     m4,        m0, [r3 - 11 * 16]         ; [5]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m0, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    palignr     m5,        m3, m2, 12
-    pmaddwd     m5,        [r3 - 16]                  ; [15]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    pmaddwd     m6,        m3, [r3 + 9 * 16]          ; [25]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m0,        [r2 + 50]                  ; [32 31 30 29 28 27 26 25]
-    palignr     m2,        m0, m7, 2                  ; [25 24 23 22 21 20 19 18]
-    palignr     m1,        m0, m7, 4                  ; [26 25 24 23 22 21 20 19]
-    punpckhwd   m7,        m2, m1                     ; [26 25 25 24 24 23 23 22]
-    punpcklwd   m2,        m1                         ; [22 21 21 20 20 19 19 18]
-
-    palignr     m1,        m2, m3, 4
-    pmaddwd     m1,        [r3 - 2 * 16]              ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m1,        m2, m3, 8
-    mova        m0,        m1
-    pmaddwd     m1,        [r3 - 13 * 16]             ; [3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        [r3 + 8 * 16]              ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    palignr     m4,        m2, m3, 12
-    pmaddwd     m4,        [r3 - 3 * 16]              ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m2, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m2, [r3 + 7 * 16]          ; [23]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m6,        m7, m2, 4
-    pmaddwd     m6,        [r3 - 4 * 16]              ; [12]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    palignr     m1,        m7, m2, 8
-    pmaddwd     m6,        m1, [r3 - 15 * 16]         ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        [r3 + 6 * 16]              ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m1,        m7, m2, 12
-    pmaddwd     m1,        [r3 - 5 * 16]              ; [11]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 44]                  ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_4, 3,6,8
+cglobal intra_pred_ang32_4_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -13824,194 +11471,9 @@
     jnz         .loop
     RET
 
-%macro MODE_5_31 1
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
-
-    pmaddwd     m4,        m0, [r3 + 16]              ; [17]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    palignr     m1,        m2, m0, 4
-    mova        m5,        m1
-    pmaddwd     m1,        [r3 - 14 * 16]             ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        [r3 + 3 * 16]              ; [19]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m6,        m2, m0, 8
-    mova        m1,        m6
-    pmaddwd     m6,        [r3 - 12 * 16]             ; [4]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m1, [r3 + 5 * 16]          ; [21]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    palignr     m1,        m2, m0, 12
-    mova        m7,        m1
-    pmaddwd     m7,        [r3 - 10 * 16]             ; [6]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    pmaddwd     m1,        [r3 + 7 * 16]              ; [23]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m7,        m2, [r3 - 8 * 16]          ; [8]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m1,        m7
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m2, [r3 + 9 * 16]          ; [25]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    palignr     m7,        m3, m2, 4                  ; [10 9 9 8 7 6 5 4]
-    pmaddwd     m1,        m7, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m7, [r3 + 11 * 16]         ; [27]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m0,        [r2 + 34]                  ; [24 23 22 21 20 19 18 17]
-    palignr     m2,        m0, m3, 2                  ; [17 16 15 14 13 12 11 10]
-    palignr     m1,        m0, m3, 4                  ; [18 17 16 15 14 13 12 11]
-    punpckhwd   m3,        m2, m1                     ; [18 17 17 16 16 15 15 14]
-    punpcklwd   m2,        m1                         ; [14 13 13 12 12 11 11 10]
-
-    palignr     m6,        m2, m7, 4
-    pmaddwd     m1,        m6, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    pmaddwd     m6,        [r3 + 13 * 16]             ; [29]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    palignr     m1,        m2, m7, 8
-    mova        m0,        m1
-    pmaddwd     m1,        [r3 - 2 * 16]              ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m0, [r3 + 15 * 16]         ; [31]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    palignr     m0,        m2, m7, 12
-    pmaddwd     m0,        [r3]                       ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m2, [r3 - 15 * 16]         ; [1]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m2, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    palignr     m1,        m3, m2, 4
-    pmaddwd     m5,        m1, [r3 - 13 * 16]         ; [3]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m1,        [r3 + 4 * 16]              ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    palignr     m1,        m3, m2, 8
-    pmaddwd     m6,        m1, [r3 - 11 * 16]         ; [5]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        [r3 + 6 * 16]              ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m7,        m3, m2, 12
-    pmaddwd     m1,        m7, [r3 - 9 * 16]          ; [7]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m7,        [r3 + 8 * 16]              ; [24]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m1,        m7
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 7 * 16]          ; [9]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m0,        [r2 + 36]                  ; [25 24 23 22 21 20 19 18]
-    palignr     m1,        m0, 2                      ; [x 25 24 23 22 21 20 19]
-    punpcklwd   m0,        m1                         ; [22 21 21 20 20 19 19 18]
-
-    palignr     m1,        m0, m3, 4
-    pmaddwd     m5,        m1, [r3 - 5 * 16]          ; [11]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m1,        [r3 + 12 * 16]             ; [28]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    palignr     m1,        m0, m3, 8
-    pmaddwd     m6,        m1, [r3 - 3 * 16]          ; [13]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m1,        m0, m3, 12
-    pmaddwd     m1,        [r3 - 16]                  ; [15]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 36]                  ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_5, 3,6,8
+cglobal intra_pred_ang32_5_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -14025,184 +11487,9 @@
     jnz         .loop
     RET
 
-%macro MODE_6_30 1
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movu        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
-
-    pmaddwd     m4,        m0, [r3 - 3 * 16]          ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m0, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    palignr     m1,        m2, m0, 4
-    pmaddwd     m5,        m1, [r3 - 9 * 16]          ; [7]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m1,        [r3 + 4 * 16]              ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    palignr     m1,        m2, m0, 8
-    pmaddwd     m6,        m1, [r3 - 15 * 16]         ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m7,        m1, [r3 - 2 * 16]          ; [14]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    pmaddwd     m1,        [r3 + 11 * 16]             ; [27]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    palignr     m7,        m2, m0, 12
-    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 +  5 * 16]         ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m2, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m2, [r3 - 16]              ; [15]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m2, [r3 + 12 * 16]         ; [28]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    palignr     m7,        m3, m2, 4
-    pmaddwd     m6,        m7, [r3 - 7 * 16]          ; [9]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m7, [r3 + 6 * 16]          ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m0,        [r2 + 34]                  ; [24 23 22 21 20 19 18 17]
-    palignr     m2,        m0, m3, 2                  ; [17 16 15 14 13 12 11 10]
-    palignr     m1,        m0, m3, 4                  ; [18 17 16 15 14 13 12 11]
-    punpckhwd   m3,        m2, m1                     ; [18 17 17 16 16 15 15 14]
-    punpcklwd   m2,        m1                         ; [14 13 13 12 12 11 11 10]
-
-    palignr     m0,        m2, m7, 4
-    pmaddwd     m1,        m0, [r3 - 13 * 16]         ; [3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        [r3]                       ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    palignr     m4,        m2, m7, 4
-    pmaddwd     m4,        [r3 +  13 * 16]            ; [29]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    palignr     m5,        m2, m7, 8
-    pmaddwd     m1,        m5, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        [r3 + 7 * 16]              ; [23]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m1,        m2, m7, 12
-    pmaddwd     m6,        m1, [r3 - 12 * 16]         ; [4]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m1, [r3 + 16]              ; [17]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m2, [r3 - 5 * 16]          ; [11]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m2, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    palignr     m5,        m3, m2, 4
-    pmaddwd     m4,        m5, [r3 - 11 * 16]         ; [5]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m5, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        [r3 + 15 * 16]             ; [31]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m6,        m3, m2, 8
-    pmaddwd     m1,        m6, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m5,        m1
-
-    pmaddwd     m6,        [r3 + 9 * 16]              ; [25]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    palignr     m1,        m3, m2, 12
-    pmaddwd     m0,        m1, [r3 - 10 * 16]         ; [6]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m6,        m0
-
-    pmaddwd     m1,        [r3 + 3 * 16]              ; [19]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 28]                  ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_6, 3,6,8
+cglobal intra_pred_ang32_6_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -14216,178 +11503,9 @@
     jnz         .loop
     RET
 
-%macro MODE_7_29 1
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movd        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
-
-    pmaddwd     m4,        m0, [r3 - 7 * 16]          ; [9]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m0, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m0, [r3 + 11 * 16]         ; [27]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m1,        m2, m0, 4
-    pmaddwd     m6,        m1, [r3 - 12 * 16]         ; [4]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m1, [r3 - 3 * 16]          ; [13]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m7,        m1, [r3 + 6 * 16]          ; [22]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m6,        m7
-
-    pmaddwd     m1,        [r3 + 15 * 16]             ; [31]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    mova        m3,        m0
-    palignr     m7,        m2, m0, 8
-    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 + 16]              ; [17]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m7, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    palignr     m1,        m2, m3, 12
-    pmaddwd     m5,        m1, [r3 - 13 * 16]         ; [3]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m1, [r3 - 4 * 16]          ; [12]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m1, [r3 + 5 * 16]          ; [21]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        [r3 + 14 * 16]             ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m2, [r3 - 9 * 16]          ; [7]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m2, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m2, [r3 + 9 * 16]          ; [25]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m7,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m1,        m7, 2                      ; [x 16 15 14 13 12 11 10]
-    punpcklwd   m7,        m1                         ; [13 12 12 11 11 10 10 9]
-
-    palignr     m6,        m7, m2, 4
-    pmaddwd     m1,        m6, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m6, [r3 - 5 * 16]          ; [11]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m6, [r3 + 4 * 16]          ; [20]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        [r3 + 13 * 16]             ; [29]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    palignr     m0,        m7, m2, 8
-    pmaddwd     m1,        m0, [r3 - 10 * 16]         ; [6]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m0, [r3 - 16]              ; [15]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        [r3 + 8 * 16]              ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    palignr     m0,        m7, m2, 12
-    pmaddwd     m4,        m0, [r3 - 15 * 16]         ; [1]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m0, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m0, [r3 + 3 * 16]          ; [19]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        [r3 + 12 * 16]             ; [28]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m7, [r3 - 11 * 16]         ; [5]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m0,        m7, [r3 - 2 * 16]          ; [14]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m6,        m0
-
-    pmaddwd     m1,        m7, [r3 + 7 * 16]          ; [23]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 20]                  ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_7, 3,6,8
+cglobal intra_pred_ang32_7_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -14401,171 +11519,9 @@
     jnz         .loop
     RET
 
-%macro MODE_8_28 1
-    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    movd        m3,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
-    palignr     m1,        m3, m0, 2                  ; [9 8 7 6 5 4 3 2]
-    punpckhwd   m2,        m0, m1                     ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m0,        m1                         ; [5 4 4 3 3 2 2 1]
-
-    pmaddwd     m4,        m0, [r3 - 11 * 16]         ; [5]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m0, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m0, [r3 - 16]              ; [15]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m0, [r3 + 4 * 16]          ; [20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m0, [r3 + 9 * 16]          ; [25]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m0, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    palignr     m7,        m2, m0, 4
-    pmaddwd     m1,        m7, [r3 - 13 * 16]         ; [3]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    mova        m3,        m0
-    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 - 3 * 16]          ; [13]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m7, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m7, [r3 + 7 * 16]          ; [23]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m7, [r3 + 12 * 16]         ; [28]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    palignr     m7,        m2, m3, 8
-    pmaddwd     m6,        m7, [r3 - 15 * 16]         ; [1]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m7, [r3 - 10 * 16]         ; [6]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m7, [r3 - 5 * 16]          ; [11]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m7, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 + 5 * 16]          ; [21]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m7, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m7, [r3 + 15 * 16]         ; [31]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    palignr     m7,        m2, m3, 12
-    pmaddwd     m0,        m7, [r3 - 12 * 16]         ; [4]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m7, [r3 - 7 * 16]          ; [9]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m7, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m7, [r3 + 3 * 16]          ; [19]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m7, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 + 13 * 16]         ; [29]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m2, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m2, [r3 - 9 * 16]          ; [7]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m2, [r3 - 4 * 16]          ; [12]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m2, [r3 + 16]              ; [17]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m0,        m2, [r3 + 6 * 16]          ; [22]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m6,        m0
-
-    pmaddwd     m1,        m2, [r3 + 11 * 16]         ; [27]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 12]                  ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_8, 3,6,8
+cglobal intra_pred_ang32_8_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -14579,165 +11535,9 @@
     jnz         .loop
     RET
 
-%macro MODE_9_27 1
-    movu        m3,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
-    palignr     m1,        m3, 2                      ; [9 8 7 6 5 4 3 2]
-    punpckhwd   m2,        m3, m1                     ; [9 8 8 7 7 6 6 5]
-    punpcklwd   m3,        m1                         ; [5 4 4 3 3 2 2 1]
-
-    pmaddwd     m4,        m3, [r3 - 14 * 16]         ; [2]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 10 * 16]         ; [6]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [8]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 4]                   ; [00]
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    palignr     m7,        m2, m3, 4
-    pmaddwd     m4,        m7, [r3 - 14 * 16]         ; [2]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m7, [r3 - 12 * 16]         ; [4]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m7, [r3 - 10 * 16]         ; [6]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m7, [r3 - 8 * 16]          ; [8]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m7, [r3 - 6 * 16]          ; [10]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m7, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m7, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m7, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m7, [r3 + 2 * 16]          ; [18]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m7, [r3 + 4 * 16]          ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m7, [r3 + 6 * 16]          ; [22]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m7, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m7, [r3 + 10 * 16]         ; [26]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m0,        m7, [r3 + 12 * 16]         ; [28]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m6,        m0
-
-    pmaddwd     m7,        [r3 + 14 * 16]             ; [30]
-    paddd       m7,        [pd_16]
-    psrld       m7,        5
-    packusdw    m7,        m7
-    movhps      m7,        [r2 + 6]                   ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_9, 3,6,8
+cglobal intra_pred_ang32_9_new, 3,6,8
+    add         r2, 128
     lea         r3, [ang_table + 16 * 16]
     mov         r4d, 8
     add         r1, r1
@@ -14751,11 +11551,9 @@
     jnz         .loop
     RET
 
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_10, 4,7,8
+cglobal intra_pred_ang32_10_new, 3,7,8
+    add         r2, 128
     mov         r6d, 4
     add         r1, r1
     lea         r5, [r1 * 3]
@@ -14828,167 +11626,12 @@
     jnz         .loop
     RET
 
-%macro MODE_11_25 1
-    movu        m3,        [r2 + 2]                   ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        [pw_punpcklwd]             ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [8]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 10 * 16]         ; [6]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 2]                   ; [00]
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2]                       ; [6 5 4 3 2 1 0 16]
-    pshufb      m3,        [pw_punpcklwd]             ; [3 2 2 1 1 0 0 16]
-
-    pmaddwd     m4,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [8]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 10 * 16]         ; [6]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [4]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
+cglobal intra_pred_ang32_11_new, 3,6,7,0-(4*mmsize+4)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -15015,178 +11658,12 @@
     jnz      .loop
     RET
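
intra_pred_ang32_11_new keeps the original src pointer in r3 and, after switching to the left half of the buffer, patches lane 0 of the first load with "pinsrw m0, [r3], 0". The reading here, which is an assumption, is that the sample at the original pointer is the shared corner sample that the 11/25 kernel expects at index 0 of its stack copy of the reference:

    #include <cstdint>

    // Illustrative only: patch_corner() is a name made up for this sketch.
    static void patch_corner(uint16_t* refCopy, const uint16_t* src)
    {
        refCopy[0] = src[0];   // what pinsrw m0, [r3], 0 does to lane 0
    }
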
 
-%macro MODE_12_24 1
-    movu        m3,        [r2 + 8]                   ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 + 11 * 16]         ; [27]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 16]              ; [17]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 9 * 16]          ; [7]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [2]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 6]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 13 * 16]         ; [29]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 + 3 * 16]          ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 7 * 16]          ; [9]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [4]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 4]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 15 * 16]         ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 + 5 * 16]          ; [21]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 5 * 16]          ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [6]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 15 * 16]         ; [1]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 2]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m3, [r3 + 7 * 16]          ; [23]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 3 * 16]          ; [13]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [8]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 13 * 16]         ; [3]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 9 * 16]          ; [25]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 16]              ; [15]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 11 * 16]         ; [5]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
+cglobal intra_pred_ang32_12_new, 3,6,7,0-(4*mmsize+10)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -15221,190 +11698,12 @@
     jnz      .loop
     RET
 
-%macro MODE_13_23 1
-    movu        m3,        [r2 + 16]                  ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 + 7 * 16]          ; [23]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 11 * 16]         ; [05]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 14]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 + 3 * 16]          ; [19]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 15 * 16]         ; [01]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 12]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 16]              ; [15]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 10]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 13 * 16]         ; [29]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 5 * 16]          ; [11]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 8]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 9 * 16]          ; [25]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 9 * 16]          ; [07]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 6]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 5 * 16]          ; [21]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m3, [r3 - 13 * 16]         ; [03]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 4]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 + 16]              ; [17]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 2]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 15 * 16]         ; [31]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 3 * 16]          ; [13]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 11 * 16]         ; [27]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 7 * 16]          ; [09]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
+cglobal intra_pred_ang32_13_new, 3,6,7,0-(5*mmsize+2)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -15441,202 +11740,12 @@
     jnz     .loop
     RET
 
-%macro MODE_14_22 1
-    movu        m3,        [r2 + 24]                  ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 + 3 * 16]          ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 22]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 9 * 16]          ; [25]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 20]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 15 * 16]         ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 11 * 16]         ; [05]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 18]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 5 * 16]          ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 16]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 + 16]              ; [17]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 14]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 7 * 16]          ; [23]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 12]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 13 * 16]         ; [29]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 13 * 16]         ; [03]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 10]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 7 * 16]          ; [09]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 8]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m3, [r3 - 16]              ; [15]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 6]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 5 * 16]          ; [21]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 4]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 11 * 16]         ; [27]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 15 * 16]         ; [01]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 2]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 9 * 16]          ; [07]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 3 * 16]          ; [13]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_14, 4,6,7,0-(5*mmsize+10)
+cglobal intra_pred_ang32_14_new, 3,6,7,0-(5*mmsize+10)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -15674,214 +11783,12 @@
     jnz     .loop
     RET
 
-%macro MODE_15_21 1
-    movu        m3,        [r2 + 32]                  ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 - 16]              ; [15]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 30]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 3 * 16]          ; [13]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 28]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 5 * 16]          ; [11]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 26]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 7 * 16]          ; [09]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 24]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 9 * 16]          ; [07]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 22]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 11 * 16]         ; [05]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 20]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    pmaddwd     m6,        m3, [r3 - 13 * 16]         ; [03]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 18]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 15 * 16]         ; [01]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 16]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 14]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 15 * 16]         ; [31]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 12]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 13 * 16]         ; [29]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m0,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    movu        m3,        [r2 + 10]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 11 * 16]         ; [27]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 8]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 9 * 16]          ; [25]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 6]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 7 * 16]          ; [23]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 4]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 5 * 16]          ; [21]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 2]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 3 * 16]          ; [19]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 16]              ; [17]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2)
+cglobal intra_pred_ang32_15_new, 3,6,7,0-(6*mmsize+2)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -15920,226 +11827,12 @@
     jnz     .loop
     RET
 
-%macro MODE_16_20 1
-    movu        m3,        [r2 + 40]                  ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 - 5 * 16]          ; [11]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 38]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 15 * 16]         ; [01]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 36]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 34]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 7 * 16]          ; [23]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 14 * 16]         ; [02]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 32]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 - 3 * 16]          ; [13]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 30]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    pmaddwd     m4,        m3, [r3 - 13 * 16]         ; [03]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 28]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 26]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 9 * 16]          ; [25]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    pmaddwd     m6,        m3, [r3 - 12 * 16]         ; [04]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 24]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 - 16]              ; [15]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 22]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    pmaddwd     m1,        m3, [r3 - 11 * 16]         ; [05]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 20]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 18]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 11 * 16]         ; [27]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    pmaddwd     m1,        m3, [r3 - 10 * 16]         ; [06]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 16]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 16]              ; [17]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 14]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    pmaddwd     m6,        m3, [r3 - 9 * 16]          ; [07]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 12]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 10]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 13 * 16]         ; [29]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    pmaddwd     m0,        m3, [r3 - 8 * 16]          ; [08]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 8]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 3 * 16]          ; [19]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 6]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 7 * 16]          ; [09]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 4]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 2]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 15 * 16]         ; [31]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 5 * 16]          ; [21]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10)
+cglobal intra_pred_ang32_16_new, 3,6,7,0-(6*mmsize+10)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -16178,239 +11871,12 @@
     jnz     .loop
     RET
 
-%macro MODE_17_19 1
-    movu        m3,        [r2 + 50]                  ; [7 6 5 4 3 2 1 0]
-    pshufb      m3,        m2                         ; [4 3 3 2 2 1 1 0]
-
-    pmaddwd     m4,        m3, [r3 - 10 * 16]         ; [06]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 48]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 46]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 44]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 42]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [04]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 40]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 38]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 36]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 34]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 14 * 16]         ; [02]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 32]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [08]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 30]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 28]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 26]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2 + 26]                  ; [00]
-
-    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 24]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 - 10 * 16]         ; [06]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 22]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 - 4 * 16]          ; [12]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    movu        m3,        [r2 + 20]
-    pshufb      m3,        m2
-
-    pmaddwd     m5,        m3, [r3 + 2 * 16]          ; [18]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 18]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3 + 8 * 16]          ; [24]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m5,        m0
-
-    movu        m3,        [r2 + 16]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 + 14 * 16]         ; [30]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    pmaddwd     m1,        m3, [r3 - 12 * 16]         ; [04]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2 + 14]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 - 6 * 16]          ; [10]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    movu        m3,        [r2 + 12]
-    pshufb      m3,        m2
-
-    pmaddwd     m0,        m3, [r3]                   ; [16]
-    paddd       m0,        [pd_16]
-    psrld       m0,        5
-    packusdw    m1,        m0
-
-    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
-    movu        m3,        [r2 + 10]
-    pshufb      m3,        m2
-
-    pmaddwd     m4,        m3, [r3 + 6 * 16]          ; [22]
-    paddd       m4,        [pd_16]
-    psrld       m4,        5
-
-    movu        m3,        [r2 + 8]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 12 * 16]         ; [28]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m4,        m1
-
-    pmaddwd     m5,        m3, [r3 - 14 * 16]         ; [02]
-    paddd       m5,        [pd_16]
-    psrld       m5,        5
-
-    movu        m3,        [r2 + 6]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 - 8 * 16]          ; [08]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-    packusdw    m5,        m6
-
-    movu        m3,        [r2 + 4]
-    pshufb      m3,        m2
-
-    pmaddwd     m6,        m3, [r3 - 2 * 16]          ; [14]
-    paddd       m6,        [pd_16]
-    psrld       m6,        5
-
-    movu        m3,        [r2 + 2]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 4 * 16]          ; [20]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-    packusdw    m6,        m1
-
-    movu        m3,        [r2]
-    pshufb      m3,        m2
-
-    pmaddwd     m1,        m3, [r3 + 10 * 16]         ; [26]
-    paddd       m1,        [pd_16]
-    psrld       m1,        5
-
-    packusdw    m1,        m1
-    movhps      m1,        [r2]                       ; [00]
-
-    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4)
+cglobal intra_pred_ang32_17_new, 3,6,7,0-(7*mmsize+4)
+    mov      r3, r2mp
+    add      r2, 128
     movu     m0, [r2 + 0*mmsize]
+    pinsrw   m0, [r3], 0
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
     movu     m3, [r2 + 3*mmsize]
@@ -16455,11 +11921,10 @@
     jnz     .loop
     RET
 
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_18, 4,7,8
+cglobal intra_pred_ang32_18_new, 3,7,8
+    mov      r3, r2mp
+    add      r2, 128
     movu        m0, [r3]               ; [7 6 5 4 3 2 1 0]
     movu        m1, [r3 + 16]          ; [15 14 13 12 11 10 9 8]
     movu        m2, [r3 + 32]          ; [23 22 21 20 19 18 17 16]
@@ -16767,12 +12232,9 @@
     movu        [r0 + r3 + 48], m6
     RET
 
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4)
-    xchg     r2, r3
+cglobal intra_pred_ang32_19_new, 3,7,7,0-(7*mmsize+4)
+    lea      r3, [r2 + 128]
     movu     m0, [r2 + 0*mmsize]
     movu     m1, [r2 + 1*mmsize]
     movu     m2, [r2 + 2*mmsize]
@@ -16820,1346 +12282,6 @@
     jnz     .loop
     RET
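
(The per-sample arithmetic in the MODE_* macros being dropped here, and in their *_new replacements, is the usual HEVC angular interpolation: pshufb builds interleaved neighbour pairs, pmaddwd multiplies each pair by an ang_table row, and paddd [pd_16] / psrld 5 rounds. Assuming each ang_table row holds the (32 - fract, fract) word pair for the bracketed weight in the comments, e.g. [27], [22], one pmaddwd/paddd/psrld triplet reduces to the scalar form below; the helper is illustrative, not code from the patch:)

    #include <stdint.h>

    /* One output sample: weighted average of two neighbouring
     * reference pixels, rounded to 16-bit. */
    static inline uint16_t ang_sample(const uint16_t* ref, int idx, int fract)
    {
        return (uint16_t)(((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5);
    }

(The packusdw instructions then repack pairs of 32-bit results into 16-bit samples with unsigned saturation before TRANSPOSE_STORE_8x8 writes them out.)
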
 
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10)
-    xchg     r2, r3
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 2*mmsize + 8], m0
-    movu     [rsp + 3*mmsize + 8], m1
-    movu     [rsp + 4*mmsize + 8], m2
-    movu     [rsp + 5*mmsize + 8], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 104], r4w
-    movu     m0, [r3 + 4]
-    movu     m1, [r3 + 22]
-    movu     m2, [r3 + 40]
-    movd     m3, [r3 + 58]
-    pshufb   m0, [shuf_mode_16_20]
-    pshufb   m1, [shuf_mode_16_20]
-    pshufb   m2, [shuf_mode_16_20]
-    pshufb   m3, [shuf_mode_16_20]
-    movu     [rsp + 24], m0
-    movu     [rsp + 12], m1
-    movu     [rsp], m2
-    movd     [rsp], m3
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-    mov      r6, r0
-
-.loop:
-    MODE_16_20 0
-    add      r6, 8
-    mov      r0, r6
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2)
-    xchg     r2, r3
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 2*mmsize], m0
-    movu     [rsp + 3*mmsize], m1
-    movu     [rsp + 4*mmsize], m2
-    movu     [rsp + 5*mmsize], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 96], r4w
-    movu     m0, [r3 + 4]
-    movu     m1, [r3 + 18]
-    movu     m2, [r3 + 34]
-    movu     m3, [r3 + 48]
-    pshufb   m0, [shuf_mode_15_21]
-    pshufb   m1, [shuf_mode_15_21]
-    pshufb   m2, [shuf_mode_15_21]
-    pshufb   m3, [shuf_mode_15_21]
-    movh     [rsp + 24], m0
-    movh     [rsp + 16], m1
-    movh     [rsp + 8], m2
-    movh     [rsp], m3
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-    mov      r6, r0
-
-.loop:
-    MODE_15_21 0
-    add      r6, 8
-    mov      r0, r6
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10)
-    xchg     r2, r3
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 1*mmsize + 8], m0
-    movu     [rsp + 2*mmsize + 8], m1
-    movu     [rsp + 3*mmsize + 8], m2
-    movu     [rsp + 4*mmsize + 8], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 88], r4w
-    mov      r4w, [r3+4]
-    mov      [rsp+22], r4w
-    movu     m0, [r3 + 10]
-    movu     m1, [r3 + 30]
-    movu     m2, [r3 + 50]
-    pshufb   m0, [shuf_mode_14_22]
-    pshufb   m1, [shuf_mode_14_22]
-    pshufb   m2, [shuf_mode_14_22]
-    movh     [rsp + 14], m0
-    movh     [rsp + 6], m1
-    movh     [rsp - 2], m2
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-    mov      r6, r0
-
-.loop:
-    MODE_14_22 0
-    add      r6, 8
-    mov      r0, r6
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2)
-    xchg     r2, r3
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 1*mmsize], m0
-    movu     [rsp + 2*mmsize], m1
-    movu     [rsp + 3*mmsize], m2
-    movu     [rsp + 4*mmsize], m3
-
-    mov      r4w, [r2+64]
-    mov      [rsp+80], r4w
-    movu     m0, [r3 + 8]
-    movu     m1, [r3 + 36]
-    pshufb   m0, [shuf_mode_13_23]
-    pshufb   m1, [shuf_mode_13_23]
-    movh     [rsp + 8], m0
-    movh     [rsp], m1
-    mov      r4w, [r3+28]
-    mov      [rsp+8], r4w
-    mov      r4w, [r3+56]
-    mov      [rsp], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-    mov      r6, r0
-
-.loop:
-    MODE_13_23 0
-    add      r6, 8
-    mov      r0, r6
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10)
-    xchg     r2, r3
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-
-    movu     [rsp + 0*mmsize + 8], m0
-    movu     [rsp + 1*mmsize + 8], m1
-    movu     [rsp + 2*mmsize + 8], m2
-    movu     [rsp + 3*mmsize + 8], m3
-
-    mov      r4w, [r2+64]
-    mov      [rsp+72], r4w
-    mov      r4w, [r3+12]
-    mov      [rsp+6], r4w
-    mov      r4w, [r3+26]
-    mov      [rsp+4], r4w
-    mov      r4w, [r3+38]
-    mov      [rsp+2], r4w
-    mov      r4w, [r3+52]
-    mov      [rsp], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mov     r6, r0
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_12_24 0
-    add      r6, 8
-    mov      r0, r6
-    add      r2, 8
-    dec      r4
-    jnz      .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4)
-    xchg     r2, r3
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 0*mmsize + 2], m0
-    movu     [rsp + 1*mmsize + 2], m1
-    movu     [rsp + 2*mmsize + 2], m2
-    movu     [rsp + 3*mmsize + 2], m3
-    mov      r4w, [r3+32]
-    mov      [rsp], r4w
-    mov      r4w, [r2+64]
-    mov      [rsp+66], r4w
-
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    mov         r2, rsp
-    add         r1, r1
-    lea         r5, [r1 * 3]
-    mov         r6, r0
-
-.loop:
-    MODE_11_25 0
-    add         r6, 8
-    mov         r0, r6
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_26, 4,7,5
-    mov         r6d, 4
-    add         r1, r1
-    lea         r2, [r1 * 2]
-    lea         r4, [r1 * 3]
-    lea         r5, [r1 * 4]
-    mova        m4, [c_mode32_10_0]
-
-    movu        m0, [r3 + 2]
-    movu        m1, [r3 + 18]
-    movu        m2, [r3 + 34]
-    movu        m3, [r3 + 50]
-
-.loop:
-    movu        [r0], m0
-    movu        [r0 + 16], m1
-    movu        [r0 + 32], m2
-    movu        [r0 + 48], m3
-
-    movu        [r0 + r1], m0
-    movu        [r0 + r1 + 16], m1
-    movu        [r0 + r1 + 32], m2
-    movu        [r0 + r1 + 48], m3
-
-    movu        [r0 + r2], m0
-    movu        [r0 + r2 + 16], m1
-    movu        [r0 + r2 + 32], m2
-    movu        [r0 + r2 + 48], m3
-
-    movu        [r0 + r4], m0
-    movu        [r0 + r4 + 16], m1
-    movu        [r0 + r4 + 32], m2
-    movu        [r0 + r4 + 48], m3
-
-    add         r0, r5
-
-    movu        [r0], m0
-    movu        [r0 + 16], m1
-    movu        [r0 + 32], m2
-    movu        [r0 + 48], m3
-
-    movu        [r0 + r1], m0
-    movu        [r0 + r1 + 16], m1
-    movu        [r0 + r1 + 32], m2
-    movu        [r0 + r1 + 48], m3
-
-    movu        [r0 + r2], m0
-    movu        [r0 + r2 + 16], m1
-    movu        [r0 + r2 + 32], m2
-    movu        [r0 + r2 + 48], m3
-
-    movu        [r0 + r4], m0
-    movu        [r0 + r4 + 16], m1
-    movu        [r0 + r4 + 32], m2
-    movu        [r0 + r4 + 48], m3
-
-    add         r0, r5
-    dec         r6d
-    jnz         .loop
-    RET
-
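
(For context on the intra_pred_ang32_26 kernel removed just above: mode 26 is pure vertical prediction, so the store loop simply broadcasts the 32 above-reference pixels at refAbove + 1 to every row. Roughly the scalar equivalent, with loop bounds written out for clarity rather than taken from the patch:)

    #include <stdint.h>

    typedef uint16_t pixel;

    /* Scalar equivalent of the removed intra_pred_ang32_26 store loop:
     * every destination row is a copy of refAbove[1..32]. */
    static void pred_ang32_26_c(pixel* dst, intptr_t dstStride, const pixel* refAbove)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++)
                dst[y * dstStride + x] = refAbove[x + 1];
    }
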
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_27, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-
-.loop:
-    MODE_9_27 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_28, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-
-.loop:
-    MODE_8_28 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_29, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-
-.loop:
-    MODE_7_29 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_30, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-
-.loop:
-    MODE_6_30 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_31, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-
-.loop:
-    MODE_5_31 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_32, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-
-.loop:
-    MODE_4_32 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_33, 4,7,8
-    xchg   r2, r3mp
-    lea    r3, [ang_table + 16 * 16]
-    add    r1, r1
-    lea    r5, [r1 * 3]
-    mov    r6, r0
-    mov    r4d, 8
-.loop:
-    MODE_3_33 0
-    add    r6, 8
-    mov    r0, r6
-    add    r2, 8
-    dec    r4
-    jnz    .loop
-    RET
-
-;------------------------------------------------------------------------------------------
-; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang32_2_new, 3,6,6
-    lea             r4, [r2]
-    add             r2, 128
-    cmp             r3m, byte 34
-    cmove           r2, r4
-
-    add             r1, r1
-    lea             r3, [r1 * 2]
-    lea             r4, [r1 * 3]
-    mov             r5, 2
-
-.loop:
-    MODE_2_34
-    add             r2, 32
-    dec             r5
-    jnz             .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_3_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_3_33 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_4_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_4_32 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_5_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_5_31 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_6_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_6_30 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_7_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_7_29 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_8_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_8_28 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_9_new, 3,6,8
-    add         r2, 128
-    lea         r3, [ang_table + 16 * 16]
-    mov         r4d, 8
-    add         r1, r1
-    lea         r5, [r1 * 3]
-
-.loop:
-    MODE_9_27 1
-    lea         r0, [r0 + r1 * 4 ]
-    add         r2, 8
-    dec         r4
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_10_new, 3,7,8
-    add         r2, 128
-    mov         r6d, 4
-    add         r1, r1
-    lea         r5, [r1 * 3]
-    lea         r4, [r1 * 2]
-    lea         r3, [r1 * 4]
-    mova        m7, [c_mode32_10_0]
-
-.loop:
-    movu        m0, [r2 + 2]
-    pshufb      m1, m0, m7
-    movu        [r0], m1
-    movu        [r0 + 16], m1
-    movu        [r0 + 32], m1
-    movu        [r0 + 48], m1
-
-    palignr     m1, m0, 2
-    pshufb      m1, m7
-    movu        [r0 + r1], m1
-    movu        [r0 + r1 + 16], m1
-    movu        [r0 + r1 + 32], m1
-    movu        [r0 + r1 + 48], m1
-
-    palignr     m1, m0, 4
-    pshufb      m1, m7
-    movu        [r0 + r4], m1
-    movu        [r0 + r4 + 16], m1
-    movu        [r0 + r4 + 32], m1
-    movu        [r0 + r4 + 48], m1
-
-    palignr     m1, m0, 6
-    pshufb      m1, m7
-    movu        [r0 + r5], m1
-    movu        [r0 + r5 + 16], m1
-    movu        [r0 + r5 + 32], m1
-    movu        [r0 + r5 + 48], m1
-
-    add         r0, r3
-
-    palignr     m1, m0, 8
-    pshufb      m1, m7
-    movu        [r0], m1
-    movu        [r0 + 16], m1
-    movu        [r0 + 32], m1
-    movu        [r0 + 48], m1
-
-    palignr     m1, m0, 10
-    pshufb      m1, m7
-    movu        [r0 + r1], m1
-    movu        [r0 + r1 + 16], m1
-    movu        [r0 + r1 + 32], m1
-    movu        [r0 + r1 + 48], m1
-
-    palignr     m1, m0, 12
-    pshufb      m1, m7
-    movu        [r0 + r4], m1
-    movu        [r0 + r4 + 16], m1
-    movu        [r0 + r4 + 32], m1
-    movu        [r0 + r4 + 48], m1
-
-    palignr     m1, m0, 14
-    pshufb      m1, m7
-    movu        [r0 + r5], m1
-    movu        [r0 + r5 + 16], m1
-    movu        [r0 + r5 + 32], m1
-    movu        [r0 + r5 + 48], m1
-
-    add         r0, r3
-    add         r2, 16
-    dec         r6d
-    jnz         .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_11_new, 3,6,7,0-(4*mmsize+4)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 0*mmsize + 2], m0
-    movu     [rsp + 1*mmsize + 2], m1
-    movu     [rsp + 2*mmsize + 2], m2
-    movu     [rsp + 3*mmsize + 2], m3
-    mov      r4w, [r3+32]
-    mov      [rsp], r4w
-    mov      r4w, [r2+64]
-    mov      [rsp+66], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-
-.loop:
-    MODE_11_25 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz      .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_12_new, 3,6,7,0-(4*mmsize+10)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 0*mmsize + 8], m0
-    movu     [rsp + 1*mmsize + 8], m1
-    movu     [rsp + 2*mmsize + 8], m2
-    movu     [rsp + 3*mmsize + 8], m3
-
-    mov      r4w, [r2+64]
-    mov      [rsp+72], r4w
-    mov      r4w, [r3+12]
-    mov      [rsp+6], r4w
-    mov      r4w, [r3+26]
-    mov      [rsp+4], r4w
-    mov      r4w, [r3+38]
-    mov      [rsp+2], r4w
-    mov      r4w, [r3+52]
-    mov      [rsp], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_12_24 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz      .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_13_new, 3,6,7,0-(5*mmsize+2)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 1*mmsize], m0
-    movu     [rsp + 2*mmsize], m1
-    movu     [rsp + 3*mmsize], m2
-    movu     [rsp + 4*mmsize], m3
-
-    mov      r4w, [r2+64]
-    mov      [rsp+80], r4w
-    movu     m0, [r3 + 8]
-    movu     m1, [r3 + 36]
-    pshufb   m0, [shuf_mode_13_23]
-    pshufb   m1, [shuf_mode_13_23]
-    movh     [rsp + 8], m0
-    movh     [rsp], m1
-    mov      r4w, [r3+28]
-    mov      [rsp+8], r4w
-    mov      r4w, [r3+56]
-    mov      [rsp], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_13_23 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_14_new, 3,6,7,0-(5*mmsize+10)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 1*mmsize + 8], m0
-    movu     [rsp + 2*mmsize + 8], m1
-    movu     [rsp + 3*mmsize + 8], m2
-    movu     [rsp + 4*mmsize + 8], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 88], r4w
-    mov      r4w, [r3+4]
-    mov      [rsp+22], r4w
-    movu     m0, [r3 + 10]
-    movu     m1, [r3 + 30]
-    movu     m2, [r3 + 50]
-    pshufb   m0, [shuf_mode_14_22]
-    pshufb   m1, [shuf_mode_14_22]
-    pshufb   m2, [shuf_mode_14_22]
-    movh     [rsp + 14], m0
-    movh     [rsp + 6], m1
-    movh     [rsp - 2], m2
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_14_22 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_15_new, 3,6,7,0-(6*mmsize+2)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 2*mmsize], m0
-    movu     [rsp + 3*mmsize], m1
-    movu     [rsp + 4*mmsize], m2
-    movu     [rsp + 5*mmsize], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 96], r4w
-    movu     m0, [r3 + 4]
-    movu     m1, [r3 + 18]
-    movu     m2, [r3 + 34]
-    movu     m3, [r3 + 48]
-    pshufb   m0, [shuf_mode_15_21]
-    pshufb   m1, [shuf_mode_15_21]
-    pshufb   m2, [shuf_mode_15_21]
-    pshufb   m3, [shuf_mode_15_21]
-    movh     [rsp + 24], m0
-    movh     [rsp + 16], m1
-    movh     [rsp + 8], m2
-    movh     [rsp], m3
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_15_21 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_16_new, 3,6,7,0-(6*mmsize+10)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 2*mmsize + 8], m0
-    movu     [rsp + 3*mmsize + 8], m1
-    movu     [rsp + 4*mmsize + 8], m2
-    movu     [rsp + 5*mmsize + 8], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 104], r4w
-    movu     m0, [r3 + 4]
-    movu     m1, [r3 + 22]
-    movu     m2, [r3 + 40]
-    movd     m3, [r3 + 58]
-    pshufb   m0, [shuf_mode_16_20]
-    pshufb   m1, [shuf_mode_16_20]
-    pshufb   m2, [shuf_mode_16_20]
-    pshufb   m3, [shuf_mode_16_20]
-    movu     [rsp + 24], m0
-    movu     [rsp + 12], m1
-    movu     [rsp], m2
-    movd     [rsp], m3
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_16_20 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_17_new, 3,6,7,0-(7*mmsize+4)
-    mov      r3, r2mp
-    add      r2, 128
-    movu     m0, [r2 + 0*mmsize]
-    pinsrw   m0, [r3], 0
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 3*mmsize + 2], m0
-    movu     [rsp + 4*mmsize + 2], m1
-    movu     [rsp + 5*mmsize + 2], m2
-    movu     [rsp + 6*mmsize + 2], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 114], r4w
-    movu     m0, [r3 + 8]
-    movu     m1, [r3 + 30]
-    movu     m2, [r3 + 50]
-    movd     m3, [r3 + 2]
-    pshufb   m0, [shuf_mode_17_19]
-    pshufb   m1, [shuf_mode_17_19]
-    pshufb   m2, [shuf_mode_17_19]
-    pshufb   m3, [shuf_mode_16_20]
-    movd     [rsp + 46], m3
-    movu     [rsp + 30], m0
-    movu     [rsp + 12], m1
-    movu     [rsp - 4], m2
-    mov      r4w, [r3 + 24]
-    mov      [rsp + 30], r4w
-    mov      r4w, [r3 + 28]
-    mov      [rsp + 28], r4w
-    mov      r4w, [r3 + 46]
-    mov      [rsp + 12], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-
-.loop:
-    MODE_17_19 1
-    lea      r0, [r0 + r1 * 4 ]
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_18_new, 3,7,8
-    mov      r3, r2mp
-    add      r2, 128
-    movu        m0, [r3]               ; [7 6 5 4 3 2 1 0]
-    movu        m1, [r3 + 16]          ; [15 14 13 12 11 10 9 8]
-    movu        m2, [r3 + 32]          ; [23 22 21 20 19 18 17 16]
-    movu        m3, [r3 + 48]          ; [31 30 29 28 27 26 25 24]
-    movu        m4, [r2 + 2]           ; [8 7 6 5 4 3 2 1]
-    movu        m5, [r2 + 18]          ; [16 15 14 13 12 11 10 9]
-
-    add         r1, r1
-    lea         r6, [r1 * 2]
-    lea         r3, [r1 * 3]
-    lea         r4, [r1 * 4]
-
-    movu        [r0], m0
-    movu        [r0 + 16], m1
-    movu        [r0 + 32], m2
-    movu        [r0 + 48], m3
-
-    pshufb      m4, [shuf_mode32_18]   ; [1 2 3 4 5 6 7 8]
-    pshufb      m5, [shuf_mode32_18]   ; [9 10 11 12 13 14 15 16]
-
-    palignr     m6, m0, m4, 14
-    movu        [r0 + r1], m6
-    palignr     m6, m1, m0, 14
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m2, m1, 14
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m3, m2, 14
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m0, m4, 12
-    movu        [r0 + r6], m6
-    palignr     m6, m1, m0, 12
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m2, m1, 12
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m3, m2, 12
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m0, m4, 10
-    movu        [r0 + r3], m6
-    palignr     m6, m1, m0, 10
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m2, m1, 10
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m3, m2, 10
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    palignr     m6, m0, m4, 8
-    movu        [r0], m6
-    palignr     m6, m1, m0, 8
-    movu        [r0 + 16], m6
-    palignr     m6, m2, m1, 8
-    movu        [r0 + 32], m6
-    palignr     m6, m3, m2, 8
-    movu        [r0 + 48], m6
-
-    palignr     m6, m0, m4, 6
-    movu        [r0 + r1], m6
-    palignr     m6, m1, m0, 6
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m2, m1, 6
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m3, m2, 6
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m0, m4, 4
-    movu        [r0 + r6], m6
-    palignr     m6, m1, m0, 4
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m2, m1, 4
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m3, m2, 4
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m0, m4, 2
-    movu        [r0 + r3], m6
-    palignr     m6, m1, m0, 2
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m2, m1, 2
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m3, m2, 2
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    movu        [r0], m4
-    movu        [r0 + 16], m0
-    movu        [r0 + 32], m1
-    movu        [r0 + 48], m2
-
-    palignr     m6, m4, m5, 14
-    movu        [r0 + r1], m6
-    palignr     m6, m0, m4, 14
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m1, m0, 14
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m2, m1, 14
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m4, m5, 12
-    movu        [r0 + r6], m6
-    palignr     m6, m0, m4, 12
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m1, m0, 12
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m2, m1, 12
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m4, m5, 10
-    movu        [r0 + r3], m6
-    palignr     m6, m0, m4, 10
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m1, m0, 10
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m2, m1, 10
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    palignr     m6, m4, m5, 8
-    movu        [r0], m6
-    palignr     m6, m0, m4, 8
-    movu        [r0 + 16], m6
-    palignr     m6, m1, m0, 8
-    movu        [r0 + 32], m6
-    palignr     m6, m2, m1, 8
-    movu        [r0 + 48], m6
-
-    palignr     m6, m4, m5, 6
-    movu        [r0 + r1], m6
-    palignr     m6, m0, m4, 6
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m1, m0, 6
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m2, m1, 6
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m4, m5, 4
-    movu        [r0 + r6], m6
-    palignr     m6, m0, m4, 4
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m1, m0, 4
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m2, m1, 4
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m4, m5, 2
-    movu        [r0 + r3], m6
-    palignr     m6, m0, m4, 2
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m1, m0, 2
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m2, m1, 2
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    movu        m2, [r2 + 34]
-    movu        m3, [r2 + 50]
-    pshufb      m2, [shuf_mode32_18]
-    pshufb      m3, [shuf_mode32_18]
-
-    movu        [r0], m5
-    movu        [r0 + 16], m4
-    movu        [r0 + 32], m0
-    movu        [r0 + 48], m1
-
-    palignr     m6, m5, m2, 14
-    movu        [r0 + r1], m6
-    palignr     m6, m4, m5, 14
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m0, m4, 14
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m1, m0, 14
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m5, m2, 12
-    movu        [r0 + r6], m6
-    palignr     m6, m4, m5, 12
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m0, m4, 12
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m1, m0, 12
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m5, m2, 10
-    movu        [r0 + r3], m6
-    palignr     m6, m4, m5, 10
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m0, m4, 10
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m1, m0, 10
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    palignr     m6, m5, m2, 8
-    movu        [r0], m6
-    palignr     m6, m4, m5, 8
-    movu        [r0 + 16], m6
-    palignr     m6, m0, m4, 8
-    movu        [r0 + 32], m6
-    palignr     m6, m1, m0, 8
-    movu        [r0 + 48], m6
-
-    palignr     m6, m5, m2, 6
-    movu        [r0 + r1], m6
-    palignr     m6, m4, m5, 6
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m0, m4, 6
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m1, m0, 6
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m5, m2, 4
-    movu        [r0 + r6], m6
-    palignr     m6, m4, m5, 4
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m0, m4, 4
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m1, m0, 4
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m5, m2, 2
-    movu        [r0 + r3], m6
-    palignr     m6, m4, m5, 2
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m0, m4, 2
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m1, m0, 2
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    movu        [r0], m2
-    movu        [r0 + 16], m5
-    movu        [r0 + 32], m4
-    movu        [r0 + 48], m0
-
-    palignr     m6, m2, m3, 14
-    movu        [r0 + r1], m6
-    palignr     m6, m5, m2, 14
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m4, m5, 14
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m0, m4, 14
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m2, m3, 12
-    movu        [r0 + r6], m6
-    palignr     m6, m5, m2, 12
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m4, m5, 12
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m0, m4, 12
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m2, m3, 10
-    movu        [r0 + r3], m6
-    palignr     m6, m5, m2, 10
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m4, m5, 10
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m0, m4, 10
-    movu        [r0 + r3 + 48], m6
-
-    add         r0, r4
-
-    palignr     m6, m2, m3, 8
-    movu        [r0], m6
-    palignr     m6, m5, m2, 8
-    movu        [r0 + 16], m6
-    palignr     m6, m4, m5, 8
-    movu        [r0 + 32], m6
-    palignr     m6, m0, m4, 8
-    movu        [r0 + 48], m6
-
-    palignr     m6, m2, m3, 6
-    movu        [r0 + r1], m6
-    palignr     m6, m5, m2, 6
-    movu        [r0 + r1 + 16], m6
-    palignr     m6, m4, m5, 6
-    movu        [r0 + r1 + 32], m6
-    palignr     m6, m0, m4, 6
-    movu        [r0 + r1 + 48], m6
-
-    palignr     m6, m2, m3, 4
-    movu        [r0 + r6], m6
-    palignr     m6, m5, m2, 4
-    movu        [r0 + r6 + 16], m6
-    palignr     m6, m4, m5, 4
-    movu        [r0 + r6 + 32], m6
-    palignr     m6, m0, m4, 4
-    movu        [r0 + r6 + 48], m6
-
-    palignr     m6, m2, m3, 2
-    movu        [r0 + r3], m6
-    palignr     m6, m5, m2, 2
-    movu        [r0 + r3 + 16], m6
-    palignr     m6, m4, m5, 2
-    movu        [r0 + r3 + 32], m6
-    palignr     m6, m0, m4, 2
-    movu        [r0 + r3 + 48], m6
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_19_new, 3,7,7,0-(7*mmsize+4)
-    lea      r3, [r2 + 128]
-    movu     m0, [r2 + 0*mmsize]
-    movu     m1, [r2 + 1*mmsize]
-    movu     m2, [r2 + 2*mmsize]
-    movu     m3, [r2 + 3*mmsize]
-    movu     [rsp + 3*mmsize + 2], m0
-    movu     [rsp + 4*mmsize + 2], m1
-    movu     [rsp + 5*mmsize + 2], m2
-    movu     [rsp + 6*mmsize + 2], m3
-
-    mov      r4w, [r2 + 64]
-    mov      [rsp + 114], r4w
-    movu     m0, [r3 + 8]
-    movu     m1, [r3 + 30]
-    movu     m2, [r3 + 50]
-    movd     m3, [r3 + 2]
-    pshufb   m0, [shuf_mode_17_19]
-    pshufb   m1, [shuf_mode_17_19]
-    pshufb   m2, [shuf_mode_17_19]
-    pshufb   m3, [shuf_mode_16_20]
-    movd     [rsp + 46], m3
-    movu     [rsp + 30], m0
-    movu     [rsp + 12], m1
-    movu     [rsp - 4], m2
-    mov      r4w, [r3 + 24]
-    mov      [rsp + 30], r4w
-    mov      r4w, [r3 + 28]
-    mov      [rsp + 28], r4w
-    mov      r4w, [r3 + 46]
-    mov      [rsp + 12], r4w
-
-    lea      r3, [ang_table + 16 * 16]
-    mov      r4d, 8
-    mov      r2, rsp
-    add      r1, r1
-    lea      r5, [r1 * 3]
-    mova     m2, [pw_punpcklwd]
-    mov      r6, r0
-
-.loop:
-    MODE_17_19 0
-    add      r6, 8
-    mov      r0, r6
-    add      r2, 8
-    dec      r4
-    jnz     .loop
-    RET
-
 INIT_XMM sse4
 cglobal intra_pred_ang32_20_new, 3,7,7,0-(6*mmsize+10)
     lea      r3, [r2 + 128]


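For reference, the removed intra_pred_ang32_10_new kernel above is the 16bpp pure-horizontal predictor: each of the 32 output rows is a single left-reference sample broadcast across the whole row (the SSE4 code does this with pshufb against c_mode32_10_0, palignr shifts to step through the reference, and four 16-byte stores per row, i.e. 32 16-bit pixels). The following is only a minimal scalar sketch of that computation, written under the assumption that refLeft[0] is the top-left corner and refLeft[1..32] are the 32 left-neighbour samples; the exact reference-buffer layout used by the "_new" primitives is not shown in this hunk, and the function name below is used only for this sketch.

    /* Illustrative scalar sketch only; the asm removed above is the
     * optimized SSE4 version of the same horizontal prediction. */
    #include <stdint.h>

    typedef uint16_t pixel;   /* 16bpp build: one pixel per uint16_t */

    static void ang32_mode10_sketch(pixel* dst, intptr_t dstStride,
                                    const pixel* refLeft)
    {
        for (int y = 0; y < 32; y++)
        {
            pixel v = refLeft[1 + y];           /* left neighbour for this row */
            for (int x = 0; x < 32; x++)
                dst[y * dstStride + x] = v;     /* broadcast across the row */
        }
    }

No boundary smoothing appears here: HEVC only filters the first row/column of the horizontal and vertical modes for blocks smaller than 32x32, so it does not apply to this 32x32 kernel.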