[x265] [PATCH] asm: 16bpp asm code for intra_pred_ang16 - mode 19 to 33

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Fri Feb 21 10:46:50 CET 2014


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1392975978 -19800
#      Fri Feb 21 15:16:18 2014 +0530
# Node ID 01b0e41600bce25e211c33fbc92059c021944240
# Parent  262d3efc3167a6d5a6f3365f1f3837b4f7a0355a
asm: 16bpp asm code for intra_pred_ang16 - mode 19 to 33

diff -r 262d3efc3167 -r 01b0e41600bc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 21 15:11:08 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 21 15:16:18 2014 +0530
@@ -934,6 +934,21 @@
         SETUP_INTRA_ANG16(16, 16, sse4);
         SETUP_INTRA_ANG16(17, 17, sse4);
         SETUP_INTRA_ANG16(18, 18, sse4);
+        SETUP_INTRA_ANG16(19, 19, sse4);
+        SETUP_INTRA_ANG16(20, 20, sse4);
+        SETUP_INTRA_ANG16(21, 21, sse4);
+        SETUP_INTRA_ANG16(22, 22, sse4);
+        SETUP_INTRA_ANG16(23, 23, sse4);
+        SETUP_INTRA_ANG16(24, 24, sse4);
+        SETUP_INTRA_ANG16(25, 25, sse4);
+        SETUP_INTRA_ANG16(26, 26, sse4);
+        SETUP_INTRA_ANG16(27, 27, sse4);
+        SETUP_INTRA_ANG16(28, 28, sse4);
+        SETUP_INTRA_ANG16(29, 29, sse4);
+        SETUP_INTRA_ANG16(30, 30, sse4);
+        SETUP_INTRA_ANG16(31, 31, sse4);
+        SETUP_INTRA_ANG16(32, 32, sse4);
+        SETUP_INTRA_ANG16(33, 33, sse4);
 
         SETUP_INTRA_ANG32(3,  3,  sse4);
         SETUP_INTRA_ANG32(4,  4,  sse4);
diff -r 262d3efc3167 -r 01b0e41600bc source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Feb 21 15:11:08 2014 +0530
+++ b/source/common/x86/intrapred16.asm	Fri Feb 21 15:16:18 2014 +0530
@@ -9119,6 +9119,3152 @@
 
     RET
 
+cglobal intra_pred_ang16_19, 4,7,8,0-(2*mmsize)      ; 16bpp angular intra prediction, mode 19: stores 16 rows x 16 words at r0
+    add         r1,            r1                     ; r1 *= 2 (presumably sample stride -> byte stride; TODO confirm)
+    lea         r4,            [r1 * 3]               ; r4 = 3 * r1, offset of the 4th row in each group of four stores
+    lea         r6,            [ang_table + 16 * 16]  ; r6 = ang_table + 16 entries of 16 bytes; bracketed numbers below are the entry index used
+    movu        m6,            [r2 + 2]
+    pshufb      m6,            [pw_ang16_16]
+    movu        m5,            [r2 + 12]
+    pshufb      m5,            [pw_ang16_16]
+    punpckhqdq  m5,            m6                     ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
+    mov         [rsp + mmsize],r2                     ; keep the original r2; it is reloaded at the end of each pass
+    lea         r2,            [r2 + 20]              ; the first pass refills m5 from original r2 + 20
+    mov         [rsp],         byte 2                 ; pass counter kept in a stack byte: 2 passes
+
+.loop:                                                ; one pass produces 16 rows x 8 words
+    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    palignr     m6,        m0, m5, 2                  ; m6 = (m0:m5) >> 2 bytes
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 10 * 16]             ; [6]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably 16 per dword, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r6 - 10 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; pack both halves into 8 unsigned words -> output row 0
+
+    palignr     m0,        m3, 12                     ; m0 = (m0:m3) >> 12 bytes: slide the pair window by one dword
+    palignr     m3,        m6, 12                     ; m3 = (m3:m6) >> 12 bytes: pull in the top dword of m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 4 * 16]              ; [12]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 + 2 * 16]              ; [18]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pslldq      m5,        2                          ; shift m5 up one word: its top dword now holds the next (one word lower) pair
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 8 * 16]              ; [24]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 14 * 16]             ; [30]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 12 * 16]             ; [4]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 6 * 16]              ; [10]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r6 - 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6]                      ; [16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 = r0 + 4 rows
+    movu        [r5],            m4                   ; rows 4-7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 6 * 16]              ; [22]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 + 6 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 12 * 16]             ; [28]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 + 12 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 14 * 16]             ; [2]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    movu        m5,        [r2]                       ; refill m5 from r2 (original r2 + 20 in pass 1, original r2 in pass 2)
+    pshufb      m5,        [pw_ang8_17]               ; reorder the refill through the pw_ang8_17 shuffle mask
+
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 8 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 8-11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 2 * 16]              ; [14]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 4 * 16]              ; [20]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 10 * 16]             ; [26]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    pmaddwd     m3,        [r6 - 16 * 16]             ; [0], same window as the previous row; m3/m0 are consumed in place
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r6 - 16 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 12-15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r0,        [r0 + 16]                  ; move the destination to the next 8 words (16 bytes) of each row
+    mov         r2,        [rsp + mmsize]             ; restore the original r2 for the next pass
+    movu        m5,        [r3]                       ; m5 = the 8 words at r3 that this pass started from
+    lea         r3,        [r3 + 16]                  ; advance r3 by 8 words
+    dec         byte [rsp]                            ; count down the passes
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_20, 4,7,8,0-(2*mmsize)      ; 16bpp angular intra prediction, mode 20: stores 16 rows x 16 words at r0
+    add         r1,            r1                     ; r1 *= 2 (presumably sample stride -> byte stride; TODO confirm)
+    lea         r4,            [r1 * 3]               ; r4 = 3 * r1, offset of the 4th row in each group of four stores
+    lea         r6,            [ang_table + 13 * 16]  ; r6 = ang_table + 13 entries of 16 bytes; bracketed numbers below are the entry index used
+    movu        m6,            [r2 + 4]
+    pshufb      m6,            [pw_ang16_16]
+    movu        m5,            [r2 + 16]
+    pshufb      m5,            [pw_ang16_16]
+    punpckhqdq  m5,            m6                     ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
+    mov         [rsp + mmsize],r2                     ; keep the original r2; it is reloaded at the end of each pass
+    lea         r2,            [r2 + 24]              ; the first pass refills m5 from original r2 + 24
+    mov         [rsp],         byte 2                 ; pass counter kept in a stack byte: 2 passes
+
+.loop:                                                ; one pass produces 16 rows x 8 words
+    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    palignr     m6,        m0, m5, 2                  ; m6 = (m0:m5) >> 2 bytes
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 2 * 16]              ; [11]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably 16 per dword, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r6 - 2 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; pack both halves into 8 unsigned words -> output row 0
+
+    palignr     m0,        m3, 12                     ; m0 = (m0:m3) >> 12 bytes: slide the pair window by one dword
+    palignr     m3,        m6, 12                     ; m3 = (m3:m6) >> 12 bytes: pull in the top dword of m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 9 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3                         ; same window as the previous row, different table entry
+    pmaddwd     m6,        [r6 - 12 * 16]             ; [1]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 1 * 16]              ; [12]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    pslldq      m5,        2                          ; shift m5 up one word: its top dword now holds the next (one word lower) pair
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 10 * 16]             ; [23]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 11 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6]                       ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r6]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 11 * 16]             ; [24]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 = r0 + 4 rows
+    movu        [r5],            m4                   ; rows 4-7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 10 * 16]             ; [3]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 - 10 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 1 * 16]              ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 + 1 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 + 12 * 16]             ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 9 * 16]              ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 8-11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 2 * 16]              ; [15]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    movu        m5,        [r2]                       ; refill m5 from r2 (original r2 + 24 in pass 1, original r2 in pass 2)
+    pshufb      m5,        [pw_ang8_16]               ; reorder the refill through the pw_ang8_16 shuffle mask
+
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 13 * 16]             ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 8 * 16]              ; [5]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    pmaddwd     m3,        [r6 + 3 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r6 + 3 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; last row of the pass; m3/m0 are consumed in place
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 12-15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r0,        [r0 + 16]                  ; move the destination to the next 8 words (16 bytes) of each row
+    mov         r2,        [rsp + mmsize]             ; restore the original r2 for the next pass
+    movu        m5,        [r3]                       ; m5 = the 8 words at r3 that this pass started from
+    lea         r3,        [r3 + 16]                  ; advance r3 by 8 words
+    dec         byte [rsp]                            ; count down the passes
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_21, 4,7,8                   ; 16bpp angular intra prediction, mode 21: stores 16 rows x 16 words at r0
+    add         r1,            r1                     ; r1 *= 2 (presumably sample stride -> byte stride; TODO confirm)
+    lea         r4,            [r1 * 3]               ; r4 = 3 * r1, offset of the 4th row in each group of four stores
+    lea         r6,            [ang_table + 15 * 16]  ; r6 = ang_table + 15 entries of 16 bytes; bracketed numbers below are the entry index used
+    movu        m6,            [r2 + 4]
+    pshufb      m6,            [pw_ang8_15]
+    movu        m5,            [r2 + 18]
+    pshufb      m5,            [pw_ang8_15]
+    punpckhqdq  m5,            m6                     ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
+    mov         r2,            2                      ; r2 is not read as a pointer again; reuse it as the pass counter (2 passes)
+
+.loop:                                                ; one pass produces 16 rows x 8 words
+    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    palignr     m6,        m0, m5, 2                  ; m6 = (m0:m5) >> 2 bytes
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6]                       ; [15]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably 16 per dword, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; pack both halves into 8 unsigned words -> output row 0
+
+    palignr     m0,        m3, 12                     ; m0 = (m0:m3) >> 12 bytes: slide the pair window by one dword
+    palignr     m3,        m6, 12                     ; m3 = (m3:m6) >> 12 bytes: pull in the top dword of m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 15 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 15 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3                         ; same window as the previous row, different table entry
+    pmaddwd     m6,        [r6 - 2 * 16]              ; [13]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 13 * 16]             ; [28]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 4 * 16]              ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pslldq      m5,        2                          ; shift m5 up one word: its top dword now holds the next (one word lower) pair
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 11 * 16]             ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 6 * 16]              ; [9]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r6 - 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 9 * 16]              ; [24]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 = r0 + 4 rows
+    movu        [r5],            m4                   ; rows 4-7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 8 * 16]              ; [7]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 - 8 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 7 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 + 7 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 10 * 16]             ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 5 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 8-11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 12 * 16]             ; [3]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 3 * 16]              ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 3 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 14 * 16]             ; [1]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    pmaddwd     m3,        [r6 + 1 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r6 + 1 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; last row of the pass; m3/m0 are consumed in place
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 12-15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r0,        [r0 + 16]                  ; move the destination to the next 8 words (16 bytes) of each row
+    movu        m5,        [r3]                       ; m5 = the 8 words at r3 that this pass started from
+    lea         r3,        [r3 + 16]                  ; advance r3 by 8 words
+    dec         r2                                    ; count down the passes
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_22, 4,7,8                   ; 16bpp angular intra prediction, mode 22: stores 16 rows x 16 words at r0
+    add         r1,            r1                     ; r1 *= 2 (presumably sample stride -> byte stride; TODO confirm)
+    lea         r4,            [r1 * 3]               ; r4 = 3 * r1, offset of the 4th row in each group of four stores
+    lea         r6,            [ang_table + 18 * 16]  ; r6 = ang_table + 18 entries of 16 bytes; bracketed numbers below are the entry index used
+    movu        m6,            [r2]
+    pshufb      m6,            [pw_ang8_14]
+    movu        m5,            [r2 + 20]
+    pshufb      m5,            [pw_ang8_14]
+    punpckhqdq  m5,            m6                     ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
+    mov         r2,            2                      ; r2 is not read as a pointer again; reuse it as the pass counter (2 passes)
+
+.loop:                                                ; one pass produces 16 rows x 8 words
+    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 1 * 16]              ; [19]
+    paddd       m4,        [pd_16]                    ; + pd_16 (presumably 16 per dword, the rounding term for >> 5)
+    psrld       m4,        5
+    mova        m2,        m0
+    pmaddwd     m2,        [r6 + 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; pack both halves into 8 unsigned words -> output row 0
+
+    mova        m2,        m3                         ; same window as the previous row, different table entry
+    pmaddwd     m2,        [r6 - 12 * 16]             ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    palignr     m0,        m3, 12                     ; m0 = (m0:m3) >> 12 bytes: slide the pair window by one dword
+    palignr     m3,        m5, 12                     ; m3 = (m3:m5) >> 12 bytes: pull in the top dword of m5
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 + 7 * 16]              ; [25]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 6 * 16]              ; [12]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; rows 0-3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    pslldq      m5,        2                          ; shift m5 up one word: its top dword now holds the next (one word lower) pair
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 13 * 16]             ; [31]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6]                       ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 13 * 16]             ; [5]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r6 - 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 6 * 16]              ; [24]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 = r0 + 4 rows
+    movu        [r5],            m4                   ; rows 4-7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 7 * 16]              ; [11]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 - 7 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 12 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 + 12 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 1 * 16]              ; [17]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 14 * 16]             ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 8-11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 5 * 16]              ; [23]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 8 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pslldq      m5,        2
+    palignr     m0,        m3, 12
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 11 * 16]             ; [29]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    pmaddwd     m3,        [r6 - 2 * 16]              ; [16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r6 - 2 *16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0                         ; last row of the pass; m3/m0 are consumed in place
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows
+    movu        [r5],            m4                   ; rows 12-15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r0,        [r0 + 16]                  ; move the destination to the next 8 words (16 bytes) of each row
+    movu        m5,        [r3 + 2]                   ; m5 = 8 words starting one word past this pass's r3
+    lea         r3,        [r3 + 16]                  ; advance r3 by 8 words
+    dec         r2                                    ; count down the passes
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_23, 4,7,8                    ; 16x16 angular intra prediction, mode 23, 16-bit samples (x86inc cglobal: 4 args, 7 GPRs, 8 XMM regs)
+    add         r1,            r1                      ; double r1 (row stride) - presumably samples -> bytes for 2-byte pixels
+    lea         r4,            [r1 * 3]                ; r4 = 3 * row stride, addresses the 4th row of each store group
+    lea         r6,            [ang_table + 15 * 16]   ; r6 -> ang_table entry 15; entry f is at r6 + (f - 15) * 16
+    movu        m5,            [r2]                    ; m5/m6: build from the r2 buffer the extra words that are shifted into m3 below
+    pshufb      m5,            [pw_ang16_13]           ; shuffle mask defined outside this chunk
+    movu        m6,            [r2 + 14]
+    pshufb      m6,            [pw_ang8_13]            ; shuffle mask defined outside this chunk
+    pslldq      m6,            2
+    palignr     m5,            m6, 6                   ; m5 = (m5:m6) >> 6 bytes: m6 bytes 6..15 in the low part, m5 bytes 0..5 on top
+    mov         r2,            2                       ; r2 is reused as the pass counter: 2 passes of 8 columns each
+
+.loop:                                                 ; one pass = 16 rows x 8 columns of output
+    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 8 * 16]              ; [23] row 0, columns 0..3: multiply-add each word pair with table entry 23 (presumably the two interpolation weights)
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds 16 per dword)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r6 + 8 * 16]              ; same entry for columns 4..7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 result words (dwords packed with unsigned saturation)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 1 * 16]              ; [14] row 1
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 10 *16]             ; [5] row 2
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m0,        m3, 12                     ; slide the pair window by one sample: m0 drops its top pair and takes m3's top pair at the bottom
+    palignr     m3,        m5, 12                     ; m3 takes the top two words of m5 as its new lowest pair
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 13 * 16]             ; [28] row 3
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; store rows 0..3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 4 * 16]              ; [19] row 4
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 5 * 16]              ; [10] row 5
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 14 * 16]             ; [1] row 6
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r6 - 14 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    pslldq      m5,        2                          ; move the next word of m5 into the top pair
+    palignr     m0,        m3, 12                     ; slide the pair window by one more sample
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 9 * 16]              ; [24] row 7
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; store rows 4..7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6]                       ; [15] row 8
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 9 * 16]              ; [6] row 9
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 - 9 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    pslldq      m5,        2                          ; move the next word of m5 into the top pair
+    palignr     m0,        m3, 12                     ; slide the pair window by one more sample
+    palignr     m3,        m5, 12
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 + 14 * 16]             ; [29] row 10
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 5 * 16]              ; [20] row 11
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; store rows 8..11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 4 * 16]              ; [11] row 12
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 13 * 16]             ; [2] row 13
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pslldq      m5,        2                          ; move the next word of m5 into the top pair
+    palignr     m0,        m3, 12                     ; slide the pair window by one more sample
+    palignr     m3,        m5, 12
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 10 * 16]             ; [25] row 14
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    pmaddwd     m3,        [r6 + 1 * 16]              ; [16] row 15; m3/m0 are consumed in place, no copy needed for the last row
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r6 + 1 *16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; store rows 12..15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r0,        [r0 + 16]                  ; advance output by 16 bytes (8 samples) for the second pass
+    movu        m5,        [r3 + 2]                   ; second pass: shift-in words are the 8 samples at r3 + 2 (read before r3 advances)
+    lea         r3,        [r3 + 16]                  ; advance the reference row by 8 samples
+    dec         r2
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_24, 4,7,8                    ; 16x16 angular intra prediction, mode 24, 16-bit samples (x86inc cglobal: 4 args, 7 GPRs, 8 XMM regs)
+    add         r1,            r1                      ; double r1 (row stride) - presumably samples -> bytes for 2-byte pixels
+    lea         r4,            [r1 * 3]                ; r4 = 3 * row stride, addresses the 4th row of each store group
+    lea         r6,            [ang_table + 16 * 16]   ; r6 -> ang_table entry 16; entry f is at r6 + (f - 16) * 16
+    movu        m5,            [r2]                    ; m5: build from the r2 buffer the extra words that are shifted into m3 below
+    pshufb      m5,            [pw_ang8_12]            ; shuffle mask defined outside this chunk
+    pinsrw      m5,            [r2 + 26], 5            ; word 5 of m5 = the sample at byte offset 26 of the r2 buffer
+    mov         r2,            2                       ; r2 is reused as the pass counter: 2 passes of 8 columns each
+
+.loop:                                                 ; one pass = 16 rows x 8 columns of output
+    movu        m0,        [r3]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r3 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 11 * 16]             ; [27] row 0, columns 0..3: multiply-add each word pair with table entry 27 (presumably the two interpolation weights)
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds 16 per dword)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r6 + 11 * 16]             ; same entry for columns 4..7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 result words (dwords packed with unsigned saturation)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 6 * 16]              ; [22] row 1
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 + 1 *16]              ; [17] row 2
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 4 * 16]              ; [12] row 3
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; store rows 0..3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 - 9 * 16]              ; [7] row 4
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 14 * 16]             ; [2] row 5
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    palignr     m0,        m3, 12                     ; slide the pair window by one sample: m0 drops its top pair and takes m3's top pair at the bottom
+    palignr     m3,        m5, 12                     ; m3 takes the top two words of m5 as its new lowest pair
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 + 13 * 16]             ; [29] row 6
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r6 + 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 8 * 16]              ; [24] row 7
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; store rows 4..7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 3 *16]               ; [19] row 8
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 + 3 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 - 2 * 16]              ; [14] row 9
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r6 - 2 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r6 - 7 * 16]              ; [9] row 10
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 - 12 * 16]             ; [4] row 11
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; store rows 8..11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    pslldq      m5,        2                          ; move the next word of m5 into the top pair
+    palignr     m0,        m3, 12                     ; slide the pair window by one more sample
+    palignr     m3,        m5, 12
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r6 + 15 * 16]             ; [31] row 12
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 15 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r6 + 10 * 16]             ; [26] row 13
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r6 + 5 * 16]              ; [21] row 14
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r6 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    pmaddwd     m3,        [r6]                       ; [16] row 15; m3/m0 are consumed in place, no copy needed for the last row
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r6]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; store rows 12..15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r0,        [r0 + 16]                  ; advance output by 16 bytes (8 samples) for the second pass
+    movu        m5,        [r3 + 2]                   ; second pass: shift-in words are the 8 samples at r3 + 2 (read before r3 advances)
+    lea         r3,        [r3 + 16]                  ; advance the reference row by 8 samples
+    dec         r2
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_25, 3,7,8                    ; 16x16 angular intra prediction, mode 25, 16-bit samples (x86inc cglobal: 3 args auto-loaded, 7 GPRs, 8 XMM regs)
+    mov         r2,        r3mp                       ; load the 4th argument (index 3) into r2; it is the reference row read in the loop
+    lea         r3,        [ang_table + 16 * 16]      ; r3 -> ang_table entry 16; entry f is at r3 + (f - 16) * 16
+    add         r1,        r1                         ; double r1 (row stride) - presumably samples -> bytes for 2-byte pixels
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * row stride, addresses the 4th row of each store group
+    mov         r6,        2                          ; pass counter: 2 passes of 8 columns each
+
+.loop:                                                ; one pass = 16 rows x 8 columns; m3/m0 pairs stay fixed for rows 0..14
+    movu        m0,        [r2]                       ; [7 6 5 4 3 2 1 0]
+    movu        m1,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3,        m0, m1                     ; [4 3 3 2 2 1 1 0]
+    punpckhwd   m0,        m1                         ; [8 7 7 6 6 5 5 4]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 14 * 16]             ; [30] row 0, columns 0..3: multiply-add each word pair with table entry 30 (presumably the two interpolation weights)
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds 16 per dword)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 14 * 16]             ; same entry for columns 4..7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 result words (dwords packed with unsigned saturation)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 12 * 16]             ; [28] row 1
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 10 *16]             ; [26] row 2
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 + 8 * 16]              ; [24] row 3
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; store rows 0..3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 6 * 16]              ; [22] row 4
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 4 * 16]              ; [20] row 5
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 2 * 16]              ; [18] row 6
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 + 2 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3]                       ; [16] row 7
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; store rows 4..7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 2 *16]               ; [14] row 8
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r3 - 2 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 4 * 16]              ; [12] row 9
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r3 - 4 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 6 * 16]              ; [10] row 10
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 - 8 * 16]              ; [8] row 11
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; store rows 8..11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 10 * 16]             ; [6] row 12
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 12 * 16]             ; [4] row 13
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 - 14 * 16]             ; [2] row 14
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        m3,        [r2]                       ; row 15 is an unweighted copy of the 8 reference samples at r2
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; store rows 12..15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m7
+    movu        [r5 + r4],       m3
+
+    lea         r2,        [r2 + 16]                  ; advance the reference row by 8 samples
+    lea         r0,        [r0 + 16]                  ; advance output by 16 bytes (8 samples) for the second pass
+    dec         r6
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_26, 4,5,4                    ; 16x16 intra prediction, mode 26: every row is a copy of 16 reference samples; optional first-column filter (x86inc cglobal: 4 args, 5 GPRs, 4 XMM regs)
+    movu        m0,                 [r3 + 2]            ; [8 7 6 5 4 3 2 1]
+    movu        m3,                 [r3 + 18]           ; [16 15 14 13 12 11 10 9]
+    add         r1,                 r1                  ; double r1 (row stride) - presumably samples -> bytes for 2-byte pixels
+    lea         r4,                 [r1 * 3]            ; r4 = 3 * row stride
+
+    movu        [r0],               m0                  ; rows 0..3: m0 -> columns 0..7, m3 -> columns 8..15
+    movu        [r0 + 16],          m3
+    movu        [r0 + r1],          m0
+    movu        [r0 + r1 + 16],     m3
+    movu        [r0 + r1 * 2],      m0
+    movu        [r0 + r1 * 2 + 16], m3
+    movu        [r0 + r4],          m0
+    movu        [r0 + r4 + 16],     m3
+
+    lea         r3,                 [r0 + r1 *4]        ; r3 is reused as a row pointer from here on; r3 -> row 4
+    movu        [r3],               m0                  ; rows 4..7
+    movu        [r3 + 16],          m3
+    movu        [r3 + r1],          m0
+    movu        [r3 + r1 + 16],     m3
+    movu        [r3 + r1 * 2],      m0
+    movu        [r3 + r1 * 2 + 16], m3
+    movu        [r3 + r4],          m0
+    movu        [r3 + r4 + 16],     m3
+
+    lea         r3,                 [r3 + r1 *4]        ; r3 -> row 8
+    movu        [r3],               m0                  ; rows 8..11
+    movu        [r3 + 16],          m3
+    movu        [r3 + r1],          m0
+    movu        [r3 + r1 + 16],     m3
+    movu        [r3 + r1 * 2],      m0
+    movu        [r3 + r1 * 2 + 16], m3
+    movu        [r3 + r4],          m0
+    movu        [r3 + r4 + 16],     m3
+
+    lea         r3,                 [r3 + r1 *4]        ; r3 -> row 12; it stays there for the filter stores below
+    movu        [r3],               m0                  ; rows 12..15
+    movu        [r3 + 16],          m3
+    movu        [r3 + r1],          m0
+    movu        [r3 + r1 + 16],     m3
+    movu        [r3 + r1 * 2],      m0
+    movu        [r3 + r1 * 2 + 16], m3
+    movu        [r3 + r4],          m0
+    movu        [r3 + r4 + 16],     m3
+
+    cmp         r5m,                byte 0              ; 6th argument (index 5) is the filter flag; zero skips the filter
+    jz         .quit
+
+    ; filter
+
+    pshufb      m0,                 [pw_unpackwdq]      ; broadcast word 0 of m0 (the first copied reference sample) to all 8 words
+    movh        m1,                 [r2]                ; [3 2 1 0]
+    pshufb      m2,                 m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
+    movu        m1,                 [r2 + 2]            ; [8 7 6 5 4 3 2 1]
+    movu        m3,                 [r2 + 18]           ; [16 15 14 13 12 11 10 9]
+    psubw       m1,                 m2                  ; difference of each r2 sample against r2 sample 0
+    psubw       m3,                 m2
+    psraw       m1,                 1                   ; halve the differences (arithmetic shift keeps the sign)
+    psraw       m3,                 1
+    paddw       m3,                 m0                  ; m3 = broadcast sample + half difference, for rows 8..15
+    paddw       m0,                 m1                  ; m0 = the same for rows 0..7
+    pxor        m1,                 m1                  ; m1 = 0, lower clamp bound
+    pmaxsw      m0,                 m1                  ; clamp below at 0
+    pminsw      m0,                 [pw_1023]           ; clamp above at pw_1023 (presumably 1023, the 10-bit maximum)
+    pmaxsw      m3,                 m1
+    pminsw      m3,                 [pw_1023]
+    pextrw      [r0],               m0, 0               ; overwrite only the first sample of each row: rows 0..3
+    pextrw      [r0 + r1],          m0, 1
+    pextrw      [r0 + r1 * 2],      m0, 2
+    pextrw      [r0 + r4],          m0, 3
+    lea         r0,                 [r0 + r1 * 4]       ; r0 -> row 4
+    pextrw      [r0],               m0, 4               ; rows 4..7
+    pextrw      [r0 + r1],          m0, 5
+    pextrw      [r0 + r1 * 2],      m0, 6
+    pextrw      [r0 + r4],          m0, 7
+    lea         r0,                 [r0 + r1 * 4]       ; r0 -> row 8
+    pextrw      [r0],               m3, 0               ; rows 8..11
+    pextrw      [r0 + r1],          m3, 1
+    pextrw      [r0 + r1 * 2],      m3, 2
+    pextrw      [r0 + r4],          m3, 3
+    pextrw      [r3],               m3, 4               ; rows 12..15 via r3, which still points at row 12
+    pextrw      [r3 + r1],          m3, 5
+    pextrw      [r3 + r1 * 2],      m3, 6
+    pextrw      [r3 + r4],          m3, 7
+
+.quit:
+    RET
+
+cglobal intra_pred_ang16_27, 3,7,8                    ; 16x16 angular intra prediction, mode 27, 16-bit samples (x86inc cglobal: 3 args auto-loaded, 7 GPRs, 8 XMM regs)
+    mov         r2,        r3mp                       ; load the 4th argument (index 3) into r2; it is the reference row read in the loop
+    lea         r3,        [ang_table + 16 * 16]      ; r3 -> ang_table entry 16; entry f is at r3 + (f - 16) * 16
+    add         r1,        r1                         ; double r1 (row stride) - presumably samples -> bytes for 2-byte pixels
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * row stride, addresses the 4th row of each store group
+    mov         r6,        2                          ; pass counter: 2 passes of 8 columns each
+
+.loop:                                                ; one pass = 16 rows x 8 columns; m3/m0 pairs stay fixed for rows 0..14
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 4]                   ; [9 8 7 6 5 4 3 2]
+
+    punpcklwd   m3,        m0, m1                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m1                         ; [9 8 8 7 7 6 6 5]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 14 * 16]             ; [2] row 0, columns 0..3: multiply-add each word pair with table entry 2 (presumably the two interpolation weights)
+    paddd       m4,        [pd_16]                    ; add rounding term (pd_16 presumably holds 16 per dword)
+    psrld       m4,        5                          ; >> 5
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 14 * 16]             ; same entry for columns 4..7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; m4 = 8 result words (dwords packed with unsigned saturation)
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 12 * 16]             ; [4] row 1
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 10 *16]             ; [6] row 2
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 - 8 * 16]              ; [8] row 3
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; store rows 0..3
+    movu        [r0 + r1],       m2
+    movu        [r0 + r1 * 2],   m6
+    movu        [r0 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 - 6 * 16]              ; [10] row 4
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 4 * 16]              ; [12] row 5
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 2 * 16]              ; [14] row 6
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 - 2 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3]                       ; [16] row 7
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; store rows 4..7
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 2 *16]               ; [18] row 8
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r3 + 2 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 4 * 16]              ; [20] row 9
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m0
+    pmaddwd     m6,        [r3 + 4 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 6 * 16]              ; [22] row 10
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 + 8 * 16]              ; [24] row 11
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; store rows 8..11
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m6
+    movu        [r5 + r4],       m7
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 10 * 16]             ; [26] row 12
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 12 * 16]             ; [28] row 13
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pmaddwd     m3,        [r3 + 14 * 16]             ; [30] row 14; m3/m0 are consumed in place, last weighted row
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m0,        [r3 + 14 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m3,        m0
+
+    movu        m7,        [r2 + 4]                   ; row 15 is an unweighted copy of the 8 reference samples at r2 + 4
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; store rows 12..15
+    movu        [r5 + r1],       m2
+    movu        [r5 + r1 * 2],   m3
+    movu        [r5 + r4],       m7
+
+    lea         r2,        [r2 + 16]                  ; advance the reference row by 8 samples
+    lea         r0,        [r0 + 16]                  ; advance output by 16 bytes (8 samples) for the second pass
+    dec         r6
+    jnz         .loop
+
+    RET
+
+cglobal intra_pred_ang16_28, 3,7,8                    ; 16x16 angular intra prediction, mode 28, 16-bit samples
+    mov         r2,        r3mp                       ; r2 = argument 3, replacing the loaded argument 2; NOTE(review): presumably the above reference row - confirm against the C prototype
+    lea         r3,        [ang_table + 15 * 16]      ; r3 -> ang_table entry 15 (16 bytes per entry), so [r3 + k * 16] is entry 15 + k
+    add         r1,        r1                         ; r1 *= 2; NOTE(review): presumably stride in samples -> bytes - confirm
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * r1, offset of the fourth row in each group of four
+    mov         r6,        2                          ; loop count: two passes, 8 output columns each
+
+.loop:
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; work on a copy, m3 is reused by later rows
+    pmaddwd     m4,        [r3 - 10 * 16]             ; [5]
+    paddd       m4,        [pd_16]                    ; add pd_16 (defined outside this view; by its name dwords of 16) before the shift
+    psrld       m4,        5                          ; >> 5, giving columns 0-3 as dwords
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 10 * 16]             ; same table entry applied to the pairs for columns 4-7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; saturating pack of both halves into 8 words = one output row
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 5 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3]                       ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 + 5 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; row 0
+    movu        [r0 + r1],       m2                   ; row 1
+    movu        [r0 + r1 * 2],   m6                   ; row 2
+    movu        [r0 + r4],       m7                   ; row 3
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 10 * 16]             ; [25]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 15 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 15 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m6,        [r3 - 12 * 16]             ; [3]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m7,        [r3 - 12 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m7,        [r3 - 7 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; row 4
+    movu        [r5 + r1],       m2                   ; row 5
+    movu        [r5 + r1 * 2],   m6                   ; row 6
+    movu        [r5 + r4],       m7                   ; row 7
+
+    palignr     m4,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m7,        m4                         ; keep the shifted low pairs for rows 9-11
+    pmaddwd     m4,        [r3 - 2 *16]               ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m1,        m6                         ; keep the shifted high pairs for rows 9-10
+    pmaddwd     m6,        [r3 - 2 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m4,        m6
+
+    mova        m2,        m7
+    pmaddwd     m2,        [r3 + 3 * 16]              ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m1
+    pmaddwd     m6,        [r3 + 3 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m7
+    pmaddwd     m6,        [r3 + 8 * 16]              ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m1,        [r3 + 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m7,        [r3 + 13 * 16]             ; [28]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; row 8
+    movu        [r5 + r1],       m2                   ; row 9
+    movu        [r5 + r1 * 2],   m6                   ; row 10
+    movu        [r5 + r4],       m7                   ; row 11
+
+    palignr     m1,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    mova        m4,        m1
+    pmaddwd     m4,        [r3 - 14 * 16]             ; [1]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m5,        m0, 8                      ; [11 10 10 9 9 8 8 7]
+    mova        m0,        m5                         ; m0 becomes a scratch register; it is reloaded at .loop
+    pmaddwd     m0,        [r3 - 14 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m4,        m0
+
+    mova        m2,        m1
+    pmaddwd     m2,        [r3 - 9 * 16]              ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m3,        m5
+    pmaddwd     m3,        [r3 - 9 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m2,        m3
+
+    mova        m7,        m1
+    pmaddwd     m7,        [r3 - 4 * 16]              ; [11]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m3,        m5
+    pmaddwd     m3,        [r3 - 4 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3
+
+    pmaddwd     m1,        [r3 + 1 * 16]              ; [16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    pmaddwd     m5,        [r3 + 1 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m1,        m5
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; row 12
+    movu        [r5 + r1],       m2                   ; row 13
+    movu        [r5 + r1 * 2],   m7                   ; row 14
+    movu        [r5 + r4],       m1                   ; row 15
+
+    lea         r2,        [r2 + 16]                  ; advance the source by 8 samples (16 bytes)
+    lea         r0,        [r0 + 16]                  ; advance the destination by 8 columns (16 bytes)
+    dec         r6
+    jnz         .loop                                 ; run the body a second time for the remaining 8 columns
+
+    RET
+
+cglobal intra_pred_ang16_29, 3,7,8                    ; 16x16 angular intra prediction, mode 29, 16-bit samples
+    mov         r2,        r3mp                       ; r2 = argument 3, replacing the loaded argument 2; NOTE(review): presumably the above reference row - confirm against the C prototype
+    lea         r3,        [ang_table + 17 * 16]      ; r3 -> ang_table entry 17 (16 bytes per entry), so [r3 + k * 16] is entry 17 + k
+    add         r1,        r1                         ; r1 *= 2; NOTE(review): presumably stride in samples -> bytes - confirm
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * r1, offset of the fourth row in each group of four
+    mov         r6,        2                          ; loop count: two passes, 8 output columns each
+
+.loop:
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; work on a copy, m3 is reused by later rows
+    pmaddwd     m4,        [r3 - 8 * 16]              ; [9]
+    paddd       m4,        [pd_16]                    ; add pd_16 (defined outside this view; by its name dwords of 16) before the shift
+    psrld       m4,        5                          ; >> 5, giving columns 0-3 as dwords
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 8 * 16]              ; same table entry applied to the pairs for columns 4-7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; saturating pack of both halves into 8 words = one output row
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 1 * 16]              ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 + 10 * 16]             ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m7,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m7,        [r3 - 13 * 16]             ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; row 0
+    movu        [r0 + r1],       m2                   ; row 1
+    movu        [r0 + r1 * 2],   m6                   ; row 2
+    movu        [r0 + r4],       m7                   ; row 3
+
+    palignr     m4,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m4                         ; keep the shifted low pairs for rows 5-6
+    pmaddwd     m4,        [r3 - 4 * 16]              ; [13]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1                         ; keep the shifted high pairs for rows 5-6
+    pmaddwd     m1,        [r3 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m6
+    pmaddwd     m2,        [r3 + 5 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m7
+    pmaddwd     m1,        [r3 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pmaddwd     m6,        [r3 + 14 * 16]             ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 14 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 - 9 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 - 9 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; row 4
+    movu        [r5 + r1],       m2                   ; row 5
+    movu        [r5 + r1 * 2],   m6                   ; row 6
+    movu        [r5 + r4],       m7                   ; row 7
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    mova        m2,        m4                         ; keep the shifted low pairs for row 9
+    pmaddwd     m4,        [r3]                       ; [17]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    mova        m7,        m1                         ; keep the shifted high pairs for row 9
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m2,        [r3 + 9 * 16]              ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m7,        [r3 + 9 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m2,        m7
+
+    palignr     m6,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m6,        [r3 - 14 * 16]             ; [3]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 5 * 16]             ; [12]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m1,        [r3 - 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; row 8
+    movu        [r5 + r1],       m2                   ; row 9
+    movu        [r5 + r1 * 2],   m6                   ; row 10
+    movu        [r5 + r4],       m7                   ; row 11
+
+    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m2,        m4                         ; keep the shifted low pairs for row 13
+    pmaddwd     m4,        [r3 + 4 * 16]              ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m3,        m1                         ; m3 is no longer needed as source pairs; reuse it for the row 13 high half
+    pmaddwd     m1,        [r3 + 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m2,        [r3 + 13 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m3,        [r3 + 13 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m2,        m3
+
+    mova        m7,        m0                         ; rows 14-15 use the unshifted pairs in m0 and m5
+    pmaddwd     m7,        [r3 - 10 * 16]             ; [7]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m3,        m5
+    pmaddwd     m3,        [r3 - 10 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3
+
+    pmaddwd     m0,        [r3 - 1 * 16]              ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    pmaddwd     m5,        [r3 - 1 * 16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    packusdw    m0,        m5
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; row 12
+    movu        [r5 + r1],       m2                   ; row 13
+    movu        [r5 + r1 * 2],   m7                   ; row 14
+    movu        [r5 + r4],       m0                   ; row 15
+
+    lea         r2,        [r2 + 16]                  ; advance the source by 8 samples (16 bytes)
+    lea         r0,        [r0 + 16]                  ; advance the destination by 8 columns (16 bytes)
+    dec         r6
+    jnz         .loop                                 ; run the body a second time for the remaining 8 columns
+
+    RET
+
+cglobal intra_pred_ang16_30, 3,7,8                    ; 16x16 angular intra prediction, mode 30, 16-bit samples
+    mov         r2,        r3mp                       ; r2 = argument 3, replacing the loaded argument 2; NOTE(review): presumably the above reference row - confirm against the C prototype
+    lea         r3,        [ang_table + 15 * 16]      ; r3 -> ang_table entry 15 (16 bytes per entry), so [r3 + k * 16] is entry 15 + k
+    add         r1,        r1                         ; r1 *= 2; NOTE(review): presumably stride in samples -> bytes - confirm
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * r1, offset of the fourth row in each group of four
+    mov         r6,        2                          ; loop count: two passes, 8 output columns each
+
+.loop:
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; work on a copy, m3 is reused by later rows
+    pmaddwd     m4,        [r3 - 2 * 16]              ; [13]
+    paddd       m4,        [pd_16]                    ; add pd_16 (defined outside this view; by its name dwords of 16) before the shift
+    psrld       m4,        5                          ; >> 5, giving columns 0-3 as dwords
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 - 2 * 16]              ; same table entry applied to the pairs for columns 4-7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; saturating pack of both halves into 8 words = one output row
+
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 + 11 * 16]             ; [26]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m0
+    pmaddwd     m1,        [r3 + 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    palignr     m6,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m7,        m6                         ; keep the shifted low pairs for row 3
+    pmaddwd     m6,        [r3 - 8 * 16]              ; [7]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    pmaddwd     m7,        [r3 + 5 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m1,        [r3 + 5 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; row 0
+    movu        [r0 + r1],       m2                   ; row 1
+    movu        [r0 + r1 * 2],   m6                   ; row 2
+    movu        [r0 + r4],       m7                   ; row 3
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    mova        m6,        m4                         ; keep the shifted low pairs for rows 5-6
+    pmaddwd     m4,        [r3 - 14 * 16]             ; [1]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    mova        m7,        m1                         ; keep the shifted high pairs for rows 5-6
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    mova        m2,        m6
+    pmaddwd     m2,        [r3 - 1 * 16]              ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m7
+    pmaddwd     m1,        [r3 - 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pmaddwd     m6,        [r3 + 12 * 16]             ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 12 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 7 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m1,        [r3 - 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; row 4
+    movu        [r5 + r1],       m2                   ; row 5
+    movu        [r5 + r1 * 2],   m6                   ; row 6
+    movu        [r5 + r4],       m7                   ; row 7
+
+    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m4,        [r3 + 6 * 16]              ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m2,        [r3 + 6 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m0                         ; rows 9-11 use the unshifted pairs in m0 and m5
+    pmaddwd     m2,        [r3 - 13 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m7,        m5
+    pmaddwd     m7,        [r3 - 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m2,        m7
+
+    mova        m6,        m0
+    pmaddwd     m6,        [r3]                       ; [15]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m1,        m5
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    mova        m7,        m0
+    pmaddwd     m7,        [r3 + 13 * 16]             ; [28]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m1,        m5
+    pmaddwd     m1,        [r3 + 13 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; row 8
+    movu        [r5 + r1],       m2                   ; row 9
+    movu        [r5 + r1 * 2],   m6                   ; row 10
+    movu        [r5 + r4],       m7                   ; row 11
+
+    movh        m3,        [r2 + 26]                  ; [16 15 14 13]
+
+    palignr     m4,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m2,        m4                         ; keep the shifted low pairs for row 13
+    pmaddwd     m4,        [r3 - 6 * 16]              ; [9]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m1,        m3, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    mova        m6,        m1                         ; keep the shifted high pairs for row 13 and for the next shift
+    pmaddwd     m1,        [r3 - 6 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m4,        m1
+
+    pmaddwd     m2,        [r3 + 7 * 16]              ; [22]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m1,        m6
+    pmaddwd     m1,        [r3 + 7 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    psrldq      m3,        2                          ; drop sample 13: low words are now [16 15 14]
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    mova        m5,        m7                         ; m5 is no longer needed as source pairs; reuse it for row 15
+    pmaddwd     m7,        [r3 - 12 * 16]             ; [3]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m3,        m6, 4                      ; [15 14 14 13 13 12 12 11]
+    mova        m1,        m3                         ; keep the high pairs for row 15
+    pmaddwd     m3,        [r3 - 12 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3
+
+    pmaddwd     m5,        [r3 + 1 * 16]              ; [16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    pmaddwd     m1,        [r3 + 1 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m5,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; row 12
+    movu        [r5 + r1],       m2                   ; row 13
+    movu        [r5 + r1 * 2],   m7                   ; row 14
+    movu        [r5 + r4],       m5                   ; row 15
+
+    lea         r2,        [r2 + 16]                  ; advance the source by 8 samples (16 bytes)
+    lea         r0,        [r0 + 16]                  ; advance the destination by 8 columns (16 bytes)
+    dec         r6
+    jnz         .loop                                 ; run the body a second time for the remaining 8 columns
+
+    RET
+
+cglobal intra_pred_ang16_31, 3,7,8                    ; 16x16 angular intra prediction, mode 31, 16-bit samples
+    mov         r2,        r3mp                       ; r2 = argument 3, replacing the loaded argument 2; NOTE(review): presumably the above reference row - confirm against the C prototype
+    lea         r3,        [ang_table + 16 * 16]      ; r3 -> ang_table entry 16 (16 bytes per entry), so [r3 + k * 16] is entry 16 + k
+    add         r1,        r1                         ; r1 *= 2; NOTE(review): presumably stride in samples -> bytes - confirm
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * r1, offset of the fourth row in each group of four
+    mov         r6,        2                          ; loop count: two passes, 8 output columns each
+
+.loop:
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3                         ; work on a copy, m3 is reused by later rows
+    pmaddwd     m4,        [r3 + 1 * 16]              ; [17]
+    paddd       m4,        [pd_16]                    ; add pd_16 (defined outside this view; by its name dwords of 16) before the shift
+    psrld       m4,        5                          ; >> 5, giving columns 0-3 as dwords
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 1 * 16]              ; same table entry applied to the pairs for columns 4-7
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; saturating pack of both halves into 8 words = one output row
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m2                         ; keep the shifted low pairs for row 2
+    pmaddwd     m2,        [r3 - 14 * 16]             ; [2]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1                         ; keep the shifted high pairs for row 2
+    pmaddwd     m1,        [r3 - 14 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pmaddwd     m6,        [r3 + 3 * 16]              ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 3 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 - 12 * 16]             ; [4]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 - 12 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; row 0
+    movu        [r0 + r1],       m2                   ; row 1
+    movu        [r0 + r1 * 2],   m6                   ; row 2
+    movu        [r0 + r4],       m7                   ; row 3
+
+    palignr     m4,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m4,        [r3 + 5 * 16]              ; [21]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3 + 5 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m4,        m7
+
+    palignr     m2,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m6,        m2                         ; keep the shifted low pairs for row 6
+    pmaddwd     m2,        [r3 - 10 * 16]             ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m7,        m1                         ; keep the shifted high pairs for row 6
+    pmaddwd     m1,        [r3 - 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pmaddwd     m6,        [r3 + 7 * 16]              ; [23]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 7 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    mova        m7,        m0                         ; row 7 uses the unshifted pairs in m0 and m5
+    pmaddwd     m7,        [r3 - 8 * 16]              ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    mova        m3,        m5                         ; the original m3 pairs are no longer needed; m3 is scratch here
+    pmaddwd     m3,        [r3 - 8 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 -> row 4
+    movu        [r5],            m4                   ; row 4
+    movu        [r5 + r1],       m2                   ; row 5
+    movu        [r5 + r1 * 2],   m6                   ; row 6
+    movu        [r5 + r4],       m7                   ; row 7
+
+    movu        m1,        [r2 + 26]                  ; [20 19 18 17 16 15 14 13]
+    psrldq      m4,        m1, 2                      ; [x 20 19 18 17 16 15 14]
+
+    punpcklwd   m3,        m1, m4                     ; [17 16 16 15 15 14 14 13]
+
+    mova        m4,        m0
+    pmaddwd     m4,        [r3 + 9 * 16]              ; [25]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m5
+    pmaddwd     m2,        [r3 + 9 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    palignr     m2,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m6,        m2                         ; keep the shifted low pairs for row 10
+    pmaddwd     m2,        [r3 - 6 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m7,        m3, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    mova        m1,        m7                         ; keep the shifted high pairs for row 10
+    pmaddwd     m7,        [r3 - 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m2,        m7
+
+    pmaddwd     m6,        [r3 + 11 * 16]             ; [27]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m1,        [r3 + 11 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m6,        m1
+
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3 - 4 * 16]              ; [12]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m3, m5, 8                  ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m1,        [r3 - 4 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 8
+    movu        [r5],            m4                   ; row 8
+    movu        [r5 + r1],       m2                   ; row 9
+    movu        [r5 + r1 * 2],   m6                   ; row 10
+    movu        [r5 + r4],       m7                   ; row 11
+
+    palignr     m4,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m4,        [r3 + 13 * 16]             ; [29]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m3, m5, 8                  ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m2,        [r3 + 13 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    palignr     m2,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m7,        m2                         ; keep the shifted low pairs for row 14
+    pmaddwd     m2,        [r3 - 2 * 16]              ; [14]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m3, m5, 12                 ; [16 15 15 14 14 13 13 12]
+    mova        m0,        m6                         ; m0 is no longer needed as source pairs; it keeps the high pairs for row 14
+    pmaddwd     m6,        [r3 - 2 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    pmaddwd     m7,        [r3 + 15 * 16]             ; [31]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    pmaddwd     m0,        [r3 + 15 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m7,        m0
+
+    pmaddwd     m5,        [r3]                       ; [16]
+    paddd       m5,        [pd_16]
+    psrld       m5,        5
+    pmaddwd     m3,        [r3]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m5,        m3
+
+    lea         r5,              [r5 + r1 * 4]        ; r5 -> row 12
+    movu        [r5],            m4                   ; row 12
+    movu        [r5 + r1],       m2                   ; row 13
+    movu        [r5 + r1 * 2],   m7                   ; row 14
+    movu        [r5 + r4],       m5                   ; row 15
+
+    lea         r2,        [r2 + 16]                  ; advance the source by 8 samples (16 bytes)
+    lea         r0,        [r0 + 16]                  ; advance the destination by 8 columns (16 bytes)
+    dec         r6
+    jnz         .loop                                 ; run the body a second time for the remaining 8 columns
+
+    RET
+
+cglobal intra_pred_ang16_32, 3,7,8                   ; x86inc cglobal: 3 args loaded, 7 GPRs, 8 vector regs; bracketed comments list sample indices high->low, [n] = ang_table row
+    mov         r2,        r3mp                       ; r2 = 4th argument (r3mp): base of the reference samples read below
+    lea         r3,        [ang_table + 18 * 16]      ; r3 = ang_table row 18; other rows are addressed as r3 +/- k * 16
+    add         r1,        r1                         ; double r1 (presumably stride in 16-bit samples -> bytes; confirm with caller)
+    lea         r4,        [r1 * 3]                   ; r4 = 3 * r1, offset of the 4th row in each store group
+    mov         r6,        2                          ; loop counter: two passes, each storing 16 rows of 8 words
+
+.loop:
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 3 * 16]              ; [21]
+    paddd       m4,        [pd_16]                    ; add pd_16 before the shift (presumably rounding constant 16 per dword)
+    psrld       m4,        5                          ; >> 5; the same pmaddwd/paddd/psrld pattern repeats for every row half
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 3 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; pack both dword halves into 8 unsigned-saturated words (one output row)
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    mova        m6,        m2                         ; keep a copy: the [31] row reuses these sample pairs
+    pmaddwd     m2,        [r3 - 8 * 16]              ; [10]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m1,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    mova        m7,        m1                         ; keep a copy for the [31] row (upper half)
+    pmaddwd     m1,        [r3 - 8 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m2,        m1
+
+    pmaddwd     m6,        [r3 + 13 * 16]             ; [31]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    pmaddwd     m7,        [r3 + 13 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m7,        [r3 + 2 * 16]              ; [20]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m1,        [r3 + 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    movu        [r0],            m4                   ; row 0
+    movu        [r0 + r1],       m2                   ; row 1
+    movu        [r0 + r1 * 2],   m6                   ; row 2
+    movu        [r0 + r4],       m7                   ; row 3
+
+    palignr     m4,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    mova        m2,        m4                         ; copy reused by the [30] row
+    pmaddwd     m4,        [r3 - 9 * 16]              ; [9]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m7,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m6,        m7                         ; copy reused by the [30] row (upper half)
+    pmaddwd     m7,        [r3 - 9 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m4,        m7
+
+    pmaddwd     m2,        [r3 + 12 * 16]             ; [30]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    pmaddwd     m6,        [r3 + 12 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m0
+    pmaddwd     m6,        [r3 + 1 * 16]              ; [19]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m5
+    pmaddwd     m7,        [r3 + 1 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    movu        m1,        [r2 + 26]                  ; [20 19 18 17 16 15 14 13]
+
+    palignr     m7,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m7,        [r3 - 10 * 16]             ; [8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m3,        m1, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m3,        [r3 - 10 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 = r0 + 4 * r1 (start of the second store group)
+    movu        [r5],            m4                   ; row 4
+    movu        [r5 + r1],       m2                   ; row 5
+    movu        [r5 + r1 * 2],   m6                   ; row 6
+    movu        [r5 + r4],       m7                   ; row 7
+
+    psrldq      m4,        m1, 2                      ; [x 20 19 18 17 16 15 14]
+
+    punpcklwd   m3,        m1, m4                     ; [17 16 16 15 15 14 14 13]
+    punpckhwd   m1,        m4                         ; [x 20 20 19 19 18 18 17]
+
+    palignr     m4,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m4,        [r3 + 11 * 16]             ; [29]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m3, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m2,        [r3 + 11 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    palignr     m2,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m2,        [r3]                       ; [18]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m3, m5, 8                  ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m6,        [r3]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    palignr     m6,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    mova        m7,        m6                         ; copy reused by the [28] row
+    pmaddwd     m6,        [r3 - 11 * 16]             ; [7]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m0,        m3, m5, 12                 ; [16 15 15 14 14 13 13 12] (m0's old contents are not read again in this pass)
+    pmaddwd     m0,        [r3 - 11 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m6,        m0
+
+    pmaddwd     m7,        [r3 + 10 * 16]             ; [28]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m0,        m3, m5, 12                 ; [16 15 15 14 14 13 13 12]
+    pmaddwd     m0,        [r3 + 10 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m7,        m0
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows (third store group)
+    movu        [r5],            m4                   ; row 8
+    movu        [r5 + r1],       m2                   ; row 9
+    movu        [r5 + r1 * 2],   m6                   ; row 10
+    movu        [r5 + r4],       m7                   ; row 11
+
+    mova        m4,        m5
+    pmaddwd     m4,        [r3 - 1 * 16]              ; [17]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m3
+    pmaddwd     m2,        [r3 - 1 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    palignr     m2,        m3, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    mova        m7,        m2                         ; copy reused by the [27] row
+    pmaddwd     m2,        [r3 - 12 * 16]             ; [6]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m1, m3, 4                  ; [18 17 17 16 16 15 15 14]
+    mova        m0,        m6                         ; copy reused by the [27] row (upper half)
+    pmaddwd     m6,        [r3 - 12 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    pmaddwd     m7,        [r3 + 9 * 16]              ; [27]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    pmaddwd     m0,        [r3 + 9 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m7,        m0
+
+    palignr     m0,        m3, m5, 8                  ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m0,        [r3 - 2 * 16]              ; [16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    palignr     m1,        m3, 8                      ; [19 18 18 17 17 16 16 15]
+    pmaddwd     m1,        [r3 - 2 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m0,        m1
+
+    lea         r5,              [r5 + r1 * 4]        ; advance r5 by 4 rows (fourth store group)
+    movu        [r5],            m4                   ; row 12
+    movu        [r5 + r1],       m2                   ; row 13
+    movu        [r5 + r1 * 2],   m7                   ; row 14
+    movu        [r5 + r4],       m0                   ; row 15
+
+    lea         r2,        [r2 + 16]                  ; advance the reference pointer by 16 bytes (8 words)
+    lea         r0,        [r0 + 16]                  ; advance the destination by 16 bytes (next 8 columns)
+    dec         r6
+    jnz         .loop                                 ; second pass handles the remaining 8 columns
+
+    RET
+
+cglobal intra_pred_ang16_33, 3,7,8                   ; x86inc cglobal: 3 args loaded, 7 GPRs, 8 vector regs; bracketed comments list sample indices high->low, [n] = ang_table row
+    mov         r2,        r3mp                       ; r2 = 4th argument (r3mp): base of the reference samples read below
+    lea         r3,        [ang_table + 16 * 16]      ; r3 = ang_table row 16; other rows are addressed as r3 +/- k * 16
+    add         r1,        r1                         ; double r1 (presumably stride in 16-bit samples -> bytes; confirm with caller)
+    lea         r4,              [r1 * 3]             ; r4 = 3 * r1, offset of the 4th row in each store group
+    mov         r6,        2                          ; loop counter: two passes, each storing 16 rows of 8 words
+
+.loop:
+    movu        m0,        [r2 + 2]                   ; [8 7 6 5 4 3 2 1]
+    movu        m1,        [r2 + 18]                  ; [16 15 14 13 12 11 10 9]
+    palignr     m2,        m1, m0, 2                  ; [9 8 7 6 5 4 3 2]
+    psrldq      m4,        m1, 2                      ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd   m3,        m0, m2                     ; [5 4 4 3 3 2 2 1]
+    punpckhwd   m0,        m2                         ; [9 8 8 7 7 6 6 5]
+    punpcklwd   m5,        m1, m4                     ; [13 12 12 11 11 10 10 9]
+    punpckhwd   m1,        m4                         ; [x 16 16 15 15 14 14 13]
+
+    mova        m4,        m3
+    pmaddwd     m4,        [r3 + 10 * 16]             ; [26]
+    paddd       m4,        [pd_16]                    ; add pd_16 before the shift (presumably rounding constant 16 per dword)
+    psrld       m4,        5                          ; >> 5; the same pmaddwd/paddd/psrld pattern repeats for every row half
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 10 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2                         ; pack both dword halves into 8 unsigned-saturated words (one output row)
+
+    palignr     m2,        m0, m3, 4                  ; [6 5 5 4 4 3 3 2]
+    pmaddwd     m2,        [r3 + 4 * 16]              ; [20]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m6,        [r3 + 4 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    palignr     m6,        m0, m3, 8                  ; [7 6 6 5 5 4 4 3]
+    pmaddwd     m6,        [r3 - 2 * 16]              ; [14]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3 - 2 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m0, m3, 12                 ; [8 7 7 6 6 5 5 4]
+    pmaddwd     m7,        [r3 - 8 * 16]              ; [ 8]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m3,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m3,        [r3 - 8 * 16]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    packusdw    m7,        m3
+
+    movu        [r0],            m4                   ; row 0
+    movu        [r0 + r1],       m2                   ; row 1
+    movu        [r0 + r1 * 2],   m6                   ; row 2
+    movu        [r0 + r4],       m7                   ; row 3
+
+    mova        m4,        m0
+    pmaddwd     m4,        [r3 - 14 * 16]             ; [ 2]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    mova        m2,        m5
+    pmaddwd     m2,        [r3 - 14 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m0
+    pmaddwd     m2,        [r3 + 12 * 16]             ; [28]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m5
+    pmaddwd     m6,        [r3 + 12 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    palignr     m6,        m5, m0, 4                  ; [10 9 9 8 8 7 7 6]
+    pmaddwd     m6,        [r3 + 6 * 16]              ; [22]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    palignr     m7,        m1, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m7,        [r3 + 6 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m5, m0, 8                  ; [11 10 10 9 9 8 8 7]
+    pmaddwd     m7,        [r3]                       ; [16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m1,        m5, 8                      ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m1,        [r3]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m7,        m1
+
+    lea         r5,              [r0 + r1 * 4]        ; r5 = r0 + 4 * r1 (start of the second store group)
+    movu        [r5],            m4                   ; row 4
+    movu        [r5 + r1],       m2                   ; row 5
+    movu        [r5 + r1 * 2],   m6                   ; row 6
+    movu        [r5 + r4],       m7                   ; row 7
+
+    movu        m1,        [r2 + 26]                  ; [20 19 18 17 16 15 14 13]
+    psrldq      m4,        m1, 2                      ; [x 20 19 18 17 16 15 14]
+
+    punpcklwd   m3,        m1, m4                     ; [17 16 16 15 15 14 14 13]
+    punpckhwd   m1,        m4                         ; [x 20 20 19 19 18 18 17]
+
+    palignr     m4,        m5, m0, 12                 ; [12 11 11 10 10 9 9 8]
+    pmaddwd     m4,        [r3 - 6 * 16]              ; [10]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m3, m5, 12                 ; [16 15 15 14 14 13 13 12]
+    pmaddwd     m2,        [r3 - 6 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    mova        m2,        m5
+    pmaddwd     m2,        [r3 - 12 * 16]             ; [4]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    mova        m6,        m3
+    pmaddwd     m6,        [r3 - 12 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    mova        m6,        m5
+    pmaddwd     m6,        [r3 + 14 * 16]             ; [30]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    mova        m7,        m3
+    pmaddwd     m7,        [r3 + 14 * 16]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    packusdw    m6,        m7
+
+    palignr     m7,        m3, m5, 4                  ; [14 13 13 12 12 11 11 10]
+    pmaddwd     m7,        [r3 + 8 * 16]              ; [24]
+    paddd       m7,        [pd_16]
+    psrld       m7,        5
+    palignr     m0,        m1, m3, 4                  ; [18 17 17 16 16 15 15 14]
+    pmaddwd     m0,        [r3 + 8 * 16]
+    paddd       m0,        [pd_16]
+    psrld       m0,        5
+    packusdw    m7,        m0
+
+    lea         r5,                   [r5 + r1 * 4]   ; advance r5 by 4 rows (third store group)
+    movu        [r5],                 m4              ; row 8
+    movu        [r5 + r1],            m2              ; row 9
+    movu        [r5 + r1 * 2],        m6              ; row 10
+    movu        [r5 + r4],            m7              ; row 11
+
+    palignr     m4,        m3, m5, 8                  ; [15 14 14 13 13 12 12 11]
+    pmaddwd     m4,        [r3 + 2 * 16]              ; [18]
+    paddd       m4,        [pd_16]
+    psrld       m4,        5
+    palignr     m2,        m1, m3, 8                  ; [19 18 18 17 17 16 16 15]
+    pmaddwd     m2,        [r3 + 2 * 16]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    packusdw    m4,        m2
+
+    palignr     m2,        m3, m5, 12                 ; [16 15 15 14 14 13 13 12]
+    pmaddwd     m2,        [r3 - 4 * 16]              ; [12]
+    paddd       m2,        [pd_16]
+    psrld       m2,        5
+    palignr     m6,        m1, m3, 12                 ; [20 19 19 18 18 17 17 16]
+    pmaddwd     m6,        [r3 - 4 * 16]
+    paddd       m6,        [pd_16]
+    psrld       m6,        5
+    packusdw    m2,        m6
+
+    pinsrw      m1,        [r2 + 42], 7               ; fill the undefined top word with sample 21: [21 20 20 19 19 18 18 17]
+    pmaddwd     m3,        [r3 - 10 * 16]             ; [6]
+    paddd       m3,        [pd_16]
+    psrld       m3,        5
+    pmaddwd     m1,        [r3 - 10 * 16]
+    paddd       m1,        [pd_16]
+    psrld       m1,        5
+    packusdw    m3,        m1
+
+    movu        m7,        [r2 + 28]                  ; [21 20 19 18 17 16 15 14], stored as-is for row 15 (no pmaddwd; presumably a zero-fraction row)
+
+    lea         r5,                   [r5 + r1 * 4]   ; advance r5 by 4 rows (fourth store group)
+    movu        [r5],                 m4              ; row 12
+    movu        [r5 + r1],            m2              ; row 13
+    movu        [r5 + r1 * 2],        m3              ; row 14
+    movu        [r5 + r4],            m7              ; row 15
+
+    lea         r2,        [r2 + 16]                  ; advance the reference pointer by 16 bytes (8 words)
+    lea         r0,        [r0 + 16]                  ; advance the destination by 16 bytes (next 8 columns)
+    dec         r6
+    jnz         .loop                                 ; second pass handles the remaining 8 columns
+
+    RET
+
 %macro MODE_2_34 0
     movu            m0, [r2 + 4]
     movu            m1, [r2 + 20]


More information about the x265-devel mailing list