[x265] [PATCH] asm: intrapred_angX_4x4 sse2 performance tweaks

dtyx265 at gmail.com dtyx265 at gmail.com
Mon Jun 22 05:53:37 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1434936838 25200
# Node ID b870e819ade1c9f197766318ffa7d96814dbb3cb
# Parent  44b6b2df7016f0129e66d91e9aab03261d02758a
asm: intrapred_angX_4x4 sse2 performance tweaks

Created individual primitives for angles 19-25 and 27-33 to allow
individual tweaking of each angle for about 20% performance improvement

intra_ang_4x4[ 3]	3.66x 	 542.46   	 1986.43
intra_ang_4x4[ 4]	4.21x 	 507.58   	 2135.09
intra_ang_4x4[ 5]	4.16x 	 510.05   	 2119.99
intra_ang_4x4[ 6]	4.43x 	 482.52   	 2135.18
intra_ang_4x4[ 7]	4.09x 	 477.58   	 1955.19
intra_ang_4x4[ 8]	4.53x 	 460.03   	 2085.06
intra_ang_4x4[ 9]	4.51x 	 462.54   	 2084.99
intra_ang_4x4[11]	4.53x 	 480.05   	 2176.00
intra_ang_4x4[12]	4.66x 	 480.00   	 2235.34
intra_ang_4x4[13]	4.24x 	 550.06   	 2330.84
intra_ang_4x4[14]	4.13x 	 567.51   	 2345.12
intra_ang_4x4[15]	4.08x 	 567.53   	 2315.21
intra_ang_4x4[16]	4.17x 	 567.52   	 2365.42
intra_ang_4x4[17]	3.98x 	 610.05   	 2425.51
intra_ang_4x4[19]	3.54x 	 514.99   	 1825.34
intra_ang_4x4[20]	3.88x 	 452.49   	 1755.41
intra_ang_4x4[21]	3.72x 	 452.66   	 1684.99
intra_ang_4x4[22]	3.79x 	 460.04   	 1745.36
intra_ang_4x4[23]	3.65x 	 470.09   	 1715.27
intra_ang_4x4[24]	4.60x 	 362.51   	 1666.24
intra_ang_4x4[25]	4.32x 	 362.62   	 1565.41
intra_ang_4x4[27]	4.24x 	 352.69   	 1496.58
intra_ang_4x4[28]	4.24x 	 352.60   	 1495.93
intra_ang_4x4[29]	3.66x 	 365.34   	 1336.02
intra_ang_4x4[30]	3.96x 	 377.61   	 1495.37
intra_ang_4x4[31]	3.68x 	 420.17   	 1545.37
intra_ang_4x4[32]	3.86x 	 400.19   	 1545.37
intra_ang_4x4[33]	3.12x 	 427.53   	 1335.37

diff -r 44b6b2df7016 -r b870e819ade1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jun 19 16:43:29 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Sun Jun 21 18:33:58 2015 -0700
@@ -2208,21 +2208,21 @@
         p.cu[BLOCK_4x4].intra_pred[16] = PFX(intra_pred_ang4_16_sse2);
         p.cu[BLOCK_4x4].intra_pred[17] = PFX(intra_pred_ang4_17_sse2);
         p.cu[BLOCK_4x4].intra_pred[18] = PFX(intra_pred_ang4_18_sse2);
-        p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_17_sse2);
-        p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_16_sse2);
-        p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_15_sse2);
-        p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_14_sse2);
-        p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_13_sse2);
-        p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_12_sse2);
-        p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_11_sse2);
+        p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_19_sse2);
+        p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_20_sse2);
+        p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_21_sse2);
+        p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_22_sse2);
+        p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
+        p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
+        p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
         p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
-        p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_9_sse2);
-        p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_8_sse2);
-        p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_7_sse2);
-        p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_6_sse2);
-        p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_5_sse2);
-        p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_4_sse2);
-        p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_3_sse2);
+        p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
+        p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
+        p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
+        p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_30_sse2);
+        p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_31_sse2);
+        p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
+        p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
 
         p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_sse2);
 
diff -r 44b6b2df7016 -r b870e819ade1 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Jun 19 16:43:29 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Sun Jun 21 18:33:58 2015 -0700
@@ -1541,6 +1541,30 @@
 
 %endif ; end ARCH_X86_32
 
+%macro STORE_4x4 0
+    movd        [r0], m0
+    psrldq      m0, 4
+    movd        [r0 + r1], m0
+    psrldq      m0, 4
+    movd        [r0 + r1 * 2], m0
+    lea         r1, [r1 * 3]
+    psrldq      m0, 4
+    movd        [r0 + r1], m0
+%endmacro
+
+%macro TRANSPOSE_4x4 0
+    pshufd      m0, m0, 0xD8
+    pshufd      m1, m2, 0xD8
+    pshuflw     m0, m0, 0xD8
+    pshuflw     m1, m1, 0xD8
+    pshufhw     m0, m0, 0xD8
+    pshufhw     m1, m1, 0xD8
+    mova        m2, m0
+    punpckldq   m0, m1
+    punpckhdq   m2, m1
+    packuswb    m0, m2
+%endmacro
+
 ;-----------------------------------------------------------------------------------------
 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
@@ -1563,214 +1587,208 @@
     RET
 
 INIT_XMM sse2
-cglobal intra_pred_ang4_3, 3,5,8
-    mov         r4d, 1
-    cmp         r3m, byte 33
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
-    mova        m1, m0
-    psrldq      m1, 1           ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    mova        m1, m0
-    psrldq      m1, 2           ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    mova        m2, m0
-    psrldq      m2, 4           ; [x x x x x x x x 7 6 6 5 5 4 4 3]
-    mova        m3, m0
-    psrldq      m3, 6           ; [x x x x x x x x 8 7 7 6 6 5 5 4]
-    punpcklqdq  m0, m1
-    punpcklqdq  m2, m3
-
-    lea         r3, [pw_ang_table + 20 * 16]
-    mova        m4, [r3 + 6 * 16]   ; [26]
-    mova        m5, [r3]            ; [20]
-    mova        m6, [r3 - 6 * 16]   ; [14]
-    mova        m7, [r3 - 12 * 16]  ; [ 8]
-    jmp        .do_filter4x4
-
-    ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
-ALIGN 16
-.do_filter4x4:
-    pxor        m1, m1
-    punpckhbw   m3, m0
-    psrlw       m3, 8
-    pmaddwd     m3, m5
-    punpcklbw   m0, m1
-    pmaddwd     m0, m4
+cglobal intra_pred_ang4_3, 3,3,5
+    movh        m3, [r2 + 9]   ; [8 7 6 5 4 3 2 1]
+    punpcklbw   m3, m3
+    psrldq      m3, 1
+    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
+    psrldq      m3, 2
+    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+    psrldq      m3, 2
+    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
+    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]
+
+    pxor        m4, m4
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 20 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 26 * 16]
+    packssdw    m0, m1
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m4
+    pmaddwd     m3, [pw_ang_table + 8 * 16]
+    punpcklbw   m2, m4
+    pmaddwd     m2, [pw_ang_table + 14 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_4, 3,3,5
+    movh        m1, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m1, m1
+    psrldq      m1, 1
+    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
+    psrldq      m1, 2
+    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]
+
+    pxor        m4, m4
+    punpcklbw   m2, m4
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 10 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 21 * 16]
     packssdw    m0, m3
     paddw       m0, [pw_16]
     psraw       m0, 5
-    punpckhbw   m3, m2
-    psrlw       m3, 8
-    pmaddwd     m3, m7
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 20 * 16]
+    pmaddwd     m2, [pw_ang_table + 31 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_5, 3,3,5
+    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m3, m3
+    psrldq      m3, 1
+    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    psrldq      m3, 2
+    mova        m2, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+    psrldq      m3, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]
+
+    pxor        m1, m1
     punpcklbw   m2, m1
-    pmaddwd     m2, m6
+    mova        m4, m2
+    pmaddwd     m4, [pw_ang_table + 2 * 16]
+    punpcklbw   m0, m1
+    pmaddwd     m0, [pw_ang_table + 17 * 16]
+    packssdw    m0, m4
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m1
+    pmaddwd     m3, [pw_ang_table + 4 * 16]
+    pmaddwd     m2, [pw_ang_table + 19 * 16]
     packssdw    m2, m3
     paddw       m2, [pw_16]
     psraw       m2, 5
 
-    ; NOTE: mode 33 doesn't reorder, UNSAFE but I don't use any instruction that affect eflag register before
-    jz         .store
-
-    ; transpose 4x4 c_trans_4x4           db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-    pshufd      m0, m0, 0xD8
-    pshufd      m1, m2, 0xD8
-    pshuflw     m0, m0, 0xD8
-    pshuflw     m1, m1, 0xD8
-    pshufhw     m0, m0, 0xD8
-    pshufhw     m1, m1, 0xD8
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_6, 3,3,4
+    movh        m2, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m2, m2
+    psrldq      m2, 1
+    movh        m0, m2                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m3, m0
+    pmaddwd     m3, [pw_ang_table + 26 * 16]
+    pmaddwd     m0, [pw_ang_table + 13 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m2, m1
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 20 * 16]
+    pmaddwd     m2, [pw_ang_table + 7 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_7, 3,3,5
+    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m3, m3
+    psrldq      m3, 1
+    movh        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    psrldq      m3, 2                   ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m4, m0
     mova        m2, m0
-    punpckldq   m0, m1
-    punpckhdq   m2, m1
-
-.store:
-    packuswb    m0, m2
-    movd        [r0], m0
-    psrldq      m0, 4
-    movd        [r0 + r1], m0
-    psrldq      m0, 4
-    movd        [r0 + r1 * 2], m0
-    lea         r1, [r1 * 3]
-    psrldq      m0, 4
-    movd        [r0 + r1], m0
-    RET
-
-cglobal intra_pred_ang4_4, 3,5,8
-    xor         r4d, r4d
-    inc         r4d
-    cmp         r3m, byte 32
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
+    pmaddwd     m4, [pw_ang_table + 18 * 16]
+    pmaddwd     m0, [pw_ang_table + 9 * 16]
+    packssdw    m0, m4
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m1
+    pmaddwd     m3, [pw_ang_table + 4 * 16]
+    pmaddwd     m2, [pw_ang_table + 27 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_8, 3,3,5
+    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
     punpcklbw   m0, m0
-    psrldq      m0, 1
+    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
     mova        m2, m0
-    psrldq      m2, 2           ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    mova        m1, m0
-    psrldq      m1, 4           ; [x x x x x x x x 7 6 6 5 5 4 4 3]
-    punpcklqdq  m0, m2
-    punpcklqdq  m2, m1
-
-    lea         r3, [pw_ang_table + 18 * 16]
-    mova        m4, [r3 +  3 * 16]  ; [21]
-    mova        m5, [r3 -  8 * 16]  ; [10]
-    mova        m6, [r3 + 13 * 16]  ; [31]
-    mova        m7, [r3 +  2 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_5, 3,5,8
-    xor         r4d, r4d
-    inc         r4d
-    cmp         r3m, byte 31
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
-    punpcklbw   m0, m0          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    psrldq      m0, 1
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 10 * 16]
+    pmaddwd     m0, [pw_ang_table + 5 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 20 * 16]
+    pmaddwd     m2, [pw_ang_table + 15 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_9, 3,3,5
+    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m0, m0
+    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
     mova        m2, m0
-    psrldq      m2, 2           ; [x x x x x x x x 6 5 5 4 4 3 3 2]
     mova        m3, m0
-    psrldq      m3, 4           ; [x x x x x x x x 7 6 6 5 5 4 4 3]
-    punpcklqdq  m0, m2
-    punpcklqdq  m2, m3
-
-    lea         r3, [pw_ang_table + 10 * 16]
-    mova        m4, [r3 +  7 * 16]  ; [17]
-    mova        m5, [r3 -  8 * 16]  ; [ 2]
-    mova        m6, [r3 +  9 * 16]  ; [19]
-    mova        m7, [r3 -  6 * 16]  ; [ 4]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_6, 3,5,8
-    xor         r4d, r4d
-    inc         r4d
-    cmp         r3m, byte 30
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
-    punpcklbw   m0, m0
-    psrldq      m0, 1           ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    mova        m2, m0
-    psrldq      m2, 2           ; [x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]
-    punpcklqdq  m0, m0
-    punpcklqdq  m2, m2
-
-    lea         r3, [pw_ang_table + 19 * 16]
-    mova        m4, [r3 -  6 * 16]  ; [13]
-    mova        m5, [r3 +  7 * 16]  ; [26]
-    mova        m6, [r3 - 12 * 16]  ; [ 7]
-    mova        m7, [r3 +  1 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_7, 3,5,8
-    xor         r4d, r4d
-    inc         r4d
-    cmp         r3m, byte 29
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
-    punpcklbw   m0, m0
-    psrldq      m0, 1
-    mova        m2, m0
-    psrldq      m2, 2           ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    punpcklqdq  m0, m0
-    punpcklqdq  m2, m2
-    movhlps     m2, m0
-
-    lea         r3, [pw_ang_table + 20 * 16]
-    mova        m4, [r3 - 11 * 16]  ; [ 9]
-    mova        m5, [r3 -  2 * 16]  ; [18]
-    mova        m6, [r3 +  7 * 16]  ; [27]
-    mova        m7, [r3 - 16 * 16]  ; [ 4]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_8, 3,5,8
-    xor         r4d, r4d
-    inc         r4d
-    cmp         r3m, byte 28
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
-    punpcklbw   m0, m0
-    psrldq      m0, 1
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    lea         r3, [pw_ang_table + 13 * 16]
-    mova        m4, [r3 -  8 * 16]  ; [ 5]
-    mova        m5, [r3 -  3 * 16]  ; [10]
-    mova        m6, [r3 +  2 * 16]  ; [15]
-    mova        m7, [r3 +  7 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_9, 3,5,8
-    xor         r4d, r4d
-    inc         r4d
-    cmp         r3m, byte 27
-    mov         r3d, 9
-    cmove       r3d, r4d
-
-    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
-    punpcklbw   m0, m0
-    psrldq      m0, 1           ; [x 8 7 6 5 4 3 2]
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    lea         r3, [pw_ang_table + 4 * 16]
-    mova        m4, [r3 -  2 * 16]  ; [ 2]
-    mova        m5, [r3 -  0 * 16]  ; [ 4]
-    mova        m6, [r3 +  2 * 16]  ; [ 6]
-    mova        m7, [r3 +  4 * 16]  ; [ 8]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 4 * 16]
+    pmaddwd     m0, [pw_ang_table + 2 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 8 * 16]
+    pmaddwd     m2, [pw_ang_table + 6 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
 
 cglobal intra_pred_ang4_10, 3,5,4
-    movd        m0, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
+    movd        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
     punpcklbw   m0, m0
     punpcklwd   m0, m0
     pshufd      m1, m0, 1
@@ -1786,7 +1804,7 @@
     ; filter
     pxor        m3, m3
     punpcklbw   m0, m3
-    movh        m1, [r2]                ; [4 3 2 1 0]
+    movh        m1, [r2]                ;[4 3 2 1 0]
     punpcklbw   m1, m3
     pshuflw     m2, m1, 0x00
     psrldq      m1, 2
@@ -1799,8 +1817,491 @@
     movd        [r0], m0
     RET
 
+cglobal intra_pred_ang4_11, 3,3,5
+    movd        m1, [r2 + 9]            ;[4 3 2 1]
+    movh        m0, [r2 - 7]            ;[A x x x x x x x]
+    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
+    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]]
+    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m2, m0
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 28 * 16]
+    pmaddwd     m0, [pw_ang_table + 30 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 24 * 16]
+    pmaddwd     m2, [pw_ang_table + 26 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+    cglobal intra_pred_ang4_12, 3,3,5
+    movd        m1, [r2 + 9]            ;[4 3 2 1]
+    movh        m0, [r2 - 7]            ;[A x x x x x x x]
+    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
+    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
+    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m2, m0
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 22 * 16]
+    pmaddwd     m0, [pw_ang_table + 27 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 12 * 16]
+    pmaddwd     m2, [pw_ang_table + 17 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+    cglobal intra_pred_ang4_24, 3,3,5
+    movd        m1, [r2 + 1]            ;[4 3 2 1]
+    movh        m0, [r2 - 7]            ;[A x x x x x x x]
+    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
+    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
+    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m2, m0
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 22 * 16]
+    pmaddwd     m0, [pw_ang_table + 27 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 12 * 16]
+    pmaddwd     m2, [pw_ang_table + 17 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_13, 3,3,5
+    movd        m1, [r2 - 1]            ;[x x A x]
+    movd        m2, [r2 + 9]           ;[4 3 2 1]
+    movd        m0, [r2 + 3]            ;[x x B x]
+    punpcklbw   m0, m1                  ;[x x x x A B x x]
+    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
+    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
+    punpcklbw   m0, m0
+    psrldq      m0, 1
+    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m4, m0
+    mova        m2, m0
+    pmaddwd     m4, [pw_ang_table + 14 * 16]
+    pmaddwd     m0, [pw_ang_table + 23 * 16]
+    packssdw    m0, m4
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m1
+    pmaddwd     m3, [pw_ang_table + 28 * 16]
+    pmaddwd     m2, [pw_ang_table + 5 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_14, 3,3,4
+    movd        m1, [r2 - 1]            ;[x x A x]
+    movd        m0, [r2 + 1]            ;[x x B x]
+    punpcklbw   m0, m1                  ;[A B x x]
+    movd        m1, [r2 + 9]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
+    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
+    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
+    psrldq      m0, 1
+    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m3, m0
+    pmaddwd     m3, [pw_ang_table + 6 * 16]
+    pmaddwd     m0, [pw_ang_table + 19 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m2, m1
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 12 * 16]
+    pmaddwd     m2, [pw_ang_table + 25 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_15, 3,3,5
+    movd        m0, [r2]                ;[x x x A]
+    movd        m1, [r2 + 2]            ;[x x x B]
+    punpcklbw   m1, m0                  ;[x x A B]
+    movd        m0, [r2 + 3]            ;[x x C x]
+    punpcklwd   m0, m1                  ;[A B C x]
+    movd        m1, [r2 + 9]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
+    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
+    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
+    psrldq      m0, 1
+    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
+    psrldq      m0, 2
+    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m4, m4
+    punpcklbw   m2, m4
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 30 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 15 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 28 * 16]
+    pmaddwd     m2, [pw_ang_table + 13 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_16, 3,3,5
+    movd        m2, [r2]                ;[x x x A]
+    movd        m1, [r2 + 2]            ;[x x x B]
+    punpcklbw   m1, m2                  ;[x x A B]
+    movd        m0, [r2 + 2]            ;[x x C x]
+    punpcklwd   m0, m1                  ;[A B C x]
+    movd        m1, [r2 + 9]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
+    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
+    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
+    psrldq      m0, 1
+    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
+    psrldq      m0, 2
+    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m4, m4
+    punpcklbw   m2, m4
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 22 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 11 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 12 * 16]
+    pmaddwd     m2, [pw_ang_table + 1 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_17, 3,3,5
+    movd        m2, [r2]                ;[x x x A]
+    movd        m3, [r2 + 1]            ;[x x x B]
+    movd        m4, [r2 + 2]            ;[x x x C]
+    movd        m0, [r2 + 4]            ;[x x x D]
+    punpcklbw   m3, m2                  ;[x x A B]
+    punpcklbw   m0, m4                  ;[x x C D]
+    punpcklwd   m0, m3                  ;[A B C D]
+    movd        m1, [r2 + 9]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
+    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
+    psrldq      m0, 1
+    movh        m1, m0                  ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
+    psrldq      m0, 2
+    movh        m2, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
+    psrldq      m0, 2
+    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m4, m4
+    punpcklbw   m3, m4
+    pmaddwd     m3, [pw_ang_table + 12 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 6 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 24 * 16]
+    punpcklbw   m2, m4
+    pmaddwd     m2, [pw_ang_table + 18 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+
+    TRANSPOSE_4x4
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_18, 3,4,2
+    mov         r3d, [r2 + 8]
+    mov         r3b, byte [r2]
+    bswap       r3d
+    movd        m0, r3d
+
+    movd        m1, [r2 + 1]
+    punpckldq   m0, m1
+    lea         r3, [r1 * 3]
+    movd        [r0 + r3], m0
+    psrldq      m0, 1
+    movd        [r0 + r1 * 2], m0
+    psrldq      m0, 1
+    movd        [r0 + r1], m0
+    psrldq      m0, 1
+    movd        [r0], m0
+    RET
+
+cglobal intra_pred_ang4_19, 3,3,5
+    movd        m2, [r2]                ;[x x x A]
+    movd        m3, [r2 + 9]            ;[x x x B]
+    movd        m4, [r2 + 10]           ;[x x x C]
+    movd        m0, [r2 + 12]           ;[x x x D]
+    punpcklbw   m3, m2                  ;[x x A B]
+    punpcklbw   m0, m4                  ;[x x C D]
+    punpcklwd   m0, m3                  ;[A B C D]
+    movd        m1, [r2 + 1]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
+    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
+    psrldq      m0, 1
+    movh        m1, m0                  ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
+    psrldq      m0, 2
+    movh        m2, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
+    psrldq      m0, 2
+    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m4, m4
+    punpcklbw   m3, m4
+    pmaddwd     m3, [pw_ang_table + 12 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 6 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 24 * 16]
+    punpcklbw   m2, m4
+    pmaddwd     m2, [pw_ang_table + 18 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_20, 3,3,5
+    movd        m2, [r2]                ;[x x x A]
+    movd        m1, [r2 + 10]           ;[x x x B]
+    punpcklbw   m1, m2                  ;[x x A B]
+    movd        m0, [r2 + 10]           ;[x x C x]
+    punpcklwd   m0, m1                  ;[A B C x]
+    movd        m1, [r2 + 1]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
+    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
+    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
+    psrldq      m0, 1
+    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
+    psrldq      m0, 2
+    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m4, m4
+    punpcklbw   m2, m4
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 22 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 11 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 12 * 16]
+    pmaddwd     m2, [pw_ang_table + 1 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_21, 3,3,5
+    movd        m0, [r2]                ;[x x x A]
+    movd        m1, [r2 + 10]           ;[x x x B]
+    punpcklbw   m1, m0                  ;[x x A B]
+    movd        m0, [r2 + 11]           ;[x x C x]
+    punpcklwd   m0, m1                  ;[A B C x]
+    movd        m1, [r2 + 1]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
+    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
+    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
+    psrldq      m0, 1
+    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
+    psrldq      m0, 2
+    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m4, m4
+    punpcklbw   m2, m4
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 30 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 15 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 28 * 16]
+    pmaddwd     m2, [pw_ang_table + 13 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_22, 3,3,4
+    movd        m1, [r2 - 1]            ;[x x A x]
+    movd        m0, [r2 + 9]            ;[x x B x]
+    punpcklbw   m0, m1                  ;[A B x x]
+    movd        m1, [r2 + 1]            ;[4 3 2 1]
+    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
+    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
+    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
+    psrldq      m0, 1
+    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m3, m0
+    pmaddwd     m3, [pw_ang_table + 6 * 16]
+    pmaddwd     m0, [pw_ang_table + 19 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m2, m1
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 12 * 16]
+    pmaddwd     m2, [pw_ang_table + 25 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_23, 3,3,5
+    movd        m1, [r2 - 1]            ;[x x A x]
+    movd        m2, [r2 + 1]            ;[4 3 2 1]
+    movd        m0, [r2 + 11]           ;[x x B x]
+    punpcklbw   m0, m1                  ;[x x x x A B x x]
+    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
+    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
+    punpcklbw   m0, m0
+    psrldq      m0, 1
+    mova        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
+    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m4, m0
+    mova        m2, m0
+    pmaddwd     m4, [pw_ang_table + 14 * 16]
+    pmaddwd     m0, [pw_ang_table + 23 * 16]
+    packssdw    m0, m4
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m1
+    pmaddwd     m3, [pw_ang_table + 28 * 16]
+    pmaddwd     m2, [pw_ang_table + 5 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_25, 3,3,5
+    movd        m1, [r2 + 1]            ;[4 3 2 1]
+    movh        m0, [r2 - 7]            ;[A x x x x x x x]
+    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
+    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
+    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m2, m0
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 28 * 16]
+    pmaddwd     m0, [pw_ang_table + 30 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 24 * 16]
+    pmaddwd     m2, [pw_ang_table + 26 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
 cglobal intra_pred_ang4_26, 3,4,4
-    movd        m0, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
+    movd        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
 
     ; store
     movd        [r0], m0
@@ -1838,221 +2339,197 @@
 .quit:
     RET
 
-cglobal intra_pred_ang4_11, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 25
-    mov         r3d, 8
-    cmove       r3d, r4d
-
-    movd        m1, [r2 + r3 + 1]       ;[4 3 2 1]
-    movh        m0, [r2 - 7]            ;[A x x x x x x x]
-    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
-    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]]
-    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]
-    punpcklqdq  m0, m0
+cglobal intra_pred_ang4_27, 3,3,5
+    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m0, m0
+    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
     mova        m2, m0
-
-    lea         r3, [pw_ang_table + 24 * 16]
-
-    mova        m4, [r3 +  6 * 16]  ; [24]
-    mova        m5, [r3 +  4 * 16]  ; [26]
-    mova        m6, [r3 +  2 * 16]  ; [28]
-    mova        m7, [r3 +  0 * 16]  ; [30]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_12, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 24
-    mov         r3d, 8
-    cmove       r3d, r4d
-
-    movd        m1, [r2 + r3 + 1]
-    movh        m0, [r2 - 7]
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 4 * 16]
+    pmaddwd     m0, [pw_ang_table + 2 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 8 * 16]
+    pmaddwd     m2, [pw_ang_table + 6 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_28, 3,3,5
+    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m0, m0
+    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m2, m0
+    mova        m3, m0
+    mova        m4, m2
+    pmaddwd     m3, [pw_ang_table + 10 * 16]
+    pmaddwd     m0, [pw_ang_table + 5 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    pmaddwd     m4, [pw_ang_table + 20 * 16]
+    pmaddwd     m2, [pw_ang_table + 15 * 16]
+    packssdw    m2, m4
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_29, 3,3,5
+    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m3, m3
+    psrldq      m3, 1
+    movh        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    psrldq      m3, 2                   ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m4, m0
+    mova        m2, m0
+    pmaddwd     m4, [pw_ang_table + 18 * 16]
+    pmaddwd     m0, [pw_ang_table + 9 * 16]
+    packssdw    m0, m4
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m1
+    pmaddwd     m3, [pw_ang_table + 4 * 16]
+    pmaddwd     m2, [pw_ang_table + 27 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_30, 3,3,4
+    movh        m2, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m2, m2
+    psrldq      m2, 1
+    movh        m0, m2                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]
+
+    pxor        m1, m1
+    punpcklbw   m0, m1
+    mova        m3, m0
+    pmaddwd     m3, [pw_ang_table + 26 * 16]
+    pmaddwd     m0, [pw_ang_table + 13 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m2, m1
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 20 * 16]
+    pmaddwd     m2, [pw_ang_table + 7 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_31, 3,3,5
+    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
+    punpcklbw   m3, m3
+    psrldq      m3, 1
+    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    psrldq      m3, 2
+    mova        m2, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+    psrldq      m3, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]
+
+    pxor        m1, m1
+    punpcklbw   m2, m1
+    mova        m4, m2
+    pmaddwd     m4, [pw_ang_table + 2 * 16]
+    punpcklbw   m0, m1
+    pmaddwd     m0, [pw_ang_table + 17 * 16]
+    packssdw    m0, m4
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m1
+    pmaddwd     m3, [pw_ang_table + 4 * 16]
+    pmaddwd     m2, [pw_ang_table + 19 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_32, 3,3,5
+    movh        m1, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
     punpcklbw   m1, m1
-    punpcklqdq  m0, m1
-    psrldq      m0, 7
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    lea         r3, [pw_ang_table + 20 * 16]
-    mova        m4, [r3 +  7 * 16]  ; [27]
-    mova        m5, [r3 +  2 * 16]  ; [22]
-    mova        m6, [r3 -  3 * 16]  ; [17]
-    mova        m7, [r3 -  8 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_13, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 23
-    mov         r3d, 8
-    jz          .next
-    xchg        r3d, r4d
-
-.next:
-    movd        m1, [r2 - 1]            ;[x x A x]
-    movd        m2, [r2 + r4 + 1]       ;[4 3 2 1]
-    movd        m0, [r2 + r3 + 3]       ;[x x B x]
-    punpcklbw   m0, m1                  ;[x x x x A B x x]
-    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
-    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
-    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
-    mova        m1, m0
-    psrldq      m0, 3                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
-    psrldq      m1, 1                   ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
-    movh        m2, m0
-    punpcklqdq  m0, m0
-    punpcklqdq  m2, m1
-
-    lea         r3, [pw_ang_table + 21 * 16]
-    mova        m4, [r3 +  2 * 16]  ; [23]
-    mova        m5, [r3 -  7 * 16]  ; [14]
-    mova        m6, [r3 - 16 * 16]  ; [ 5]
-    mova        m7, [r3 +  7 * 16]  ; [28]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_14, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 22
-    mov         r3d, 8
-    jz          .next
-    xchg        r3d, r4d
-
-.next:
-    movd        m1, [r2 - 1]            ;[x x A x]
-    movd        m0, [r2 + r3 + 1]       ;[x x B x]
-    punpcklbw   m0, m1                  ;[A B x x]
-    movd        m1, [r2 + r4 + 1]       ;[4 3 2 1]
-    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
-    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
-    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
-    mova        m2, m0
-    psrldq      m0, 3                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
-    psrldq      m2, 1                   ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
-    punpcklqdq  m0, m0
-    punpcklqdq  m2, m2
-
-    lea         r3, [pw_ang_table + 19 * 16]
-    mova        m4, [r3 +  0 * 16]  ; [19]
-    mova        m5, [r3 - 13 * 16]  ; [ 6]
-    mova        m6, [r3 +  6 * 16]  ; [25]
-    mova        m7, [r3 -  7 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_15, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 21
-    mov         r3d, 8
-    jz          .next
-    xchg        r3d, r4d
-
-.next:
-    movd        m0, [r2]                ;[x x x A]
-    movd        m1, [r2 + r3 + 2]       ;[x x x B]
-    punpcklbw   m1, m0                  ;[x x A B]
-    movd        m0, [r2 + r3 + 3]       ;[x x C x]
-    punpcklwd   m0, m1                  ;[A B C x]
-    movd        m1, [r2 + r4 + 1]       ;[4 3 2 1]
-    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
-    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
-    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
-    psrldq      m0, 1
-    movh        m1, m0
-    psrldq      m0, 2
-    movh        m2, m0
-    psrldq      m0, 2
-    punpcklqdq  m0, m2
-    punpcklqdq  m2, m1
-
-    lea         r3, [pw_ang_table + 23 * 16]
-    mova        m4, [r3 -  8 * 16]  ; [15]
-    mova        m5, [r3 +  7 * 16]  ; [30]
-    mova        m6, [r3 - 10 * 16]  ; [13]
-    mova        m7, [r3 +  5 * 16]  ; [28]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_16, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 20
-    mov         r3d, 8
-    jz          .next
-    xchg        r3d, r4d
-
-.next:
-    movd        m2, [r2]                ;[x x x A]
-    movd        m1, [r2 + r3 + 2]       ;[x x x B]
-    punpcklbw   m1, m2                  ;[x x A B]
-    movh        m0, [r2 + r3 + 2]       ;[x x C x]
-    punpcklwd   m0, m1                  ;[A B C x]
-    movd        m1, [r2 + r4 + 1]       ;[4 3 2 1]
-    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
-    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
-    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
-    psrldq      m0, 1
-    movh        m1, m0
-    psrldq      m0, 2
-    movh        m2, m0
-    psrldq      m0, 2
-    punpcklqdq  m0, m2
-    punpcklqdq  m2, m1
-
-    lea         r3, [pw_ang_table + 19 * 16]
-    mova        m4, [r3 -  8 * 16]  ; [11]
-    mova        m5, [r3 +  3 * 16]  ; [22]
-    mova        m6, [r3 - 18 * 16]  ; [ 1]
-    mova        m7, [r3 -  7 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_17, 3,5,8
-    xor         r4d, r4d
-    cmp         r3m, byte 19
-    mov         r3d, 8
-    jz          .next
-    xchg        r3d, r4d
-
-.next:
-    movd        m2, [r2]                ;[x x x A]
-    movd        m3, [r2 + r3 + 1]       ;[x x x B]
-    movd        m4, [r2 + r3 + 2]       ;[x x x C]
-    movd        m0, [r2 + r3 + 4]       ;[x x x D]
-    punpcklbw   m3, m2                  ;[x x A B]
-    punpcklbw   m0, m4                  ;[x x C D]
-    punpcklwd   m0, m3                  ;[A B C D]
-    movd        m1, [r2 + r4 + 1]       ;[4 3 2 1]
-    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
-    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
-    psrldq      m0, 1
-    movh        m1, m0
-    psrldq      m0, 2
-    movh        m2, m0
-    punpcklqdq  m2, m1
-    psrldq      m0, 2
-    movh        m1, m0
-    psrldq      m0, 2
-    punpcklqdq  m0, m1
-
-    lea         r3, [pw_ang_table + 14 * 16]
-    mova        m4, [r3 -  8 * 16]  ; [ 6]
-    mova        m5, [r3 -  2 * 16]  ; [12]
-    mova        m6, [r3 +  4 * 16]  ; [18]
-    mova        m7, [r3 + 10 * 16]  ; [24]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_18, 3,4,2
-    mov         r3d, [r2 + 8]
-    mov         r3b, byte [r2]
-    bswap       r3d
-    movd        m0, r3d
-
-    movd        m1, [r2 + 1]
-    punpckldq   m0, m1
-    lea         r3, [r1 * 3]
-    movd        [r0 + r3], m0
-    psrldq      m0, 1
-    movd        [r0 + r1 * 2], m0
-    psrldq      m0, 1
-    movd        [r0 + r1], m0
-    psrldq      m0, 1
-    movd        [r0], m0
+    psrldq      m1, 1
+    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
+    psrldq      m1, 2
+    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]
+
+    pxor        m4, m4
+    punpcklbw   m2, m4
+    mova        m3, m2
+    pmaddwd     m3, [pw_ang_table + 10 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 21 * 16]
+    packssdw    m0, m3
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 20 * 16]
+    pmaddwd     m2, [pw_ang_table + 31 * 16]
+    packssdw    m2, m1
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
+    RET
+
+cglobal intra_pred_ang4_33, 3,3,5
+    movh        m3, [r2 + 1]   ; [8 7 6 5 4 3 2 1]
+    punpcklbw   m3, m3
+    psrldq      m3, 1
+    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
+    psrldq      m3, 2
+    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
+    psrldq      m3, 2
+    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
+    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]
+
+    pxor        m4, m4
+    punpcklbw   m1, m4
+    pmaddwd     m1, [pw_ang_table + 20 * 16]
+    punpcklbw   m0, m4
+    pmaddwd     m0, [pw_ang_table + 26 * 16]
+    packssdw    m0, m1
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklbw   m3, m4
+    pmaddwd     m3, [pw_ang_table + 8 * 16]
+    punpcklbw   m2, m4
+    pmaddwd     m2, [pw_ang_table + 14 * 16]
+    packssdw    m2, m3
+    paddw       m2, [pw_16]
+    psraw       m2, 5
+    packuswb    m0, m2
+
+    STORE_4x4
     RET
 
 ;---------------------------------------------------------------------------------------------



More information about the x265-devel mailing list