[x265] [PATCH] asm: intra_allangs4x4 improved by ~61% over SSE4

praveen at multicorewareinc.com
Fri Apr 17 14:00:13 CEST 2015


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1429268008 -19800
#      Fri Apr 17 16:23:28 2015 +0530
# Node ID c8ea565afa9a8e7934ada36f76a0bb79f34d59b2
# Parent  7be1172ec816298c32f588908e1b6f0fa214d349
asm: intra_allangs4x4 improved by ~61% over SSE4

AVX2:
intra_allangs4x4        31.17x   1070.01         33353.50

SSE4:
intra_allangs4x4        12.04x   2746.58         33061.69
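
(The three columns appear to be the test bench's speed-up over the C primitive
and the average cycle counts of the optimized and C versions: 31.17 x 1070.01
~ 33353 and 12.04 x 2746.58 ~ 33069, roughly matching the third column. The
~61% in the subject is the per-call cycle reduction relative to SSE4:
(2746.58 - 1070.01) / 2746.58 ~ 0.61.)

For reference, a minimal scalar model of the interpolation that each
pshufb / pmaddubsw / pmulhrsw group in the kernel below performs; the helper
name is hypothetical and the snippet is not part of the patch:

    #include <stdint.h>

    /* HEVC angular interpolation between two neighbouring reference pixels.
     * pmaddubsw applies the (32 - frac, frac) byte weights (they sum to 32);
     * pmulhrsw with pw_1024 (1 << 10) then computes
     * (sum * 1024 + (1 << 14)) >> 15, which equals (sum + 16) >> 5. */
    static inline uint8_t angular_sample(uint8_t a, uint8_t b, int frac)
    {
        int sum = (32 - frac) * a + frac * b;
        return (uint8_t)((sum + 16) >> 5);
    }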

diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 17 16:23:28 2015 +0530
@@ -1909,6 +1909,9 @@
         p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
         p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
 
+        // all_angs primitives
+        p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_avx2;
+
         // copy_sp primitives
         p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/intrapred.h	Fri Apr 17 16:23:28 2015 +0530
@@ -283,4 +283,5 @@
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_32x32_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
+void x265_all_angs_pred_4x4_avx2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 #endif // ifndef X265_INTRAPRED_H
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm	Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/intrapred8_allangs.asm	Fri Apr 17 16:23:28 2015 +0530
@@ -27,6 +27,64 @@
 
 SECTION_RODATA 32
 
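+; all_ang4_shuff: one 32-byte pshufb pattern per prediction mode (a few rows are
+; shared between modes); it gathers, for every predicted sample, the neighbouring
+; reference pixels from the bytes held in m0.
+; all_ang4: the matching (32 - frac, frac) byte weight pairs for pmaddubsw;
+; each pair sums to 32.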
+all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
+                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
+                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
+                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
+                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
+                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
+                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
+                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
+                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
+                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
+                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
+                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
+                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
+                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
+                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
+                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
+                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
+                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
+                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
+
+all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
+          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
+          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
+          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
+          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
+          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
+          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
+          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
+          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
+          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
+          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
+          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
+          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
+          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
+          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
+          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
+          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
+          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
+          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
+          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
+
+
 SECTION .text
 
 ; global constant
@@ -23012,6 +23070,324 @@
     movu       [r0 + 2111 * 16],   m4
     RET
 
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal all_angs_pred_4x4, 4, 4, 6
+
+    mova           m5, [pw_1024]
+    lea            r2, [all_ang4]
+    lea            r3, [all_ang4_shuff]
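+
+; The filtPix (r2) and bLuma (r3) arguments are not used: both registers are
+; immediately repurposed as table pointers, and 4x4 intra prediction never uses
+; the filtered reference.
+; Each mode m in [2, 34] writes its 4x4 prediction (16 bytes) to r0 + (m - 2) * 16;
+; most modes are produced in pairs so that packuswb + vpermq 0xD8 lets a single
+; 32-byte store cover two modes. pmulhrsw with pw_1024 turns the pmaddubsw sum
+; into (sum + 16) >> 5.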
+
+; mode 2
+
+    vbroadcasti128 m0, [r1 + 9]
+    mova           xm1, xm0
+    psrldq         xm1, 1
+    pshufb         xm1, [r3]
+    movu           [r0], xm1
+
+; mode 3
+
+    pshufb         m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m1, [r2]
+    pmulhrsw       m1, m5
+
+; mode 4
+
+    pshufb         m2, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (3 - 2) * 16], m1
+
+; mode 5
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 6
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (5 - 2) * 16], m1
+
+    add            r3, 4 * mmsize
+    add            r2, 4 * mmsize
+
+; mode 7
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 8
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (7 - 2) * 16], m1
+
+; mode 9
+
+    pshufb         m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m1
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (9 - 2) * 16], xm1
+
+; mode 10
+
+    pshufb         xm1, xm0, [r3 + 2 * mmsize]
+    movu           [r0 + (10 - 2) * 16], xm1
+
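+; HEVC boundary filter for the horizontal mode: the samples adjacent to the above
+; row are replaced by left[0] + ((above[x] - topLeft) >> 1), clipped by packuswb,
+; and written at byte offsets 0, 4, 8 and 12 of the mode 10 block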
+    pxor           xm1, xm1
+    movd           xm2, [r1 + 1]
+    pshufd         xm3, xm2, 0
+    punpcklbw      xm3, xm1
+    pinsrb         xm2, [r1], 0
+    pshufb         xm4, xm2, xm1
+    punpcklbw      xm4, xm1
+    psubw          xm3, xm4
+    psraw          xm3, 1
+    pshufb         xm4, xm0, xm1
+    punpcklbw      xm4, xm1
+    paddw          xm3, xm4
+    packuswb       xm3, xm1
+
+    pextrb         [r0 + 128], xm3, 0
+    pextrb         [r0 + 132], xm3, 1
+    pextrb         [r0 + 136], xm3, 2
+    pextrb         [r0 + 140], xm3, 3
+
+; mode 11
+
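+; reload m0 with the full reference array (topLeft at refPix[0], the above row,
+; then the left column from refPix[9]); modes 2-10 above only needed the left
+; neighbours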
+    vbroadcasti128 m0, [r1]
+    pshufb         m1, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 12
+
+    add            r2, 4 * mmsize
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 0 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (11 - 2) * 16], m1
+
+; mode 13
+
+    add            r3, 4 * mmsize
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 14
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 2 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (13 - 2) * 16], m1
+
+; mode 15
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 16
+
+    add            r2, 4 * mmsize
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 0 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (15 - 2) * 16], m1
+
+; mode 17
+
+    add            r3, 4 * mmsize
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m1
+    vpermq         m1, m1, 11011000b
+
+; mode 18
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    vinserti128    m1, m1, xm2, 1
+    movu           [r0 + (17 - 2) * 16], m1
+
+; mode 19
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 20
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (19 - 2) * 16], m1
+
+; mode 21
+
+    add            r2, 4 * mmsize
+    add            r3, 4 * mmsize
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 22
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (21 - 2) * 16], m1
+
+; mode 23
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 24
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (23 - 2) * 16], m1
+
+; mode 25
+
+    add            r2, 4 * mmsize
+
+    pshufb         m1, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m1
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (25 - 2) * 16], xm1
+
+; mode 26
+
+    add            r3, 4 * mmsize
+
+    pshufb         xm1, xm0, [r3 + 0 * mmsize]
+    movu           [r0 + (26 - 2) * 16], xm1
+
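+; HEVC boundary filter for the vertical mode: the samples adjacent to the left
+; reference column are replaced by above[0] + ((left[y] - topLeft) >> 1), clipped
+; by packuswb, and written at byte offsets 0, 4, 8 and 12 of the mode 26 block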
+    pxor           xm1, xm1
+    movd           xm2, [r1 + 9]
+    pshufd         xm3, xm2, 0
+    punpcklbw      xm3, xm1
+    pinsrb         xm4, [r1 + 0], 0
+    pshufb         xm4, xm1
+    punpcklbw      xm4, xm1
+    psubw          xm3, xm4
+    psraw          xm3, 1
+    psrldq         xm2, xm0, 1
+    pshufb         xm2, xm1
+    punpcklbw      xm2, xm1
+    paddw          xm3, xm2
+    packuswb       xm3, xm1
+
+    pextrb       [r0 + 384], xm3, 0
+    pextrb       [r0 + 388], xm3, 1
+    pextrb       [r0 + 392], xm3, 2
+    pextrb       [r0 + 396], xm3, 3
+
+; mode 27
+
+    pshufb        m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw     m1, [r2 + 1 * mmsize]
+    pmulhrsw      m1, m5
+
+; mode 28
+
+    pshufb        m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw     m2, [r2 + 2 * mmsize]
+    pmulhrsw      m2, m5
+    packuswb      m1, m2
+    vpermq        m1, m1, 11011000b
+    movu          [r0 + (27 - 2) * 16], m1
+
+; mode 29
+
+    pshufb        m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw     m1, [r2 + 3 * mmsize]
+    pmulhrsw      m1, m5
+
+; mode 30
+
+    add           r2, 4 * mmsize
+
+    pshufb        m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw     m2, [r2 + 0 * mmsize]
+    pmulhrsw      m2, m5
+    packuswb      m1, m2
+    vpermq        m1, m1, 11011000b
+    movu          [r0 + (29 - 2) * 16], m1
+
+; mode 31
+
+    add           r3, 4 * mmsize
+
+    pshufb        m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw     m1, [r2 + 1 * mmsize]
+    pmulhrsw      m1, m5
+
+; mode 32
+
+    pshufb        m2, m0, [r3 + 0 * mmsize]
+    pmaddubsw     m2, [r2 + 2 * mmsize]
+    pmulhrsw      m2, m5
+    packuswb      m1, m2
+    vpermq        m1, m1, 11011000b
+    movu          [r0 + (31 - 2) * 16], m1
+
+; mode 33
+
+    pshufb        m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw     m1, [r2 + 3 * mmsize]
+    pmulhrsw      m1, m5
+    packuswb      m1, m2
+    vpermq        m1, m1, 11011000b
+
+; mode 34
+
+    pshufb        m0, [r3 + 2 * mmsize]
+    vinserti128   m1, m1, xm0, 1
+    movu          [r0 + (33 - 2) * 16], m1
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
 ;-----------------------------------------------------------------------------

