[x265] [PATCH 5 of 7] asm: intra_pred_ang16_7 improved by ~22% over SSE4

praveen at multicorewareinc.com
Tue Apr 7 14:56:50 CEST 2015


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1428408146 -19800
#      Tue Apr 07 17:32:26 2015 +0530
# Node ID c4a3dbba145d71307be6b6ca8918d1e19db9ad22
# Parent  255b6935884a682ed6b8c8c006588d107aca1dcb
asm: intra_pred_ang16_7 improved by ~22% over SSE4

AVX2:
intra_ang_16x16[ 7]     14.58x   795.95          11608.27

SSE4:
intra_ang_16x16[ 7]     11.54x   1021.72         11793.51
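
(testbench columns: speedup over C, optimized-primitive cycles, C-primitive
cycles; 1021.72 -> 795.95 cycles is a (1021.72 - 795.95) / 1021.72 ~= 22.1%
reduction, the ~22% improvement cited in the subject)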

diff -r 255b6935884a -r c4a3dbba145d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 07 16:32:12 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 07 17:32:26 2015 +0530
@@ -1761,6 +1761,7 @@
         p.cu[BLOCK_8x8].intra_pred[12] = x265_intra_pred_ang8_12_avx2;
         p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2;
         p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
+        p.cu[BLOCK_16x16].intra_pred[7] = x265_intra_pred_ang16_7_avx2;
         p.cu[BLOCK_16x16].intra_pred[8] = x265_intra_pred_ang16_8_avx2;
         p.cu[BLOCK_16x16].intra_pred[9] = x265_intra_pred_ang16_9_avx2;
         p.cu[BLOCK_16x16].intra_pred[11] = x265_intra_pred_ang16_11_avx2;
diff -r 255b6935884a -r c4a3dbba145d source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Tue Apr 07 16:32:12 2015 +0530
+++ b/source/common/x86/intrapred.h	Tue Apr 07 17:32:26 2015 +0530
@@ -233,6 +233,7 @@
 void x265_intra_pred_ang8_12_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang16_7_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r 255b6935884a -r c4a3dbba145d source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Tue Apr 07 16:32:12 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Apr 07 17:32:26 2015 +0530
@@ -188,6 +188,15 @@
                      db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
+ALIGN 32
+c_ang16_mode_7:      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
+                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
+                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+                     db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
+                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
+                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
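+; weights for mode 7 (intraPredAngle = 9): for output row k (0-15),
+; f = ((k + 1) * 9) & 31 and the byte pair is (32 - f, f); each 32-byte
+; entry packs row k in the low lane and row k + 8 in the high lane,
+; matching the two rows produced per ymm register in the kernel below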
 
 ALIGN 32
 c_ang16_mode_30:      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
@@ -12077,6 +12086,113 @@
     RET
 
 
+
+INIT_YMM avx2
+cglobal intra_pred_ang16_7, 3, 5, 12
+    mova              m11, [pw_1024]
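+    ; each output pixel is (ref[i]*(32 - f) + ref[i + 1]*f + 16) >> 5;
+    ; pmaddubsw forms the weighted sum, then pmulhrsw against 1024
+    ; computes (x * 1024 + 0x4000) >> 15, i.e. the (+16) >> 5 rounding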
+
+    movu              xm9, [r2 + 1 + 32]
+    pshufb            xm9, [intra_pred_shuff_0_8]
+    movu              xm10, [r2 + 9 + 32]
+    pshufb            xm10, [intra_pred_shuff_0_8]
+
+    movu              xm7, [r2 + 3  + 32]
+    pshufb            xm7, [intra_pred_shuff_0_8]
+    vinserti128       m9, m9, xm7, 1
+
+    movu              xm8, [r2 + 11 + 32]
+    pshufb            xm8, [intra_pred_shuff_0_8]
+    vinserti128       m10, m10, xm8, 1
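+    ; intra_pred_shuff_0_8 interleaves neighbours as (p0,p1), (p1,p2), ...
+    ; so each pmaddubsw applies one (32 - f, f) pair per output pixel;
+    ; m9 feeds output columns 0-7 and m10 columns 8-15, with the low and
+    ; high lanes holding the reference windows for rows k and k + 8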
+
+    lea               r3, [3 * r1]
+    lea               r4, [c_ang16_mode_7]
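+    ; r3 = 3 * stride, used by the transpose/store macro; r4 walks the
+    ; per-row weight table in mmsize (32-byte) steps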
+
+    pmaddubsw         m0, m9, [r4 + 0 * mmsize]
+    pmulhrsw          m0, m11
+    pmaddubsw         m1, m10, [r4 + 0 * mmsize]
+    pmulhrsw          m1, m11
+    packuswb          m0, m1
+
+    pmaddubsw         m1, m9, [r4 + 1 * mmsize]
+    pmulhrsw          m1, m11
+    pmaddubsw         m2, m10, [r4 + 1 * mmsize]
+    pmulhrsw          m2, m11
+    packuswb          m1, m2
+
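+    ; idx = ((k + 1) * 9) >> 5 grows with k, so a reference window is
+    ; re-loaded one byte further on whenever idx steps for row k (low
+    ; lane) or row k + 8 (high lane)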
+    movu              xm7, [r2 + 4  + 32]
+    pshufb            xm7, [intra_pred_shuff_0_8]
+    vinserti128       m9, m9, xm7, 1
+
+    movu              xm8, [r2 + 12 + 32]
+    pshufb            xm8, [intra_pred_shuff_0_8]
+    vinserti128       m10, m10, xm8, 1
+
+    pmaddubsw         m2, m9, [r4 + 2 * mmsize]
+    pmulhrsw          m2, m11
+    pmaddubsw         m3, m10, [r4 + 2 * mmsize]
+    pmulhrsw          m3, m11
+    packuswb          m2, m3
+
+    movu              xm7, [r2 + 2  + 32]
+    pshufb            xm7, [intra_pred_shuff_0_8]
+    vinserti128       m9, m9, xm7, 0
+
+    movu              xm8, [r2 + 10 + 32]
+    pshufb            xm8, [intra_pred_shuff_0_8]
+    vinserti128       m10, m10, xm8, 0
+
+    pmaddubsw         m3, m9, [r4 + 3 * mmsize]
+    pmulhrsw          m3, m11
+    pmaddubsw         m4, m10, [r4 + 3 * mmsize]
+    pmulhrsw          m4, m11
+    packuswb          m3, m4
+
+    add               r4, 4 * mmsize
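+    ; second half of the weight table: rows 4-7 paired with rows 12-15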
+
+    pmaddubsw         m4, m9, [r4 + 0 * mmsize]
+    pmulhrsw          m4, m11
+    pmaddubsw         m5, m10, [r4 + 0 * mmsize]
+    pmulhrsw          m5, m11
+    packuswb          m4, m5
+
+    pmaddubsw         m5, m9, [r4 + 1 * mmsize]
+    pmulhrsw          m5, m11
+    pmaddubsw         m6, m10, [r4 + 1 * mmsize]
+    pmulhrsw          m6, m11
+    packuswb          m5, m6
+
+    movu              xm7, [r2 + 5  + 32]
+    pshufb            xm7, [intra_pred_shuff_0_8]
+    vinserti128       m9, m9, xm7, 1
+
+    movu              xm8, [r2 + 13 + 32]
+    pshufb            xm8, [intra_pred_shuff_0_8]
+    vinserti128       m10, m10, xm8, 1
+
+    pmaddubsw         m6, m9, [r4 + 2 * mmsize]
+    pmulhrsw          m6, m11
+    pmaddubsw         m7, m10, [r4 + 2 * mmsize]
+    pmulhrsw          m7, m11
+    packuswb          m6, m7
+
+    movu              xm7, [r2 + 3  + 32]
+    pshufb            xm7, [intra_pred_shuff_0_8]
+    vinserti128       m9, m9, xm7, 0
+
+    movu              xm8, [r2 + 11 + 32]
+    pshufb            xm8, [intra_pred_shuff_0_8]
+    vinserti128       m10, m10, xm8, 0
+
+    pmaddubsw         m7, m9, [r4 + 3 * mmsize]
+    pmulhrsw          m7, m11
+    pmaddubsw         m8, m10, [r4 + 3 * mmsize]
+    pmulhrsw          m8, m11
+    packuswb          m7, m8
+
+    ; the block was computed transposed (mode 7 is a horizontal mode);
+    ; transpose back and store
+    INTRA_PRED_TRANS_STORE_16x16
+    RET
+
 INIT_YMM avx2
 cglobal intra_pred_ang16_8, 3, 5, 12
     mova              m11, [pw_1024]
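
For readers following along without the x265 source, here is a minimal
scalar sketch of what this kernel computes. It is a hypothetical reference
helper, not part of the patch; it assumes x265's 8-bit neighbour layout,
where the left column of a 16x16 block starts at srcPix[2 * 16 + 1], and
it drops the unused dirMode/bFilter parameters, since mode 7 never
produces a fract == 0 row:

    #include <stdint.h>

    /* scalar sketch of HEVC angular mode 7 (intraPredAngle = 9) for a
     * 16x16 block; mode 7 is in the horizontal family, so it filters
     * the left reference column and stores the result transposed */
    static void intra_pred_ang16_7_ref(uint8_t* dst, intptr_t dstStride,
                                       const uint8_t* srcPix)
    {
        const uint8_t* left = srcPix + 2 * 16 + 1; /* assumed layout */

        for (int k = 0; k < 16; k++)
        {
            int pos   = (k + 1) * 9;  /* intraPredAngle = 9            */
            int idx   = pos >> 5;     /* integer step along reference  */
            int fract = pos & 31;     /* 5-bit weight, matches the     */
                                      /* c_ang16_mode_7 byte pairs     */
            for (int i = 0; i < 16; i++)
                dst[i * dstStride + k] =
                    (uint8_t)(((32 - fract) * left[i + idx]
                             + fract * left[i + idx + 1] + 16) >> 5);
        }
    }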

