[x265] [PATCH 8 of 9] asm:intra pred planar16 sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Fri Mar 6 01:20:01 CET 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1425599635 28800
# Node ID 670625ad427f2cd3d8bc00fa1bcd590708caea9a
# Parent  2505da40f1a6f63266b98d5d6d81f51602c61a12
asm:intra pred planar16 sse2

This replaces the C code on processors whose best supported instruction set is SSE2 through SSSE3 (i.e. without SSE4).
The code is backported from the SSE4 version of intra_pred_planar16.

./test/TestBench --testbench intrapred | grep intra_planar_16x16
intra_planar_16x16	4.47x 	 2727.49  	 12185.39

diff -r 2505da40f1a6 -r 670625ad427f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 05 15:43:35 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp	Thu Mar 05 15:53:55 2015 -0800
@@ -1219,6 +1219,7 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2;
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = x265_intra_pred_planar8_sse2;
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2;
 
         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
diff -r 2505da40f1a6 -r 670625ad427f source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Mar 05 15:43:35 2015 -0800
+++ b/source/common/x86/intrapred.h	Thu Mar 05 15:53:55 2015 -0800
@@ -37,6 +37,7 @@
 
 void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar16_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r 2505da40f1a6 -r 670625ad427f source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Thu Mar 05 15:43:35 2015 -0800
+++ b/source/common/x86/intrapred8.asm	Thu Mar 05 15:53:55 2015 -0800
@@ -645,6 +645,95 @@
     INTRA_PRED_PLANAR_8 7
     RET
 
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar16(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal intra_pred_planar16, 3,3,8             ; only r0 = dst, r1 = dstStride, r2 = srcPix are used
+    pxor            m0, m0                      ; zero register for byte -> word widening
+    movh            m2, [r2 + 1]
+    punpcklbw       m2, m0                      ; m2 = above[0..7] as words
+    movh            m7, [r2 + 9]
+    punpcklbw       m7, m0                      ; m7 = above[8..15] as words
+
+    movd            m3, [r2 + 17]               ; topRight   = above[16]
+    movd            m6, [r2 + 49]               ; bottomLeft = left[16]
+    pand            m3, [pw_00ff]               ; drop the neighbouring bytes fetched by movd
+    pand            m6, [pw_00ff]               ; (pw_00ff presumably 0x00ff per word - defined elsewhere)
+    pshuflw         m3, m3, 0x00                ; broadcast word 0 over the low four words
+    pshuflw         m6, m6, 0x00
+    pshufd          m3, m3, 0x44                ; v_topRight   (low qword copied to high qword)
+    pshufd          m6, m6, 0x44                ; v_bottomLeft (low qword copied to high qword)
+
+    pmullw          m4, m3, [multiH]            ; (x + 1) * topRight, columns paired with above[8..15]
+    pmullw          m3, [multiL]                ; (x + 1) * topRight, columns paired with above[0..7]
+    pmullw          m1, m2, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x], row 0, columns 0..7
+    pmullw          m5, m7, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x], row 0, columns 8..15
+    paddw           m4, [pw_16]                 ; rounding term applied before the final >> 5
+    paddw           m3, [pw_16]
+    paddw           m4, m6                      ; + bottomLeft (row 0 contribution)
+    paddw           m3, m6
+    paddw           m4, m5                      ; m4 = row 0 accumulator, columns 8..15
+    paddw           m3, m1                      ; m3 = row 0 accumulator, columns 0..7
+    psubw           m1, m6, m7                  ; m1 = bottomLeft - above[8..15], per-row step for m4
+    psubw           m6, m2                      ; m6 = bottomLeft - above[0..7],  per-row step for m3
+
+    movh            m2, [r2 + 33]
+    punpcklbw       m2, m0                      ; m2 = left[0..7] as words (above[] no longer needed)
+    movh            m7, [r2 + 41]
+    punpcklbw       m7, m0                      ; m7 = left[8..15] as words
+
+%macro INTRA_PRED_PLANAR_16 1                   ; emit row %1 (0..15); uses m1-m4, m6, m7; clobbers m0, m5
+%if (%1 < 4)
+    pshuflw         m5, m2, 0x55 * %1           ; 0x55 * k selects word k in all four low slots
+    pshufd          m5, m5, 0                   ; m5 = left[y] broadcast to all eight words
+%else
+%if (%1 < 8)
+    pshufhw         m5, m2, 0x55 * (%1 - 4)     ; same selection within the high four words
+    pshufd          m5, m5, 0xAA                ; broadcast dword 2 (taken from the high half)
+%else
+%if (%1 < 12)
+    pshuflw         m5, m7, 0x55 * (%1 - 8)     ; rows 8..11 come from the low half of m7
+    pshufd          m5, m5, 0
+%else
+    pshufhw         m5, m7, 0x55 * (%1 - 12)    ; rows 12..15 come from the high half of m7
+    pshufd          m5, m5, 0xAA
+%endif
+%endif
+%endif
+%if (%1 > 0)
+    paddw           m3, m6                      ; advance accumulators by one row: += bottomLeft - above[x]
+    paddw           m4, m1
+    lea             r0, [r0 + r1]               ; dst += dstStride
+%endif
+    pmullw          m0, m5, [pw_planar8_0]      ; left[y] * column weight, columns 8..15 (table defined elsewhere)
+    pmullw          m5, [pw_planar16_0]         ; left[y] * column weight, columns 0..7  (table defined elsewhere)
+    paddw           m0, m4
+    paddw           m5, m3
+    psraw           m5, 5                       ; >> 5 completes the weighted average
+    psraw           m0, 5
+    packuswb        m5, m0                      ; words -> 16 bytes, columns 0..7 then 8..15
+    movu            [r0], m5                    ; unaligned store of one 16-pixel row
+%endmacro
+
+    INTRA_PRED_PLANAR_16 0
+    INTRA_PRED_PLANAR_16 1
+    INTRA_PRED_PLANAR_16 2
+    INTRA_PRED_PLANAR_16 3
+    INTRA_PRED_PLANAR_16 4
+    INTRA_PRED_PLANAR_16 5
+    INTRA_PRED_PLANAR_16 6
+    INTRA_PRED_PLANAR_16 7
+    INTRA_PRED_PLANAR_16 8
+    INTRA_PRED_PLANAR_16 9
+    INTRA_PRED_PLANAR_16 10
+    INTRA_PRED_PLANAR_16 11
+    INTRA_PRED_PLANAR_16 12
+    INTRA_PRED_PLANAR_16 13
+    INTRA_PRED_PLANAR_16 14
+    INTRA_PRED_PLANAR_16 15
+    RET
+
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 ;---------------------------------------------------------------------------------------------


More information about the x265-devel mailing list