[x265] [PATCH 9 of 9] asm:intra pred planar16 sse2 high bit
dtyx265 at gmail.com
dtyx265 at gmail.com
Fri Mar 6 01:20:02 CET 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1425600109 28800
# Node ID c16a875d913fffbad405f7d5af51700b1a8990bb
# Parent 670625ad427f2cd3d8bc00fa1bcd590708caea9a
asm:intra pred planar16 sse2 high bit
This replaces the C code on processors whose best supported instruction set is SSE2 through SSSE3.
The code is backported from the SSE4 high-bit-depth intra_pred_planar16 implementation.
./test/TestBench --testbench intrapred | grep intra_planar_16x16
intra_planar_16x16 4.90x 2507.48 12282.71
diff -r 670625ad427f -r c16a875d913f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 05 15:53:55 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 16:01:49 2015 -0800
@@ -875,6 +875,7 @@
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2;
p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = x265_intra_pred_planar8_sse2;
+ p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2;
p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2;
ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
diff -r 670625ad427f -r c16a875d913f source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Mar 05 15:53:55 2015 -0800
+++ b/source/common/x86/intrapred16.asm Thu Mar 05 16:01:49 2015 -0800
@@ -492,6 +492,89 @@
INTRA_PRED_PLANAR_8 7
RET
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2 ; x86inc: use 128-bit XMM registers (m0..m7) and the sse2 symbol suffix
+cglobal intra_pred_planar16, 3,3,8 ; 16x16 planar intra prediction, 16-bit pixels; r0 = dst, r1 = dstStride, r2 = srcPix; 3 args, 3 GPRs, 8 XMM regs
+ movu m2, [r2 + 2] ; m2 = above[0..7] (2 bytes per pixel, above[0] follows the first srcPix entry)
+ movu m7, [r2 + 18] ; m7 = above[8..15]
+
+ movd m3, [r2 + 34] ; topRight = above[16]
+ movd m6, [r2 + 98] ; bottomLeft = left[16]
+
+ pshuflw m3, m3, 0 ; copy word 0 into the low four words
+ pshuflw m6, m6, 0 ; copy word 0 into the low four words
+ pshufd m3, m3, 0 ; v_topRight (topRight in all eight words)
+ pshufd m6, m6, 0 ; v_bottomLeft (bottomLeft in all eight words)
+
+ pmullw m4, m3, [multiH] ; (x + 1) * topRight, high half x = 8..15 (multiH is defined elsewhere; presumably 9..16)
+ pmullw m3, [multiL] ; (x + 1) * topRight, low half x = 0..7 (multiL is defined elsewhere; presumably 1..8)
+ pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x] for row y = 0, x = 0..7
+ pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x] for row y = 0, x = 8..15
+ paddw m4, [pw_16] ; add the rounding term (pw_16 is defined elsewhere; presumably 16 in every word)
+ paddw m3, [pw_16] ; same rounding term for the low half
+ paddw m4, m6 ; + (y + 1) * bottomLeft for y = 0
+ paddw m3, m6 ; + (y + 1) * bottomLeft for y = 0
+ paddw m4, m5 ; m4 = row-0 accumulator for x = 8..15 (everything except the left[y] term)
+ paddw m3, m1 ; m3 = row-0 accumulator for x = 0..7 (everything except the left[y] term)
+ psubw m1, m6, m7 ; m1 = bottomLeft - above[8..15]: per-row increment of m4
+ psubw m6, m2 ; m6 = bottomLeft - above[0..7]: per-row increment of m3
+
+ movu m2, [r2 + 66] ; m2 = left[0..7] (above[] is no longer needed in m2)
+ movu m7, [r2 + 82] ; m7 = left[8..15]
+
+%macro INTRA_PRED_PLANAR_16 1 ; parameter 1 = row index y (0..15); emits the code for one 16-pixel output row
+%if (%1 < 4)
+ pshuflw m5, m2, 0x55 * %1 ; rows 0..3: select word y of m2 into the low four words
+ pshufd m5, m5, 0 ; m5 = left[y] in all eight words
+%else
+%if (%1 < 8)
+ pshufhw m5, m2, 0x55 * (%1 - 4) ; rows 4..7: select word y of m2 into the high four words
+ pshufd m5, m5, 0xAA ; broadcast dword 2 (taken from the high half): m5 = left[y] in all words
+%else
+%if (%1 < 12)
+ pshuflw m5, m7, 0x55 * (%1 - 8) ; rows 8..11: select word (y - 8) of m7 into the low four words
+ pshufd m5, m5, 0 ; m5 = left[y] in all eight words
+%else
+ pshufhw m5, m7, 0x55 * (%1 - 12) ; rows 12..15: select word (y - 8) of m7 into the high four words
+ pshufd m5, m5, 0xAA ; broadcast dword 2 (taken from the high half): m5 = left[y] in all words
+%endif
+%endif
+%endif
+%if (%1 > 0)
+ paddw m3, m6 ; step the low-half accumulator from row y - 1 to row y
+ paddw m4, m1 ; step the high-half accumulator from row y - 1 to row y
+ lea r0, [r0 + r1 * 2] ; advance dst by one row; stride is scaled by 2, presumably because dstStride counts 2-byte pixels
+%endif
+ pmullw m0, m5, [pw_planar8_0] ; left[y] * column weight for x = 8..15 (table defined elsewhere; presumably 7..0)
+ pmullw m5, [pw_planar16_0] ; left[y] * column weight for x = 0..7 (table defined elsewhere; presumably 15..8)
+ paddw m0, m4 ; full weighted sum for x = 8..15
+ paddw m5, m3 ; full weighted sum for x = 0..7
+ psraw m5, 5 ; sum >> 5; NOTE(review): signed shift, correct only while sums stay below 0x8000 - confirm for the supported bit depths
+ psraw m0, 5 ; sum >> 5 for the high half, same caveat
+ movu [r0], m5 ; store pixels 0..7 of this row
+ movu [r0 + 16], m0 ; store pixels 8..15 of this row
+%endmacro
+
+ INTRA_PRED_PLANAR_16 0 ; the 16 rows are fully unrolled; row 0 writes at dst without advancing it
+ INTRA_PRED_PLANAR_16 1
+ INTRA_PRED_PLANAR_16 2
+ INTRA_PRED_PLANAR_16 3
+ INTRA_PRED_PLANAR_16 4
+ INTRA_PRED_PLANAR_16 5
+ INTRA_PRED_PLANAR_16 6
+ INTRA_PRED_PLANAR_16 7
+ INTRA_PRED_PLANAR_16 8
+ INTRA_PRED_PLANAR_16 9
+ INTRA_PRED_PLANAR_16 10
+ INTRA_PRED_PLANAR_16 11
+ INTRA_PRED_PLANAR_16 12
+ INTRA_PRED_PLANAR_16 13
+ INTRA_PRED_PLANAR_16 14
+ INTRA_PRED_PLANAR_16 15
+ RET
+
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
;-----------------------------------------------------------------------------------
More information about the x265-devel
mailing list