[x265] [PATCH 6 of 9] asm:intra pred planar8 sse2
dtyx265 at gmail.com
dtyx265 at gmail.com
Fri Mar 6 01:19:59 CET 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1425598408 28800
# Node ID 352b5ea85101e8da0c4a15d5a7c2a303b28e02da
# Parent 9fb1aba51e2ab4ebde4ab65a3cca9db2ed122ec4
asm:intra pred planar8 sse2
This replaces the C code on systems whose processors support only SSE2 through SSSE3.
The code is backported from the SSE4 version of intra_pred_planar8.
64-bit
./test/TestBench --testbench intrapred | grep intra_planar_8x8
intra_planar_8x8 3.34x 997.49 3327.61
32-bit
./test/TestBench --testbench intrapred | grep intra_planar_8x8
intra_planar_8x8 3.90x 1042.49 4062.56
This patch also groups the SSE2 intra_pred_planar8 with the other SSE2 primitives.
diff -r 9fb1aba51e2a -r 352b5ea85101 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 05 14:55:09 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 15:33:28 2015 -0800
@@ -1217,6 +1217,7 @@
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2;
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2;
+ p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = x265_intra_pred_planar8_sse2;
p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
diff -r 9fb1aba51e2a -r 352b5ea85101 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Mar 05 14:55:09 2015 -0800
+++ b/source/common/x86/intrapred.h Thu Mar 05 15:33:28 2015 -0800
@@ -36,6 +36,7 @@
void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r 9fb1aba51e2a -r 352b5ea85101 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Thu Mar 05 14:55:09 2015 -0800
+++ b/source/common/x86/intrapred8.asm Thu Mar 05 15:33:28 2015 -0800
@@ -125,6 +125,7 @@
cextern pw_257
cextern pw_1024
cextern pw_4096
+cextern pw_00ff
cextern pb_unpackbd1
cextern multiL
cextern multiH
@@ -532,66 +533,6 @@
%endrep
RET
-;---------------------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
-;---------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_dc4, 5,5,3
- inc r2
- pxor m0, m0
- movd m1, [r2]
- movd m2, [r2 + 8]
- punpckldq m1, m2
- psadbw m1, m0 ; m1 = sum
-
- test r4d, r4d
-
- pmulhrsw m1, [pw_4096] ; m1 = (sum + 4) / 8
- movd r4d, m1 ; r4d = dc_val
- pshufb m1, m0 ; m1 = byte [dc_val ...]
-
- ; store DC 4x4
- lea r3, [r1 * 3]
- movd [r0], m1
- movd [r0 + r1], m1
- movd [r0 + r1 * 2], m1
- movd [r0 + r3], m1
-
- ; do DC filter
- jz .end
- lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
- add r4d, r3d ; r4d = DC * 3 + 2
- movd m1, r4d
- pshuflw m1, m1, 0 ; m1 = pixDCx3
- pshufd m1, m1, 0
-
- ; filter top
- movd m2, [r2]
- movd m0, [r2 + 9]
- punpckldq m2, m0
- pmovzxbw m2, m2
- paddw m2, m1
- psraw m2, 2
- packuswb m2, m2
- movd [r0], m2 ; overwrite top-left pixel, we will update it later
-
- ; filter top-left
- movzx r4d, byte [r2 + 8]
- add r3d, r4d
- movzx r4d, byte [r2]
- add r3d, r4d
- shr r3d, 2
- mov [r0], r3b
-
- ; filter left
- add r0, r1
- pextrb [r0], m2, 4
- pextrb [r0 + r1], m2, 5
- pextrb [r0 + r1 * 2], m2, 6
-
-.end:
- RET
-
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
@@ -647,6 +588,123 @@
movd [r0 + r1], m1
RET
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2 ; SSE2 variant of the 8x8 planar intra predictor
+cglobal intra_pred_planar8, 3,3,6 ; r0 = dst, r1 = dstStride, r2 = srcPix; xmm registers m0-m5 are used
+ pxor m0, m0 ; m0 = 0, used to zero-extend bytes to words
+ movh m1, [r2 + 1] ; 8 bytes starting at srcPix + 1
+ punpcklbw m1, m0 ; m1 = above[0..7] widened to words
+ movh m2, [r2 + 17] ; 8 bytes starting at srcPix + 17
+ punpcklbw m2, m0 ; m2 = left[0..7] widened to words
+
+ movd m3, [r2 + 9] ; topRight = above[8];
+ movd m4, [r2 + 25] ; bottomLeft = left[8];
+
+ pand m3, [pw_00ff] ; pw_00ff is external; presumably a per-word 0x00ff mask that drops the neighbouring byte - TODO confirm
+ pand m4, [pw_00ff] ; same masking for bottomLeft
+ pshuflw m3, m3, 0x00 ; replicate word 0 across the low four words
+ pshuflw m4, m4, 0x00 ; replicate word 0 across the low four words
+ pshufd m3, m3, 0x44 ; copy the low qword into the high qword: all 8 words of m3 are equal
+ pshufd m4, m4, 0x44 ; all 8 words of m4 are equal
+
+ pmullw m3, [multiL] ; (x + 1) * topRight
+ pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x]
+ paddw m3, [pw_8] ; pw_8 is external; presumably the +8 rounding term for the later >> 4 - TODO confirm
+ paddw m3, m4 ; + bottomLeft
+ paddw m3, m0 ; + weighted above[x]; m3 is the running sum that each row adds its left[] term to
+ psubw m4, m1 ; m4 = bottomLeft - above[x], the amount added to m3 after every row
+
+%macro INTRA_PRED_PLANAR_8 1
+%if (%1 < 4)
+ pshuflw m5, m2, 0x55 * %1 ; replicate the selected word of m2 (rows 0-3) across the low four words
+ pshufd m5, m5, 0 ; broadcast dword 0 to the whole register: every word = left[row]
+%else
+ pshufhw m5, m2, 0x55 * (%1 - 4) ; replicate the selected word of m2 (rows 4-7) across the high four words
+ pshufd m5, m5, 0xAA ; broadcast dword 2 (high half) to the whole register: every word = left[row]
+%endif
+ pmullw m5, [pw_planar8_0] ; per-column weights; pw_planar8_0 is not shown here, presumably (blkSize - 1 - x) - TODO confirm
+ paddw m5, m3 ; add the running sum for this row
+ psraw m5, 4 ; >> 4
+ packuswb m5, m5 ; narrow words to bytes with unsigned saturation
+ movh [r0], m5 ; store one row of 8 pixels
+%if (%1 < 7)
+ paddw m3, m4 ; advance the running sum to the next row
+ lea r0, [r0 + r1] ; dst += dstStride
+%endif
+%endmacro
+
+ INTRA_PRED_PLANAR_8 0 ; rows 0..7 are emitted fully unrolled
+ INTRA_PRED_PLANAR_8 1
+ INTRA_PRED_PLANAR_8 2
+ INTRA_PRED_PLANAR_8 3
+ INTRA_PRED_PLANAR_8 4
+ INTRA_PRED_PLANAR_8 5
+ INTRA_PRED_PLANAR_8 6
+ INTRA_PRED_PLANAR_8 7 ; last row: the macro skips the sum/pointer advance
+ RET
+
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc4, 5,5,3 ; r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter (r3 is reused as scratch)
+ inc r2 ; skip srcPix[0]; all offsets below are relative to srcPix + 1
+ pxor m0, m0 ; m0 = 0: zero operand for psadbw and all-zero shuffle mask for pshufb
+ movd m1, [r2] ; 4 bytes at srcPix + 1
+ movd m2, [r2 + 8] ; 4 bytes at srcPix + 9
+ punpckldq m1, m2 ; both 4-byte groups side by side in the low qword
+ psadbw m1, m0 ; m1 = sum
+
+ test r4d, r4d ; ZF = (bFilter == 0); consumed by the jz below - no instruction in between writes EFLAGS
+
+ pmulhrsw m1, [pw_4096] ; m1 = (sum + 4) / 8
+ movd r4d, m1 ; r4d = dc_val
+ pshufb m1, m0 ; m1 = byte [dc_val ...]
+
+ ; store DC 4x4
+ lea r3, [r1 * 3] ; r3 = 3 * dstStride (lea leaves flags untouched)
+ movd [r0], m1
+ movd [r0 + r1], m1
+ movd [r0 + r1 * 2], m1
+ movd [r0 + r3], m1
+
+ ; do DC filter
+ jz .end ; bFilter == 0: the unfiltered DC block stored above is the result
+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
+ add r4d, r3d ; r4d = DC * 3 + 2
+ movd m1, r4d
+ pshuflw m1, m1, 0 ; m1 = pixDCx3
+ pshufd m1, m1, 0 ; broadcast to all 8 words
+
+ ; filter top
+ movd m2, [r2] ; 4 bytes at srcPix + 1
+ movd m0, [r2 + 9] ; 4 bytes at srcPix + 10
+ punpckldq m2, m0 ; bytes 0..3 from [r2], bytes 4..7 from [r2 + 9]
+ pmovzxbw m2, m2 ; zero-extend the 8 bytes to words
+ paddw m2, m1 ; pixel + DC * 3 + 2
+ psraw m2, 2 ; >> 2
+ packuswb m2, m2 ; back to bytes with unsigned saturation
+ movd [r0], m2 ; overwrite top-left pixel, we will update it later
+
+ ; filter top-left
+ movzx r4d, byte [r2 + 8] ; byte at srcPix + 9
+ add r3d, r4d
+ movzx r4d, byte [r2] ; byte at srcPix + 1
+ add r3d, r4d ; r3d = DC * 2 + 2 + both neighbour bytes
+ shr r3d, 2
+ mov [r0], r3b
+
+ ; filter left
+ add r0, r1 ; r0 -> row 1
+ pextrb [r0], m2, 4 ; bytes 4..6 of m2 (from [r2 + 9]) go to column 0 of rows 1..3
+ pextrb [r0 + r1], m2, 5
+ pextrb [r0 + r1 * 2], m2, 6
+
+.end:
+ RET
+
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list