[x264-devel] Better x86 high bit depth predict_8x8c_p
Daniel Kang
git at videolan.org
Sun Jul 10 06:17:54 CEST 2011
x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Sun Jul 3 17:32:00 2011 -0400| [e976f640d5d0ac4775a0cb706888e79649c54a6c] | committer: Jason Garrett-Glaser
Better x86 high bit depth predict_8x8c_p
Avoid the need to check for corner cases by reordering arithmetic.
Also make a minor optimization to high bit depth predict_16x16_p.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=e976f640d5d0ac4775a0cb706888e79649c54a6c
---
common/x86/predict-a.asm | 58 +++++++++++++++++++++++++--------------------
common/x86/predict-c.c | 11 ++++----
2 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 39f2f3b..d2477d1 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -31,10 +31,12 @@
SECTION_RODATA
pw_76543210:
-pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
-pb_00s_ff: times 8 db 0
-pb_0s_ff: times 7 db 0
- db 0xff
+pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
+pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
+pw_m3: times 8 dw -3
+pb_00s_ff: times 8 db 0
+pb_0s_ff: times 7 db 0
+ db 0xff
SECTION .text
@@ -45,6 +47,7 @@ cextern pw_1
cextern pw_2
cextern pw_4
cextern pw_8
+cextern pw_16
cextern pw_ff00
cextern pb_reverse
cextern pw_pixel_max
@@ -1045,32 +1048,42 @@ ALIGN 4
REP_RET
%endif ; !ARCH_X86_64
-%macro PREDICT_8x8C_P 1
-cglobal predict_8x8c_p_core_%1, 1,1
+INIT_XMM
+%ifdef HIGH_BIT_DEPTH
+cglobal predict_8x8c_p_core_sse2, 1,1,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
-%ifdef HIGH_BIT_DEPTH
mova m3, [pw_pixel_max]
pxor m1, m1
-%endif
SPLATW m0, m0, 0
SPLATW m2, m2, 0
SPLATW m4, m4, 0
- pmullw m2, [pw_76543210]
-%ifdef HIGH_BIT_DEPTH
+ pmullw m2, [pw_43210123] ; b
+ pmullw m5, m4, [pw_m3] ; c
+ paddw m5, [pw_16]
mov r1d, 8
.loop:
- paddsw m5, m0, m2
- psraw m5, 5
- CLIPW m5, m1, m3
- mova [r0], m5
- paddw m2, m4
+ paddsw m6, m2, m5
+ paddsw m6, m0
+ psraw m6, 5
+ CLIPW m6, m1, m3
+ mova [r0], m6
+ paddw m5, m4
add r0, FDEC_STRIDEB
dec r1d
jg .loop
-%else ;!HIGH_BIT_DEPTH
- paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
+ REP_RET
+%else ; !HIGH_BIT_DEPTH
+cglobal predict_8x8c_p_core_sse2, 1,1
+ movd m0, r1m
+ movd m2, r2m
+ movd m4, r3m
+ SPLATW m0, m0, 0
+ SPLATW m2, m2, 0
+ SPLATW m4, m4, 0
+ pmullw m2, [pw_76543210]
+ paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
paddsw m3, m0, m4
paddsw m4, m4
call .loop
@@ -1090,12 +1103,8 @@ call .loop
packuswb m5, m1
movq [r0+FDEC_STRIDE*2], m5
movhps [r0+FDEC_STRIDE*3], m5
-%endif ;!HIGH_BIT_DEPTH
RET
-%endmacro ; PREDICT_8x8C_P
-
-INIT_XMM
-PREDICT_8x8C_P sse2
+%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
@@ -1146,16 +1155,13 @@ cglobal predict_16x16_p_core_%1, 1,2,8
movd m0, r1m
movd m1, r2m
movd m2, r3m
-%ifdef HIGH_BIT_DEPTH
- pxor m6, m6
- pxor m7, m7
-%endif
SPLATW m0, m0, 0
SPLATW m1, m1, 0
SPLATW m2, m2, 0
pmullw m3, m1, [pw_76543210]
psllw m1, 3
%ifdef HIGH_BIT_DEPTH
+ pxor m6, m6
mov r1d, 16
.loop:
mova m4, m0
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 725df94..319734e 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -315,12 +315,11 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src )
b = ( 17 * H + 16 ) >> 5;
c = ( 17 * V + 16 ) >> 5;
i00 = a -3*b -3*c + 16;
- /* b*7 + c*7 can overflow: it's easier to just branch away in this rare case
- * than to try to consider it in the asm. */
- if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 2340 || abs(c) > 2340) )
- x264_predict_8x8c_p_c( src );
- else
- x264_predict_8x8c_p_core_sse2( src, i00, b, c );
+#if HIGH_BIT_DEPTH
+ x264_predict_8x8c_p_core_sse2( src, a, b, c );
+#else
+ x264_predict_8x8c_p_core_sse2( src, i00, b, c );
+#endif
}
#endif
#if !HIGH_BIT_DEPTH
More information about the x264-devel mailing list