[x264-devel] commit: Faster x86 predict_8x8c_dc, MMX/SSE2 high bit depth versions (Jason Garrett-Glaser)
git at videolan.org
git at videolan.org
Mon Jan 10 22:00:59 CET 2011
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sat Dec 18 12:40:13 2010 -0800| [8cf764ecef221d085bb6669dbb4301e1904a0bc3] | committer: Jason Garrett-Glaser
Faster x86 predict_8x8c_dc, MMX/SSE2 high bit depth versions
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8cf764ecef221d085bb6669dbb4301e1904a0bc3
---
common/x86/predict-a.asm | 117 ++++++++++++++++++++++++++++++++++------------
common/x86/predict-c.c | 31 ++++--------
common/x86/x86util.asm | 2 +
3 files changed, 100 insertions(+), 50 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index e3faba5..6d05b10 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -1261,41 +1261,98 @@ PRED_8x8C_H mmxext
PRED_8x8C_H ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
+; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext, 1,1
- movq mm0, [r0 - FDEC_STRIDE]
- pxor mm1, mm1
- pxor mm2, mm2
- punpckhbw mm1, mm0
- punpcklbw mm0, mm2
- psadbw mm1, mm2 ; s1
- psadbw mm0, mm2 ; s0
-%ifdef ARCH_X86_64
- movd mm4, r1d
- movd mm5, r2d
- paddw mm0, mm4
- pshufw mm2, mm5, 0
+%macro PREDICT_8x8C_DC 1
+cglobal predict_8x8c_dc_%1, 1,3
+ pxor m7, m7
+%ifdef HIGH_BIT_DEPTH
+ movq m0, [r0-FDEC_STRIDEB+0]
+ movq m1, [r0-FDEC_STRIDEB+8]
+ HADDW m0, m2
+ HADDW m1, m2
%else
- paddw mm0, r1m
- pshufw mm2, r2m, 0
+ movd m0, [r0-FDEC_STRIDEB+0]
+ movd m1, [r0-FDEC_STRIDEB+4]
+ psadbw m0, m7 ; s0
+ psadbw m1, m7 ; s1
+%endif
+ add r0, FDEC_STRIDEB*4
+
+ movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
+ movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
+ add r1d, r2d
+ movd m2, r1d ; s2
+
+ movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
+ movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
+ add r1d, r2d
+ movd m3, r1d ; s3
+
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; s0, s1, s2, s3
+ pshufw m3, m0, 11110110b ; s2, s1, s3, s3
+ pshufw m0, m0, 01110100b ; s0, s1, s3, s1
+ paddw m0, m3
+ psrlw m0, 2
+ pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
+%ifdef HIGH_BIT_DEPTH
+%ifidn %1, sse2
+ movq2dq xmm0, m0
+ punpcklwd xmm0, xmm0
+ pshufd xmm1, xmm0, 11111010b
+ punpckldq xmm0, xmm0
+%assign n 0
+%rep 8
+%assign i (0 + (n/4))
+ movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i
+%assign n n+1
+%endrep
+%else
+ pshufw m1, m0, 0x00
+ pshufw m2, m0, 0x55
+ pshufw m3, m0, 0xaa
+ pshufw m4, m0, 0xff
+%assign n 0
+%rep 8
+%assign i (1 + (n/4)*2)
+%assign j (2 + (n/4)*2)
+ movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i
+ movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j
+%assign n n+1
+%endrep
+%endif
+%else
+ packuswb m0, m0
+ punpcklbw m0, m0
+ movq m1, m0
+ punpcklbw m0, m0
+ punpckhbw m1, m1
+%assign n 0
+%rep 8
+%assign i (0 + (n/4))
+ movq [r0+FDEC_STRIDEB*(n-4)], m %+ i
+%assign n n+1
+%endrep
%endif
- psrlw mm0, 3
- paddw mm1, [pw_2]
- movq mm3, mm2
- pshufw mm1, mm1, 0
- pshufw mm0, mm0, 0 ; dc0 (w)
- paddw mm3, mm1
- psrlw mm3, 3 ; dc3 (w)
- psrlw mm2, 2 ; dc2 (w)
- psrlw mm1, 2 ; dc1 (w)
-
- packuswb mm0, mm1 ; dc0,dc1 (b)
- packuswb mm2, mm3 ; dc2,dc3 (b)
-
- STORE8x8 mm0, mm2
RET
+%endmacro
+
+INIT_MMX
+PREDICT_8x8C_DC mmxext
+%ifdef HIGH_BIT_DEPTH
+PREDICT_8x8C_DC sse2
+%endif
cglobal predict_8x8c_dc_top_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 829a191..299e476 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -45,7 +45,8 @@
void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
- void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+ void x264_predict_8x8c_dc_mmxext( pixel *src );
+ void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
void x264_predict_8x8c_v_mmx( uint8_t *src );
void x264_predict_8x8c_h_mmxext( uint8_t *src );
@@ -245,23 +246,6 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src )
}
#endif
-static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
-{
- int s2 = 4
- + src[-1 + 0*FDEC_STRIDE]
- + src[-1 + 1*FDEC_STRIDE]
- + src[-1 + 2*FDEC_STRIDE]
- + src[-1 + 3*FDEC_STRIDE];
-
- int s3 = 2
- + src[-1 + 4*FDEC_STRIDE]
- + src[-1 + 5*FDEC_STRIDE]
- + src[-1 + 6*FDEC_STRIDE]
- + src[-1 + 7*FDEC_STRIDE];
-
- x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
-}
-
#if ARCH_X86_64
static void x264_predict_8x8c_dc_left( uint8_t *src )
{
@@ -428,7 +412,14 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
{
if( !(cpu&X264_CPU_MMX) )
return;
-#if !HIGH_BIT_DEPTH
+#if HIGH_BIT_DEPTH
+ if( !(cpu&X264_CPU_MMXEXT) )
+ return;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2;
+#else
#if ARCH_X86_64
pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
#endif
@@ -450,7 +441,7 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
#ifdef __GNUC__
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
#endif
-#endif // !HIGH_BIT_DEPTH
+#endif // HIGH_BIT_DEPTH
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index bd8cfe5..7901fa7 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -29,9 +29,11 @@
%assign SIZEOF_PIXEL 1
%assign SIZEOF_DCTCOEF 2
+%define pixel byte
%ifdef HIGH_BIT_DEPTH
%assign SIZEOF_PIXEL 2
%assign SIZEOF_DCTCOEF 4
+ %define pixel word
%endif
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
More information about the x264-devel mailing list