[x264-devel] commit: Faster x86 predict_8x8c_dc, MMX/SSE2 high bit depth versions ( Jason Garrett-Glaser )

git at videolan.org git at videolan.org
Mon Jan 10 22:00:59 CET 2011


x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sat Dec 18 12:40:13 2010 -0800| [8cf764ecef221d085bb6669dbb4301e1904a0bc3] | committer: Jason Garrett-Glaser 

Faster x86 predict_8x8c_dc, MMX/SSE2 high bit depth versions

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8cf764ecef221d085bb6669dbb4301e1904a0bc3
---

 common/x86/predict-a.asm |  117 ++++++++++++++++++++++++++++++++++------------
 common/x86/predict-c.c   |   31 ++++--------
 common/x86/x86util.asm   |    2 +
 3 files changed, 100 insertions(+), 50 deletions(-)

diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index e3faba5..6d05b10 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -1261,41 +1261,98 @@ PRED_8x8C_H mmxext
 PRED_8x8C_H ssse3
 
 ;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
+; void predict_8x8c_dc( pixel *src )
 ;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext, 1,1
-    movq        mm0, [r0 - FDEC_STRIDE]
-    pxor        mm1, mm1
-    pxor        mm2, mm2
-    punpckhbw   mm1, mm0
-    punpcklbw   mm0, mm2
-    psadbw      mm1, mm2        ; s1
-    psadbw      mm0, mm2        ; s0
 
-%ifdef ARCH_X86_64
-    movd        mm4, r1d
-    movd        mm5, r2d
-    paddw       mm0, mm4
-    pshufw      mm2, mm5, 0
+%macro PREDICT_8x8C_DC 1
+cglobal predict_8x8c_dc_%1, 1,3
+    pxor      m7, m7
+%ifdef HIGH_BIT_DEPTH
+    movq      m0, [r0-FDEC_STRIDEB+0]
+    movq      m1, [r0-FDEC_STRIDEB+8]
+    HADDW     m0, m2
+    HADDW     m1, m2
 %else
-    paddw       mm0, r1m
-    pshufw      mm2, r2m, 0
+    movd      m0, [r0-FDEC_STRIDEB+0]
+    movd      m1, [r0-FDEC_STRIDEB+4]
+    psadbw    m0, m7            ; s0
+    psadbw    m1, m7            ; s1
+%endif
+    add       r0, FDEC_STRIDEB*4
+
+    movzx    r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
+    movzx    r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
+    add      r1d, r2d
+    movzx    r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
+    add      r1d, r2d
+    movzx    r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
+    add      r1d, r2d
+    movd      m2, r1d            ; s2
+
+    movzx    r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
+    movzx    r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
+    add      r1d, r2d
+    movzx    r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
+    add      r1d, r2d
+    movzx    r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
+    add      r1d, r2d
+    movd      m3, r1d            ; s3
+
+    punpcklwd m0, m1
+    punpcklwd m2, m3
+    punpckldq m0, m2            ; s0, s1, s2, s3
+    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
+    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
+    paddw     m0, m3
+    psrlw     m0, 2
+    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
+%ifdef HIGH_BIT_DEPTH
+%ifidn %1, sse2
+    movq2dq   xmm0, m0
+    punpcklwd xmm0, xmm0
+    pshufd    xmm1, xmm0, 11111010b
+    punpckldq xmm0, xmm0
+%assign n 0
+%rep 8
+%assign i (0 + (n/4))
+    movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i
+%assign n n+1
+%endrep
+%else
+    pshufw    m1, m0, 0x00
+    pshufw    m2, m0, 0x55
+    pshufw    m3, m0, 0xaa
+    pshufw    m4, m0, 0xff
+%assign n 0
+%rep 8
+%assign i (1 + (n/4)*2)
+%assign j (2 + (n/4)*2)
+    movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i
+    movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j
+%assign n n+1
+%endrep
+%endif
+%else
+    packuswb  m0, m0
+    punpcklbw m0, m0
+    movq      m1, m0
+    punpcklbw m0, m0
+    punpckhbw m1, m1
+%assign n 0
+%rep 8
+%assign i (0 + (n/4))
+    movq [r0+FDEC_STRIDEB*(n-4)], m %+ i
+%assign n n+1
+%endrep
 %endif
-    psrlw       mm0, 3
-    paddw       mm1, [pw_2]
-    movq        mm3, mm2
-    pshufw      mm1, mm1, 0
-    pshufw      mm0, mm0, 0     ; dc0 (w)
-    paddw       mm3, mm1
-    psrlw       mm3, 3          ; dc3 (w)
-    psrlw       mm2, 2          ; dc2 (w)
-    psrlw       mm1, 2          ; dc1 (w)
-
-    packuswb    mm0, mm1        ; dc0,dc1 (b)
-    packuswb    mm2, mm3        ; dc2,dc3 (b)
-
-    STORE8x8    mm0, mm2
     RET
+%endmacro
+
+INIT_MMX
+PREDICT_8x8C_DC mmxext
+%ifdef HIGH_BIT_DEPTH
+PREDICT_8x8C_DC sse2
+%endif
 
 cglobal predict_8x8c_dc_top_mmxext, 1,1
     movq        mm0, [r0 - FDEC_STRIDE]
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 829a191..299e476 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -45,7 +45,8 @@
  void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
  void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
  void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
- void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+ void x264_predict_8x8c_dc_mmxext( pixel *src );
+ void x264_predict_8x8c_dc_sse2( uint16_t *src );
  void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
  void x264_predict_8x8c_v_mmx( uint8_t *src );
  void x264_predict_8x8c_h_mmxext( uint8_t *src );
@@ -245,23 +246,6 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src )
 }
 #endif
 
-static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
-{
-    int s2 = 4
-       + src[-1 + 0*FDEC_STRIDE]
-       + src[-1 + 1*FDEC_STRIDE]
-       + src[-1 + 2*FDEC_STRIDE]
-       + src[-1 + 3*FDEC_STRIDE];
-
-    int s3 = 2
-       + src[-1 + 4*FDEC_STRIDE]
-       + src[-1 + 5*FDEC_STRIDE]
-       + src[-1 + 6*FDEC_STRIDE]
-       + src[-1 + 7*FDEC_STRIDE];
-
-    x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
-}
-
 #if ARCH_X86_64
 static void x264_predict_8x8c_dc_left( uint8_t *src )
 {
@@ -428,7 +412,14 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
-#if !HIGH_BIT_DEPTH
+#if HIGH_BIT_DEPTH
+    if( !(cpu&X264_CPU_MMXEXT) )
+        return;
+    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmxext;
+    if( !(cpu&X264_CPU_SSE2) )
+        return;
+    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_sse2;
+#else
 #if ARCH_X86_64
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
 #endif
@@ -450,7 +441,7 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
 #ifdef __GNUC__
     pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_ssse3;
 #endif
-#endif // !HIGH_BIT_DEPTH
+#endif // HIGH_BIT_DEPTH
 }
 
 void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index bd8cfe5..7901fa7 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -29,9 +29,11 @@
 
 %assign SIZEOF_PIXEL 1
 %assign SIZEOF_DCTCOEF 2
+%define pixel byte
 %ifdef HIGH_BIT_DEPTH
     %assign SIZEOF_PIXEL 2
     %assign SIZEOF_DCTCOEF 4
+    %define pixel word
 %endif
 
 %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE



More information about the x264-devel mailing list