[x264-devel] commit: SSE2 high bit depth zigzag_interleave_cavlc (Daniel Kang)

git at videolan.org git at videolan.org
Mon Jan 10 22:01:01 CET 2011


x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Thu Dec 23 12:15:03 2010 -0500| [658b6ae1b5f43b488efd0ab4f6c60b93a1366333] | committer: Jason Garrett-Glaser 

SSE2 high bit depth zigzag_interleave_cavlc

Patch from Google Code-In.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=658b6ae1b5f43b488efd0ab4f6c60b93a1366333
---

 common/dct.c         |    7 +++-
 common/x86/dct-a.asm |   80 ++++++++++++++++++++++++++++----------------------
 common/x86/dct.h     |    2 +-
 3 files changed, 51 insertions(+), 38 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index e7926dd..dc5e59f 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -802,12 +802,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
     }
 
     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
-#if !HIGH_BIT_DEPTH
 #if HAVE_MMX
+#if HIGH_BIT_DEPTH
+    if( cpu&X264_CPU_SSE2 )
+        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+#else
     if( cpu&X264_CPU_MMX )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+#endif // HIGH_BIT_DEPTH
 #endif
-#endif // !HIGH_BIT_DEPTH
 }
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 8275627..e31b60c 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -1267,47 +1267,57 @@ ZIGZAG_SUB_4x4 ac, field
 ;-----------------------------------------------------------------------------
 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 ;-----------------------------------------------------------------------------
-
-%macro INTERLEAVE 1
-    movq   m0, [r1+%1*4+ 0]
-    movq   m1, [r1+%1*4+ 8]
-    movq   m2, [r1+%1*4+16]
-    movq   m3, [r1+%1*4+24]
-    TRANSPOSE4x4W 0,1,2,3,4
-    movq   [r0+%1+ 0], m0
-    movq   [r0+%1+32], m1
-    movq   [r0+%1+64], m2
-    movq   [r0+%1+96], m3
-%if %1
+%macro INTERLEAVE 2
+    mova     m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
+    mova     m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
+    mova     m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
+    mova     m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
+    TRANSPOSE4x4%2 0,1,2,3,4
+    mova     [r0+(%1+ 0)*SIZEOF_PIXEL], m0
+    mova     [r0+(%1+32)*SIZEOF_PIXEL], m1
+    mova     [r0+(%1+64)*SIZEOF_PIXEL], m2
+    mova     [r0+(%1+96)*SIZEOF_PIXEL], m3
     packsswb m0, m1
-    por    m6, m2
-    por    m7, m3
-    por    m5, m0
+%if %1
+    por      m6, m2
+    por      m7, m3
+    por      m5, m0
 %else
-    packsswb m0, m1
-    SWAP   m5, m0
-    SWAP   m6, m2
-    SWAP   m7, m3
+    SWAP     m5, m0
+    SWAP     m6, m2
+    SWAP     m7, m3
 %endif
 %endmacro
 
-INIT_MMX
-cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
-    INTERLEAVE  0
-    INTERLEAVE  8
-    INTERLEAVE 16
-    INTERLEAVE 24
-    packsswb m6, m7
-    packsswb m5, m6
-    packsswb m5, m5
-    pxor     m0, m0
-    pcmpeqb  m5, m0
-    paddb    m5, [pb_1]
-    movd    r0d, m5
-    mov  [r2+0], r0w
-    shr     r0d, 16
-    mov  [r2+8], r0w
+%macro ZIGZAG_8x8_CAVLC 2
+cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8*(mmsize/16)
+    INTERLEAVE  0, %2
+    INTERLEAVE  8, %2
+    INTERLEAVE 16, %2
+    INTERLEAVE 24, %2
+    packsswb   m6, m7
+    packsswb   m5, m6
+    packsswb   m5, m5
+    pxor       m0, m0
+%ifdef HIGH_BIT_DEPTH
+    packsswb   m5, m5
+%endif
+    pcmpeqb    m5, m0
+    paddb      m5, [pb_1]
+    movd      r0d, m5
+    mov    [r2+0], r0w
+    shr       r0d, 16
+    mov    [r2+8], r0w
     RET
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+ZIGZAG_8x8_CAVLC sse2, D
+%else
+INIT_MMX
+ZIGZAG_8x8_CAVLC mmx , W
+%endif
 
 %macro INTERLEAVE_XMM 1
     mova   m0, [r1+%1*4+ 0]
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 54a6e44..fb2a607 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -86,6 +86,6 @@ int  x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, u
 int  x264_zigzag_sub_4x4_field_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int  x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
 void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 
 #endif



More information about the x264-devel mailing list