[x264-devel] x86: SSE zigzag_scan_4x4_field

Tue Sep 20 20:57:52 CEST 2016

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Sep 11 15:32:54 2016 +0200| [75d0f9cc8770bc4f36785062116757d24eb44604] | committer: Anton Mitrofanov

x86: SSE zigzag_scan_4x4_field

Replaces the MMX2 version; the new SSE version is one cycle faster.

Also change the checkasm test to use the correct alignment macro
(ALIGNED_ARRAY_16 instead of ALIGNED_16) for its level1/level2 buffers.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=75d0f9cc8770bc4f36785062116757d24eb44604
---

 common/dct.c         |  3 ++-
 common/x86/dct-a.asm | 27 +++++++++++----------------
 common/x86/dct.h     |  2 +-
 tools/checkasm.c     |  4 ++--
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 9e2e955..7dfeea2 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -990,10 +990,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
     if( cpu&X264_CPU_MMX2 )
     {
-        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_mmx2;
         pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
     }
+    if( cpu&X264_CPU_SSE )
+        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse;
     if( cpu&X264_CPU_SSE2_IS_FAST )
         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
     if( cpu&X264_CPU_SSSE3 )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 454f53f..150a6ed 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -1463,9 +1463,9 @@ cglobal zigzag_scan_4x4_frame, 2,2
 ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal zigzag_scan_4x4_field, 2,3
-    movu       m4, [r1+ 8]
-    pshufd     m0, m4, q3102
+cglobal zigzag_scan_4x4_field, 2,2
+    movu       m0, [r1+ 8]
+    pshufd     m0, m0, q3102
     mova       m1, [r1+32]
     mova       m2, [r1+48]
     movu  [r0+ 8], m0
@@ -1480,19 +1480,14 @@ cglobal zigzag_scan_4x4_field, 2,3
 ;-----------------------------------------------------------------------------
 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
-; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-INIT_MMX mmx2
-cglobal zigzag_scan_4x4_field, 2,3
-    pshufw      m0, [r1+4], q3102
-    mova        m1, [r1+16]
-    mova        m2, [r1+24]
-    movu    [r0+4], m0
-    mova   [r0+16], m1
-    mova   [r0+24], m2
-    mov        r2d, [r1]
-    mov       [r0], r2d
-    mov        r2d, [r1+12]
-    mov    [r0+12], r2d
+INIT_XMM sse
+cglobal zigzag_scan_4x4_field, 2,2
+    mova       m0, [r1]
+    mova       m1, [r1+16]
+    pshufw    mm0, [r1+4], q3102
+    mova     [r0], m0
+    mova  [r0+16], m1
+    movq   [r0+4], mm0
     RET
 %endif ; HIGH_BIT_DEPTH
 
diff --git a/common/x86/dct.h b/common/x86/dct.h
index ded790f..a851ce9 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -112,7 +112,7 @@ void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
 void x264_zigzag_scan_4x4_frame_mmx  ( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_sse  ( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_8x8_field_xop  ( int16_t level[64], int16_t dct[64] );
 void x264_zigzag_scan_8x8_field_avx  ( int32_t level[64], int32_t dct[64] );
 void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ce7518e..8785cc8 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1025,8 +1025,8 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_zigzag_function_t zigzag_ref[2];
     x264_zigzag_function_t zigzag_asm[2];
 
-    ALIGNED_16( dctcoef level1[64] );
-    ALIGNED_16( dctcoef level2[64] );
+    ALIGNED_ARRAY_16( dctcoef, level1,[64] );
+    ALIGNED_ARRAY_16( dctcoef, level2,[64] );
 
 #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
     if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \