[x264-devel] x86: SSE zigzag_scan_4x4_field
Henrik Gramner
git at videolan.org
Tue Sep 20 20:57:52 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Sep 11 15:32:54 2016 +0200| [75d0f9cc8770bc4f36785062116757d24eb44604] | committer: Anton Mitrofanov
x86: SSE zigzag_scan_4x4_field
Replaces the MMX2 version; the new SSE version is one cycle faster.
Also changes the checkasm test to use the correct alignment macro.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=75d0f9cc8770bc4f36785062116757d24eb44604
---
common/dct.c | 3 ++-
common/x86/dct-a.asm | 27 +++++++++++----------------
common/x86/dct.h | 2 +-
tools/checkasm.c | 4 ++--
4 files changed, 16 insertions(+), 20 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 9e2e955..7dfeea2 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -990,10 +990,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
if( cpu&X264_CPU_MMX2 )
{
- pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
}
+ if( cpu&X264_CPU_SSE )
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse;
if( cpu&X264_CPU_SSE2_IS_FAST )
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 454f53f..150a6ed 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -1463,9 +1463,9 @@ cglobal zigzag_scan_4x4_frame, 2,2
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal zigzag_scan_4x4_field, 2,3
- movu m4, [r1+ 8]
- pshufd m0, m4, q3102
+cglobal zigzag_scan_4x4_field, 2,2
+ movu m0, [r1+ 8]
+ pshufd m0, m0, q3102
mova m1, [r1+32]
mova m2, [r1+48]
movu [r0+ 8], m0
@@ -1480,19 +1480,14 @@ cglobal zigzag_scan_4x4_field, 2,3
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-INIT_MMX mmx2
-cglobal zigzag_scan_4x4_field, 2,3
- pshufw m0, [r1+4], q3102
- mova m1, [r1+16]
- mova m2, [r1+24]
- movu [r0+4], m0
- mova [r0+16], m1
- mova [r0+24], m2
- mov r2d, [r1]
- mov [r0], r2d
- mov r2d, [r1+12]
- mov [r0+12], r2d
+INIT_XMM sse
+cglobal zigzag_scan_4x4_field, 2,2
+ mova m0, [r1]
+ mova m1, [r1+16]
+ pshufw mm0, [r1+4], q3102
+ mova [r0], m0
+ mova [r0+16], m1
+ movq [r0+4], mm0
RET
%endif ; HIGH_BIT_DEPTH
diff --git a/common/x86/dct.h b/common/x86/dct.h
index ded790f..a851ce9 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -112,7 +112,7 @@ void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ce7518e..8785cc8 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1025,8 +1025,8 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_function_t zigzag_ref[2];
x264_zigzag_function_t zigzag_asm[2];
- ALIGNED_16( dctcoef level1[64] );
- ALIGNED_16( dctcoef level2[64] );
+ ALIGNED_ARRAY_16( dctcoef, level1,[64] );
+ ALIGNED_ARRAY_16( dctcoef, level2,[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
More information about the x264-devel mailing list