[x264-devel] commit: SSE2 zigzag_interleave (Jason Garrett-Glaser)
git version control
git at videolan.org
Wed Mar 18 09:48:12 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Mar 17 11:01:57 2009 -0700| [682b54d6175f98dfa14fec4d951f4b3b6e686b95] | committer: Jason Garrett-Glaser
SSE2 zigzag_interleave
Replace PHADD with FastShuffle (more accurate naming).
This flag represents asm functions that rely on fast SSE2 shuffle units, and thus are only faster on Phenom, Nehalem, and Penryn CPUs.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=682b54d6175f98dfa14fec4d951f4b3b6e686b95
---
common/cpu.c | 5 +++--
common/dct.c | 6 ++++--
common/pixel.c | 2 +-
common/x86/dct-a.asm | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
common/x86/dct.h | 1 +
tools/checkasm.c | 9 ++++++---
x264.h | 2 +-
7 files changed, 64 insertions(+), 10 deletions(-)
diff --git a/common/cpu.c b/common/cpu.c
index 01bb67b..1cb7080 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -53,7 +53,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
- {"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
+ {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
@@ -107,7 +107,7 @@ uint32_t x264_cpu_detect( void )
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
if( cpu & X264_CPU_SSE4 )
- cpu |= X264_CPU_PHADD_IS_FAST;
+ cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
@@ -124,6 +124,7 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
cpu |= X264_CPU_LZCNT;
+ cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_mask_misalign_sse();
}
else
diff --git a/common/dct.c b/common/dct.c
index 04301a9..1f8f4b3 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -663,9 +663,9 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
- if( cpu&X264_CPU_PHADD_IS_FAST )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif
#ifdef ARCH_PPC
@@ -678,5 +678,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
#endif
}
diff --git a/common/pixel.c b/common/pixel.c
index 38c3926..76d04e0 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -763,7 +763,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- if( !(cpu&X264_CPU_PHADD_IS_FAST) )
+ if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index df51926..6e92df6 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -35,7 +35,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 8 db 1
+pb_1: times 16 db 1
SECTION .text
@@ -785,3 +785,50 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
shr r0d, 16
mov [r2+8], r0w
RET
+
+%macro INTERLEAVE_XMM 1
+ mova m0, [r1+%1*4+ 0]
+ mova m1, [r1+%1*4+16]
+ mova m4, [r1+%1*4+32]
+ mova m5, [r1+%1*4+48]
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 4, 5, 7
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 4, 5, 7
+ movq [r0+%1+ 0], m0
+ movhps [r0+%1+ 32], m0
+ movq [r0+%1+ 64], m1
+ movhps [r0+%1+ 96], m1
+ movq [r0+%1+ 8], m4
+ movhps [r0+%1+ 40], m4
+ movq [r0+%1+ 72], m5
+ movhps [r0+%1+104], m5
+%if %1
+ por m2, m0
+ por m3, m1
+ por m2, m4
+ por m3, m5
+%else
+ SWAP 0,2
+ SWAP 3,1
+ por m2, m4
+ por m3, m5
+%endif
+%endmacro
+
+INIT_XMM
+cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+ INTERLEAVE_XMM 0
+ INTERLEAVE_XMM 16
+ packsswb m2, m3
+ pxor m5, m5
+ packsswb m2, m2
+ packsswb m2, m2
+ pcmpeqb m5, m2
+ paddb m5, [pb_1 GLOBAL]
+ movd r0d, m5
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
+ RET
+
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 5b83d34..4451821 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -69,5 +69,6 @@ void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 27aaaa9..e5f62e2 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -151,7 +151,7 @@ static void print_bench(void)
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_SSE4 ? "sse4" :
- b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
+ b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
@@ -1364,10 +1364,10 @@ static int check_intra( int cpu_ref, int cpu_new )
for( i = 0; i < 12; i++ )
INTRA_TEST( predict_8x8, i, 8, edge );
- used_asm = 1;
set_func_name("intra_predict_8x8_filter");
if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
{
+ used_asm = 1;
for( i = 0; i < 32; i++ )
{
memcpy( edge2, edge, 33 );
@@ -1463,6 +1463,8 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
+ cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
@@ -1483,7 +1485,8 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
+ cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
{
diff --git a/x264.h b/x264.h
index 3971992..26ac421 100644
--- a/x264.h
+++ b/x264.h
@@ -57,7 +57,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SSE3 0x000200
#define X264_CPU_SSSE3 0x000400
-#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
More information about the x264-devel mailing list