[x264-devel] commit: enable ssse3 phadd satd on Penryn. (Loren Merritt)
git version control
git at videolan.org
Sun Jun 8 07:01:38 CEST 2008
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Fri Jun 6 23:30:37 2008 -0600| [2998b17a64cb41ecf3389cc9f5a44c6f07620487]
enable ssse3 phadd satd on Penryn.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2998b17a64cb41ecf3389cc9f5a44c6f07620487
---
common/cpu.c | 3 +++
common/pixel.c | 9 +++++++++
common/x86/pixel-a.asm | 47 ++++++++++++++++++++++++++++++++++-------------
common/x86/pixel.h | 1 +
tools/checkasm.c | 6 ++++++
x264.h | 1 +
6 files changed, 54 insertions(+), 13 deletions(-)
diff --git a/common/cpu.c b/common/cpu.c
index 47a72f7..3ebe970 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -47,6 +47,7 @@ const struct {
{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+ {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"3DNow", X264_CPU_3DNOW},
{"Altivec", X264_CPU_ALTIVEC},
{"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
@@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
+ if( ecx&0x00080000 )
+ cpu |= X264_CPU_SSE4;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
diff --git a/common/pixel.c b/common/pixel.c
index 133968c..0d00b6e 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -360,6 +360,7 @@ SATD_X_DECL7()
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
SATD_X_DECL7( _ssse3 )
+SATD_X_DECL5( _ssse3_phadd )
#endif
/****************************************************************************
@@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _cache64_ssse3 );
}
}
+
+ if( cpu&X264_CPU_SSE4 )
+ {
+ // enabled on Penryn, but slower on Conroe
+ INIT5( satd, _ssse3_phadd );
+ INIT5( satd_x3, _ssse3_phadd );
+ INIT5( satd_x4, _ssse3_phadd );
+ }
#endif //HAVE_MMX
#ifdef ARCH_PPC
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 9eed1db..361e2a6 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -274,19 +274,23 @@ SSD_SSE2 8, 4
LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
%endmacro
-;;; row transform not used, because phaddw is much slower than paddw on a Conroe
-;%macro PHSUMSUB 3
-; movdqa %3, %1
-; phaddw %1, %2
-; phsubw %3, %2
-;%endmacro
-
-;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
-; PHSUMSUB %1, %2, %5
-; PHSUMSUB %3, %4, %2
-; PHSUMSUB %1, %3, %4
-; PHSUMSUB %5, %2, %3
-;%endmacro
+; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
+; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
+; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
+; whereas phaddw-based transform doesn't care what order the coefs end up in.
+
+%macro PHSUMSUB 3
+ movdqa %3, %1
+ phaddw %1, %2
+ phsubw %3, %2
+%endmacro
+
+%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
+ PHSUMSUB %1, %2, %5
+ PHSUMSUB %3, %4, %2
+ PHSUMSUB %1, %3, %4
+ PHSUMSUB %5, %2, %3
+%endmacro
%macro SUMSUB_BADC 4
paddw %1, %2
@@ -494,6 +498,21 @@ SSD_SSE2 8, 4
paddusw xmm6, xmm2
%endmacro
+%macro SATD_8x4_PHADD 1
+ LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+%if %1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endif
+ HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
+ HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
+ ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
+ paddusw xmm0, xmm3
+ paddusw xmm2, xmm4
+ paddusw xmm6, xmm0
+ paddusw xmm6, xmm2
+%endmacro
+
%macro SATD_START_MMX 0
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
@@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
+%define SATD_8x4_SSE2 SATD_8x4_PHADD
+SATDS_SSE2 ssse3_phadd
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 92adfbf..6aa556a 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 )
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
+DECL_X1( satd, ssse3_phadd )
DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ab3c4f9..115e221 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -120,6 +120,7 @@ static void print_bench(void)
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
@@ -1142,6 +1143,11 @@ int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
}
+ if( x264_cpu_detect() & X264_CPU_SSSE3 )
+ {
+ cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+ }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
diff --git a/x264.h b/x264.h
index ff4cc24..7b39049 100644
--- a/x264.h
+++ b/x264.h
@@ -58,6 +58,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800
+#define X264_CPU_SSE4 0x001000 /* sse 4.1 */
/* Analyse flags
*/
More information about the x264-devel mailing list