[x264-devel] commit: Slightly faster 8x16 SAD on Penryn Core 2 (Jason Garrett-Glaser)
git version control
git at videolan.org
Wed Mar 4 01:29:45 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Mar 3 16:21:52 2009 -0800| [6f0b2a9b18f3af3fd7e495640756e1d5e43343e1] | committer: Jason Garrett-Glaser
Slightly faster 8x16 SAD on Penryn Core 2
Same as the MMX 8x16 cacheline SAD, but calls the SSE2 8x16 SAD in the non-cacheline (no split) case.
Only Nehalem benefits from sizes smaller than 8x16, and Nehalem doesn't use cacheline functions, so no smaller versions are included.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6f0b2a9b18f3af3fd7e495640756e1d5e43343e1
---
common/pixel.c | 13 ++++++++++---
common/x86/sad-a.asm | 40 +++++++++++++++++++++-------------------
2 files changed, 31 insertions(+), 22 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 8f1b1f5..7fa9830 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -679,14 +679,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
-#ifdef ARCH_X86
- if( cpu&X264_CPU_CACHELINE_64 )
+
+ if( cpu&X264_CPU_CACHELINE_64 )
{
+#ifdef ARCH_X86
INIT2( sad, _cache64_sse2 );
INIT2( sad_x3, _cache64_sse2 );
INIT2( sad_x4, _cache64_sse2 );
- }
#endif
+ if( cpu&X264_CPU_SSE2_IS_FAST )
+ {
+ pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
+ pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
+ }
+ }
+
if( cpu&X264_CPU_SSE_MISALIGN )
{
INIT2( sad_x3, _sse2_misalign );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 73a78a4..b448aff 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1142,8 +1142,8 @@ cglobal x264_pixel_sad_8x%1_cache%2_mmxext
jg .split
%endmacro
-%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
+%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
+cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
@@ -1207,8 +1207,8 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
%endif
%endmacro
-%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
+%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
+cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
@@ -1285,9 +1285,9 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
%endif
%endmacro
-%macro SADX34_CACHELINE_FUNC 5
- SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
- SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
+%macro SADX34_CACHELINE_FUNC 1+
+ SADX3_CACHELINE_FUNC %1
+ SADX4_CACHELINE_FUNC %1
%endmacro
@@ -1307,15 +1307,15 @@ SAD8_CACHELINE_FUNC_MMX2 8, 64
SAD8_CACHELINE_FUNC_MMX2 16, 64
%ifndef ARCH_X86_64
-SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext, mmxext
%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext, mmxext
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC sse2, 8
@@ -1325,9 +1325,10 @@ SAD16_CACHELINE_FUNC sse2, 16
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
-SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
+SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmxext, sse2
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
@@ -1336,5 +1337,6 @@ SAD16_CACHELINE_FUNC ssse3, 16
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
-SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3
+
More information about the x264-devel mailing list