[x264-devel] commit: Slightly faster 8x16 SAD on Penryn Core 2 (Jason Garrett-Glaser)

git version control git at videolan.org
Wed Mar 4 01:29:45 CET 2009


x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Mar  3 16:21:52 2009 -0800| [6f0b2a9b18f3af3fd7e495640756e1d5e43343e1] | committer: Jason Garrett-Glaser 

Slightly faster 8x16 SAD on Penryn Core 2
Same as the MMX 8x16 cacheline SAD, but calls the SSE2 8x16 SAD in the non-cacheline-split case.
Only Nehalem benefits from sizes smaller than 8x16, and Nehalem doesn't use cacheline functions, so no smaller versions are included.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6f0b2a9b18f3af3fd7e495640756e1d5e43343e1
---

 common/pixel.c       |   13 ++++++++++---
 common/x86/sad-a.asm |   40 +++++++++++++++++++++-------------------
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index 8f1b1f5..7fa9830 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -679,14 +679,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT_ADS( _sse2 );
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
-#ifdef ARCH_X86
-        if( cpu&X264_CPU_CACHELINE_64 )
+
+       if( cpu&X264_CPU_CACHELINE_64 )
         {
+#ifdef ARCH_X86
             INIT2( sad, _cache64_sse2 );
             INIT2( sad_x3, _cache64_sse2 );
             INIT2( sad_x4, _cache64_sse2 );
-        }
 #endif
+           if( cpu&X264_CPU_SSE2_IS_FAST )
+           {
+               pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
+               pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
+           }
+        }
+
         if( cpu&X264_CPU_SSE_MISALIGN )
         {
             INIT2( sad_x3, _sse2_misalign );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 73a78a4..b448aff 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1142,8 +1142,8 @@ cglobal x264_pixel_sad_8x%1_cache%2_mmxext
     jg .split
 %endmacro
 
-%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
+%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
+cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
     CHECK_SPLIT r1m, %1, %3
     CHECK_SPLIT r2m, %1, %3
     CHECK_SPLIT r3m, %1, %3
@@ -1207,8 +1207,8 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
 %endif
 %endmacro
 
-%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
+%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
+cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
     CHECK_SPLIT r1m, %1, %3
     CHECK_SPLIT r2m, %1, %3
     CHECK_SPLIT r3m, %1, %3
@@ -1285,9 +1285,9 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
 %endif
 %endmacro
 
-%macro SADX34_CACHELINE_FUNC 5
-    SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
-    SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
+%macro SADX34_CACHELINE_FUNC 1+
+    SADX3_CACHELINE_FUNC %1
+    SADX4_CACHELINE_FUNC %1
 %endmacro
 
 
@@ -1307,15 +1307,15 @@ SAD8_CACHELINE_FUNC_MMX2   8, 64
 SAD8_CACHELINE_FUNC_MMX2  16, 64
 
 %ifndef ARCH_X86_64
-SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16,  8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC  8, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC  8,  8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16,  8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16,  8, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC  8, 16, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC  8,  8, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16,  8, 64, mmxext, mmxext, mmxext
 %endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC  8, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC  8,  8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC  8, 16, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC  8,  8, 64, mmxext, mmxext, mmxext
 
 %ifndef ARCH_X86_64
 SAD16_CACHELINE_FUNC sse2, 8
@@ -1325,9 +1325,10 @@ SAD16_CACHELINE_FUNC sse2, 16
 SAD16_CACHELINE_LOOP_SSE2 i
 %assign i i+1
 %endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
-SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
+SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
 %endif ; !ARCH_X86_64
+SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmxext, sse2
 
 SAD16_CACHELINE_FUNC ssse3, 8
 SAD16_CACHELINE_FUNC ssse3, 16
@@ -1336,5 +1337,6 @@ SAD16_CACHELINE_FUNC ssse3, 16
 SAD16_CACHELINE_LOOP_SSSE3 i
 %assign i i+1
 %endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
-SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
+SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3
+



More information about the x264-devel mailing list