[x264-devel] commit: Initial Nehalem CPU optimizations (Jason Garrett-Glaser )

Wed Nov 5 12:40:42 CET 2008

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Nov  5 03:11:45 2008 -0800| [a5ac6a5b8688915553fe6fccee09f1272f3788ac] | committer: Jason Garrett-Glaser 

Initial Nehalem CPU optimizations
movaps/movups are no longer interchangeable with their integer counterparts (movdqa/movdqu) on Nehalem, so the substitution of one for the other is removed.
Nehalem has a much lower cacheline split penalty than previous Intel CPUs, so cacheline workarounds are no longer necessary.
Thanks to Intel for providing Avail Media with the pre-release Nehalem CPU needed to prepare these (and other not-yet-committed) optimizations.
Overall speed improvement with Nehalem vs Penryn at the same clock speed is around 40%.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a5ac6a5b8688915553fe6fccee09f1272f3788ac
---

 common/cpu.c          |    7 +++++--
 common/x86/x86inc.asm |    4 ----
 encoder/encoder.c     |    3 +++
 x264.h                |    5 +++--
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index 307a0ee..2d722c6 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -48,7 +48,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
     {"PHADD",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
-    {"SSE4",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.1",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.2",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
     {"Cache32", X264_CPU_CACHELINE_32},
     {"Cache64", X264_CPU_CACHELINE_64},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
@@ -91,6 +92,8 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_SSSE3;
     if( ecx&0x00080000 )
         cpu |= X264_CPU_SSE4;
+    if( ecx&0x00100000 )
+        cpu |= X264_CPU_SSE42;
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
@@ -131,7 +134,7 @@ uint32_t x264_cpu_detect( void )
         }
     }
 
-    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
+    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
     {
         /* cacheline size is specified in 3 places, any of which may be missing */
         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index dc06e7c..9a4a92b 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -474,7 +474,3 @@ INIT_MMX
     %endif
 %endmacro
 
-; substitutions which are functionally identical but reduce code size
-%define movdqa movaps
-%define movdqu movups
-
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2e7ea80..4a9860f 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -744,6 +744,9 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         if( !strcmp(x264_cpu_names[i].name, "SSE3")
             && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
             continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
+            && (param->cpu & X264_CPU_SSE42) )
+            continue;
         if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
diff --git a/x264.h b/x264.h
index afb8a41..323f9bb 100644
--- a/x264.h
+++ b/x264.h
@@ -58,8 +58,9 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE3           0x000200
 #define X264_CPU_SSSE3          0x000400
 #define X264_CPU_PHADD_IS_FAST  0x000800  /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
-#define X264_CPU_SSE4           0x001000  /* SSE4.1 */
-#define X264_CPU_STACK_MOD4     0x002000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_STACK_MOD4     0x001000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4           0x002000  /* SSE4.1 */
+#define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 
 /* Analyse flags
  */