[x264-devel] commit: Initial Nehalem CPU optimizations (Jason Garrett-Glaser)
git version control
git at videolan.org
Wed Nov 5 12:40:42 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Nov 5 03:11:45 2008 -0800| [a5ac6a5b8688915553fe6fccee09f1272f3788ac] | committer: Jason Garrett-Glaser
Initial Nehalem CPU optimizations
On Nehalem, movaps/movups are no longer equivalent to their integer counterparts (movdqa/movdqu), so the substitution of one for the other is removed.
Nehalem has a much lower cacheline split penalty than previous Intel CPUs, so cacheline workarounds are no longer necessary.
Thanks to Intel for providing Avail Media with the pre-release Nehalem CPU needed to prepare these (and other not-yet-committed) optimizations.
Overall speed improvement with Nehalem vs Penryn at the same clock speed is around 40%.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a5ac6a5b8688915553fe6fccee09f1272f3788ac
---
common/cpu.c | 7 +++++--
common/x86/x86inc.asm | 4 ----
encoder/encoder.c | 3 +++
x264.h | 5 +++--
4 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/common/cpu.c b/common/cpu.c
index 307a0ee..2d722c6 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -48,7 +48,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
- {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+ {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+ {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
@@ -91,6 +92,8 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSSE3;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
+ if( ecx&0x00100000 )
+ cpu |= X264_CPU_SSE42;
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
@@ -131,7 +134,7 @@ uint32_t x264_cpu_detect( void )
}
}
- if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
+ if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index dc06e7c..9a4a92b 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -474,7 +474,3 @@ INIT_MMX
%endif
%endmacro
-; substitutions which are functionally identical but reduce code size
-%define movdqa movaps
-%define movdqu movups
-
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2e7ea80..4a9860f 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -744,6 +744,9 @@ x264_t *x264_encoder_open ( x264_param_t *param )
if( !strcmp(x264_cpu_names[i].name, "SSE3")
&& (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
continue;
+ if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
+ && (param->cpu & X264_CPU_SSE42) )
+ continue;
if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
diff --git a/x264.h b/x264.h
index afb8a41..323f9bb 100644
--- a/x264.h
+++ b/x264.h
@@ -58,8 +58,9 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE3 0x000200
#define X264_CPU_SSSE3 0x000400
#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
-#define X264_CPU_SSE4 0x001000 /* SSE4.1 */
-#define X264_CPU_STACK_MOD4 0x002000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
+#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
/* Analyse flags
*/
More information about the x264-devel mailing list