[x264-devel] TBM, AVX2, FMA3, BMI1, and BMI2 CPU detection support
Jason Garrett-Glaser
git at videolan.org
Sat Feb 4 21:10:53 CET 2012
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Jan 19 14:56:54 2012 -0800| [da19765d723b06a1fa189478e9da61a1c18490f8] | committer: Jason Garrett-Glaser
TBM, AVX2, FMA3, BMI1, and BMI2 CPU detection support
TBM and BMI1 are supported by AMD's Trinity/Piledriver.
The others (AVX2, FMA3, and BMI2), along with BMI1, will probably appear in Intel's upcoming Haswell.
Also update x86inc.asm for AVX2: add the new cpuflags and more AVX_INSTR wrappers.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=da19765d723b06a1fa189478e9da61a1c18490f8
---
common/cpu.c | 31 ++++++++++++++++++++++++++++---
common/x86/cpu-a.asm | 1 +
common/x86/x86inc.asm | 19 ++++++++++++++++---
tools/checkasm.c | 30 ++++++++++++++++++++++++++++++
x264.h | 5 +++++
5 files changed, 80 insertions(+), 6 deletions(-)
diff --git a/common/cpu.c b/common/cpu.c
index 78424c8..fa57407 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -62,14 +62,21 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
- {"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
- {"XOP", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
- {"FMA4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
+#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
+ {"AVX", AVX},
+ {"XOP", AVX|X264_CPU_XOP},
+ {"FMA4", AVX|X264_CPU_FMA4},
+ {"AVX2", AVX|X264_CPU_AVX2},
+ {"FMA3", AVX|X264_CPU_FMA3},
+#undef AVX
#undef SSE2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
+ {"BMI1", X264_CPU_BMI1},
+ {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
+ {"TBM", X264_CPU_TBM},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
@@ -143,7 +150,22 @@ uint32_t x264_cpu_detect( void )
/* Check for OS support */
x264_cpu_xgetbv( 0, &eax, &edx );
if( (eax&0x6) == 0x6 )
+ {
cpu |= X264_CPU_AVX;
+ if( ecx&0x00001000 )
+ cpu |= X264_CPU_FMA3;
+ }
+ }
+
+ x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
+ /* AVX2 requires OS support, but BMI1/2 don't. */
+ if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
+ cpu |= X264_CPU_AVX2;
+ if( ebx&0x00000008 )
+ {
+ cpu |= X264_CPU_BMI1;
+ if( ebx&0x00000100 )
+ cpu |= X264_CPU_BMI2;
}
if( cpu & X264_CPU_SSSE3 )
@@ -185,6 +207,9 @@ uint32_t x264_cpu_detect( void )
if( ecx&0x00010000 ) /* FMA4 */
cpu |= X264_CPU_FMA4;
}
+
+ if( ecx&0x00200000 )
+ cpu |= X264_CPU_TBM;
}
}
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm
index 05d7f64..85985b4 100644
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -39,6 +39,7 @@ cglobal cpu_cpuid, 5,7
push r2
push r1
mov eax, r0d
+ xor ecx, ecx
cpuid
pop rsi
mov [rsi], eax
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 57ebc85..29ac7fa 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -554,6 +554,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_avx (1<<9) | cpuflags_sse42
%assign cpuflags_xop (1<<10)| cpuflags_avx
%assign cpuflags_fma4 (1<<11)| cpuflags_avx
+%assign cpuflags_avx2 (1<<12)| cpuflags_avx
+%assign cpuflags_fma3 (1<<13)| cpuflags_avx
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
@@ -561,6 +563,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_lzcnt (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_bmi1 (1<<22)
+%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+%assign cpuflags_tbm (1<<24)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
@@ -822,10 +827,10 @@ INIT_XMM
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
- %ifid %5
- %define %%sizeofreg sizeof%5
- %elifid %6
+ %ifid %6
%define %%sizeofreg sizeof%6
+ %elifid %5
+ %define %%sizeofreg sizeof%5
%else
%define %%sizeofreg mmsize
%endif
@@ -948,6 +953,9 @@ AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
@@ -999,6 +1007,7 @@ AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
@@ -1009,6 +1018,9 @@ AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
@@ -1030,6 +1042,7 @@ AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 97fb331..13b6bfa 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -164,6 +164,8 @@ static void print_bench(void)
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_AVX2 ? "avx2" :
+ b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
@@ -182,6 +184,9 @@ static void print_bench(void)
b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+ b->cpu&X264_CPU_BMI2 ? "_bmi2" :
+ b->cpu&X264_CPU_TBM ? "_tbm" :
+ b->cpu&X264_CPU_BMI1 ? "_bmi1" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
@@ -2405,7 +2410,32 @@ static int check_all_flags( void )
if( x264_cpu_detect() & X264_CPU_XOP )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
if( x264_cpu_detect() & X264_CPU_FMA4 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
+ cpu1 &= ~X264_CPU_FMA4;
+ }
+ if( x264_cpu_detect() & X264_CPU_FMA3 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+ cpu1 &= ~X264_CPU_FMA3;
+ }
+ if( x264_cpu_detect() & X264_CPU_BMI1 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
+ if( x264_cpu_detect() & X264_CPU_TBM )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" );
+ cpu1 &= ~X264_CPU_TBM;
+ }
+ if( x264_cpu_detect() & X264_CPU_BMI2 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
+ cpu1 &= ~X264_CPU_BMI2;
+ }
+ cpu1 &= ~X264_CPU_BMI1;
+ }
+ if( x264_cpu_detect() & X264_CPU_AVX2 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
diff --git a/x264.h b/x264.h
index 965b96d..322efb4 100644
--- a/x264.h
+++ b/x264.h
@@ -127,6 +127,11 @@ typedef struct
* aren't used. */
#define X264_CPU_XOP 0x0800000 /* AMD XOP */
#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */
+#define X264_CPU_AVX2 0x2000000 /* AVX2 */
+#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */
+#define X264_CPU_BMI1 0x8000000 /* BMI1 */
+#define X264_CPU_BMI2 0x10000000 /* BMI2 */
+#define X264_CPU_TBM 0x20000000 /* AMD TBM */
/* Analyse flags
*/
More information about the x264-devel mailing list