[x264-devel] TBM, AVX2, FMA3, BMI1, and BMI2 CPU detection support

Jason Garrett-Glaser git at videolan.org
Sat Feb 4 21:10:53 CET 2012


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Jan 19 14:56:54 2012 -0800| [da19765d723b06a1fa189478e9da61a1c18490f8] | committer: Jason Garrett-Glaser

TBM, AVX2, FMA3, BMI1, and BMI2 CPU detection support
TBM and BMI1 are supported by AMD's Trinity/Piledriver.
The others (AVX2, FMA3, and BMI2), along with BMI1, will probably appear in Intel's upcoming Haswell.
Also update x86inc.asm with the new cpuflags and additional AVX instruction definitions.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=da19765d723b06a1fa189478e9da61a1c18490f8
---

 common/cpu.c          |   31 ++++++++++++++++++++++++++++---
 common/x86/cpu-a.asm  |    1 +
 common/x86/x86inc.asm |   19 ++++++++++++++++---
 tools/checkasm.c      |   30 ++++++++++++++++++++++++++++++
 x264.h                |    5 +++++
 5 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index 78424c8..fa57407 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -62,14 +62,21 @@ const x264_cpu_name_t x264_cpu_names[] =
     {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
-    {"AVX",         SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
-    {"XOP",         SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
-    {"FMA4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
+#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
+    {"AVX",         AVX},
+    {"XOP",         AVX|X264_CPU_XOP},
+    {"FMA4",        AVX|X264_CPU_FMA4},
+    {"AVX2",        AVX|X264_CPU_AVX2},
+    {"FMA3",        AVX|X264_CPU_FMA3},
+#undef AVX
 #undef SSE2
     {"Cache32",         X264_CPU_CACHELINE_32},
     {"Cache64",         X264_CPU_CACHELINE_64},
     {"SSEMisalign",     X264_CPU_SSE_MISALIGN},
     {"LZCNT",           X264_CPU_LZCNT},
+    {"BMI1",            X264_CPU_BMI1},
+    {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
+    {"TBM",             X264_CPU_TBM},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
     {"ARMv6",           X264_CPU_ARMV6},
     {"NEON",            X264_CPU_NEON},
@@ -143,7 +150,22 @@ uint32_t x264_cpu_detect( void )
         /* Check for OS support */
         x264_cpu_xgetbv( 0, &eax, &edx );
         if( (eax&0x6) == 0x6 )
+        {
             cpu |= X264_CPU_AVX;
+            if( ecx&0x00001000 )
+                cpu |= X264_CPU_FMA3;
+        }
+    }
+
+    x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
+    /* AVX2 requires OS support, but BMI1/2 don't. */
+    if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
+        cpu |= X264_CPU_AVX2;
+    if( ebx&0x00000008 )
+    {
+        cpu |= X264_CPU_BMI1;
+        if( ebx&0x00000100 )
+            cpu |= X264_CPU_BMI2;
     }
 
     if( cpu & X264_CPU_SSSE3 )
@@ -185,6 +207,9 @@ uint32_t x264_cpu_detect( void )
                 if( ecx&0x00010000 ) /* FMA4 */
                     cpu |= X264_CPU_FMA4;
             }
+
+            if( ecx&0x00200000 )
+                cpu |= X264_CPU_TBM;
         }
     }
 
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm
index 05d7f64..85985b4 100644
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -39,6 +39,7 @@ cglobal cpu_cpuid, 5,7
     push  r2
     push  r1
     mov  eax, r0d
+    xor  ecx, ecx
     cpuid
     pop  rsi
     mov [rsi], eax
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 57ebc85..29ac7fa 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -554,6 +554,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_avx      (1<<9) | cpuflags_sse42
 %assign cpuflags_xop      (1<<10)| cpuflags_avx
 %assign cpuflags_fma4     (1<<11)| cpuflags_avx
+%assign cpuflags_avx2     (1<<12)| cpuflags_avx
+%assign cpuflags_fma3     (1<<13)| cpuflags_avx
 
 %assign cpuflags_cache32  (1<<16)
 %assign cpuflags_cache64  (1<<17)
@@ -561,6 +563,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_lzcnt    (1<<19)
 %assign cpuflags_misalign (1<<20)
 %assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_bmi1     (1<<22)
+%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+%assign cpuflags_tbm      (1<<24)|cpuflags_bmi1
 
 %define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
@@ -822,10 +827,10 @@ INIT_XMM
 ;%4 == number of operands given
 ;%5+: operands
 %macro RUN_AVX_INSTR 6-7+
-    %ifid %5
-        %define %%sizeofreg sizeof%5
-    %elifid %6
+    %ifid %6
         %define %%sizeofreg sizeof%6
+    %elifid %5
+        %define %%sizeofreg sizeof%5
     %else
         %define %%sizeofreg mmsize
     %endif
@@ -948,6 +953,9 @@ AVX_INSTR mulsd, 1, 0, 1
 AVX_INSTR mulss, 1, 0, 1
 AVX_INSTR orpd, 1, 0, 1
 AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
 AVX_INSTR packsswb, 0, 0, 0
 AVX_INSTR packssdw, 0, 0, 0
 AVX_INSTR packuswb, 0, 0, 0
@@ -999,6 +1007,7 @@ AVX_INSTR pminsd, 0, 0, 1
 AVX_INSTR pminub, 0, 0, 1
 AVX_INSTR pminuw, 0, 0, 1
 AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
 AVX_INSTR pmulhuw, 0, 0, 1
 AVX_INSTR pmulhrsw, 0, 0, 1
 AVX_INSTR pmulhw, 0, 0, 1
@@ -1009,6 +1018,9 @@ AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR por, 0, 0, 1
 AVX_INSTR psadbw, 0, 0, 1
 AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
 AVX_INSTR psignb, 0, 0, 0
 AVX_INSTR psignw, 0, 0, 0
 AVX_INSTR psignd, 0, 0, 0
@@ -1030,6 +1042,7 @@ AVX_INSTR psubsb, 0, 0, 0
 AVX_INSTR psubsw, 0, 0, 0
 AVX_INSTR psubusb, 0, 0, 0
 AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
 AVX_INSTR punpckhbw, 0, 0, 0
 AVX_INSTR punpckhwd, 0, 0, 0
 AVX_INSTR punpckhdq, 0, 0, 0
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 97fb331..13b6bfa 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -164,6 +164,8 @@ static void print_bench(void)
             if( k < j )
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+                    b->cpu&X264_CPU_AVX2 ? "avx2" :
+                    b->cpu&X264_CPU_FMA3 ? "fma3" :
                     b->cpu&X264_CPU_FMA4 ? "fma4" :
                     b->cpu&X264_CPU_XOP ? "xop" :
                     b->cpu&X264_CPU_AVX ? "avx" :
@@ -182,6 +184,9 @@ static void print_bench(void)
                     b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                     b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+                    b->cpu&X264_CPU_BMI2 ? "_bmi2" :
+                    b->cpu&X264_CPU_TBM ? "_tbm" :
+                    b->cpu&X264_CPU_BMI1 ? "_bmi1" :
                     b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
                     b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
                     b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
@@ -2405,7 +2410,32 @@ static int check_all_flags( void )
     if( x264_cpu_detect() & X264_CPU_XOP )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
     if( x264_cpu_detect() & X264_CPU_FMA4 )
+    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
+        cpu1 &= ~X264_CPU_FMA4;
+    }
+    if( x264_cpu_detect() & X264_CPU_FMA3 )
+    {
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+        cpu1 &= ~X264_CPU_FMA3;
+    }
+    if( x264_cpu_detect() & X264_CPU_BMI1 )
+    {
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
+        if( x264_cpu_detect() & X264_CPU_TBM )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" );
+            cpu1 &= ~X264_CPU_TBM;
+        }
+        if( x264_cpu_detect() & X264_CPU_BMI2 )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
+            cpu1 &= ~X264_CPU_BMI2;
+        }
+        cpu1 &= ~X264_CPU_BMI1;
+    }
+    if( x264_cpu_detect() & X264_CPU_AVX2 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {
diff --git a/x264.h b/x264.h
index 965b96d..322efb4 100644
--- a/x264.h
+++ b/x264.h
@@ -127,6 +127,11 @@ typedef struct
                                              * aren't used. */
 #define X264_CPU_XOP             0x0800000  /* AMD XOP */
 #define X264_CPU_FMA4            0x1000000  /* AMD FMA4 */
+#define X264_CPU_AVX2            0x2000000  /* AVX2 */
+#define X264_CPU_FMA3            0x4000000  /* Intel FMA3 */
+#define X264_CPU_BMI1            0x8000000  /* BMI1 */
+#define X264_CPU_BMI2           0x10000000  /* BMI2 */
+#define X264_CPU_TBM            0x20000000  /* AMD TBM */
 
 /* Analyse flags
  */



More information about the x264-devel mailing list