[x264-devel] Check for OS AVX support in addition to CPUID

Thu Jan 27 15:07:55 CET 2011

x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Jan 27 05:33:25 2011 -0800| [f6d0c95b964d52780891c39f6ec93022b6ec1cb0] | committer: Jason Garrett-Glaser

Check for OS AVX support in addition to CPUID
Even if ymm registers are not used, AVX instructions will cause SIGILLs on OSes that do not support AVX.
On Windows, AVX is only available on Windows 7 SP1 or later.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f6d0c95b964d52780891c39f6ec93022b6ec1cb0
---

 common/cpu.c         |   13 ++++++++--
 common/x86/cpu-a.asm |   62 +++++++++++++++++++++++++-------------------------
 x264.h               |    5 +--
 3 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index 6885746..e77253d 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -94,7 +94,8 @@ static void sigill_handler( int sig )
 
 #if HAVE_MMX
 int x264_cpu_cpuid_test( void );
-uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+void x264_cpu_xgetbv( uint32_t op, int *eax, int *edx );
 
 uint32_t x264_cpu_detect( void )
 {
@@ -130,8 +131,14 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_SSE4;
     if( ecx&0x00100000 )
         cpu |= X264_CPU_SSE42;
-    if( ecx&0x10000000 )
-        cpu |= X264_CPU_AVX;
+    /* Check OSXSAVE and AVX bits */
+    if( (ecx&0x18000000) == 0x18000000 )
+    {
+        /* Check for OS support */
+        x264_cpu_xgetbv( 0, &eax, &edx );
+        if( (eax&0x6) == 0x6 )
+            cpu |= X264_CPU_AVX;
+    }
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm
index c2dd72d..02265bc 100644
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -29,27 +29,43 @@
 
 SECTION .text
 
-%ifdef ARCH_X86_64
-
 ;-----------------------------------------------------------------------------
-; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
 ;-----------------------------------------------------------------------------
 cglobal cpu_cpuid, 5,7
-    push    rbx
-    mov     r11,   r1
-    mov     r10,   r2
-    movifnidn r9,  r3
-    movifnidn r8,  r4
-    mov     eax,   r0d
+    push rbx
+    push  r4
+    push  r3
+    push  r2
+    push  r1
+    mov  eax, r0d
     cpuid
-    mov     [r11], eax
-    mov     [r10], ebx
-    mov     [r9],  ecx
-    mov     [r8],  edx
-    pop     rbx
+    pop  rsi
+    mov [rsi], eax
+    pop  rsi
+    mov [rsi], ebx
+    pop  rsi
+    mov [rsi], ecx
+    pop  rsi
+    mov [rsi], edx
+    pop  rbx
+    RET
+
+;-----------------------------------------------------------------------------
+; void cpu_xgetbv( int op, int *eax, int *edx )
+;-----------------------------------------------------------------------------
+cglobal cpu_xgetbv, 3,7
+    push  r2
+    push  r1
+    mov  ecx, r0d
+    xgetbv
+    pop  rsi
+    mov [rsi], eax
+    pop  rsi
+    mov [rsi], edx
     RET
 
-%else
+%ifndef ARCH_X86_64
 
 ;-----------------------------------------------------------------------------
 ; int cpu_cpuid_test( void )
@@ -78,22 +94,6 @@ cglobal cpu_cpuid_test
     ret
 
 ;-----------------------------------------------------------------------------
-; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
-;-----------------------------------------------------------------------------
-cglobal cpu_cpuid, 0,6
-    mov     eax,    r0m
-    cpuid
-    mov     esi,    r1m
-    mov     [esi],  eax
-    mov     esi,    r2m
-    mov     [esi],  ebx
-    mov     esi,    r3m
-    mov     [esi],  ecx
-    mov     esi,    r4m
-    mov     [esi],  edx
-    RET
-
-;-----------------------------------------------------------------------------
 ; void stack_align( void (*func)(void*), void *arg );
 ;-----------------------------------------------------------------------------
 cglobal stack_align
diff --git a/x264.h b/x264.h
index 5234bc0..8f39497 100644
--- a/x264.h
+++ b/x264.h
@@ -122,9 +122,8 @@ typedef struct
 #define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 #define X264_CPU_SLOW_CTZ       0x100000  /* BSR/BSF x86 instructions are really slow on some CPUs */
 #define X264_CPU_SLOW_ATOM      0x200000  /* The Atom just sucks */
-#define X264_CPU_AVX            0x400000  /* AVX support -- we don't currently use YMM registers, just
-                                           * the 3-operand capability, so we don't require OS support
-                                           * for AVX. */
+#define X264_CPU_AVX            0x400000  /* AVX support: requires OS support even if YMM registers
+                                           * aren't used. */
 
 /* Analyse flags
  */