[x265] [PATCH] asm: adopt x264 CPU detection and flags

Steve Borho steve at borho.org
Fri Oct 11 03:03:50 CEST 2013

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1381389912 18000
#      Thu Oct 10 02:25:12 2013 -0500
# Node ID 891e90e568768f0e9995c7d34c22dde2f3d0bbc4
# Parent  3c97a8e40dba1bc571172b39dbf0083b9c324501
asm: adopt x264 CPU detection and flags

diff -r 3c97a8e40dba -r 891e90e56876 source/VectorClass/instrset.h
--- a/source/VectorClass/instrset.h	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/VectorClass/instrset.h	Thu Oct 10 02:25:12 2013 -0500
@@ -140,12 +140,6 @@
 #include <intrin.h>                              // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
 #endif // _MSC_VER
-// functions in instrset_detect.cpp
-int  instrset_detect(void);                      // tells which instruction sets are supported
-bool hasFMA3(void);                              // true if FMA3 instructions supported
-bool hasFMA4(void);                              // true if FMA4 instructions supported
-bool hasXOP(void);                               // true if XOP  instructions supported
 // GCC version
 #if defined(__GNUC__) && !defined(GCC_VERSION)
 #define GCC_VERSION  ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
diff -r 3c97a8e40dba -r 891e90e56876 source/VectorClass/instrset_detect.cpp
--- a/source/VectorClass/instrset_detect.cpp	Tue Sep 10 22:26:32 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,185 +0,0 @@
-/**************************  instrset_detect.cpp   ****************************
-| Author:        Agner Fog
-| Date created:  2012-05-30
-| Last modified: 2012-07-08
-| Version:       1.01 Beta
-| Project:       vector classes
-| Description:
-| Functions for checking which instruction sets are supported.
-| (c) Copyright 2012 GNU General Public License http://www.gnu.org/licenses
-#include "instrset.h"
-// Define interface to cpuid instruction.
-// input:  eax = functionnumber, ecx = 0
-// output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
-static inline void cpuid(int output[4], int functionnumber)
-#if defined(_MSC_VER) || (_WIN32 && defined(__INTEL_COMPILER))
-    // Microsoft or Intel compiler, intrin.h included
-    __cpuidex(output, functionnumber, 0);          // intrinsic function for CPUID
-#elif defined(__GNUC__)                            // use inline assembly, Gnu/AT&T syntax
-    int a, b, c, d;
-    __asm("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (functionnumber), "c" (0) :);
-    output[0] = a;
-    output[1] = b;
-    output[2] = c;
-    output[3] = d;
-#else                                              // unknown platform. try inline assembly with masm/intel syntax
-    __asm {
-        mov eax, functionnumber
-        xor ecx, ecx
-            cpuid;
-        mov esi, output
-            mov[esi],    eax
-            mov[esi + 4],  ebx
-            mov[esi + 8],  ecx
-            mov[esi + 12], edx
-    }
-#endif // if defined(_MSC_VER) || defined(__INTEL_COMPILER)
-// Define interface to xgetbv instruction
-static inline int64_t xgetbv(int ctr)
-#if MACOS
-    return 0 & ctr;
-#elif (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-    // MSVC 2010 SP1 or later, or similar Intel release
-    return _xgetbv(ctr);   // intrinsic function for XGETBV
-#elif defined(__GNUC__)    // use inline assembly, Gnu/AT&T syntax
-    uint32_t a, d;
-    __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (ctr) :);
-    return a | (uint64_t(d) << 32);
-#elif defined(_WIN64)      // On x64 with older compilers, this is impossible
-    return 0 & ctr;
-#else                      // other compiler (x86). try inline assembly with masm/intel/MS syntax
-    uint32_t a, d;
-    __asm {
-        mov ecx, ctr
-            _emit 0x0f
-        _emit 0x01
-        _emit 0xd0; // xgetbv
-        mov a, eax
-        mov d, edx
-    }
-    return a | (uint64_t(d) << 32);
-#endif // if MSVC10 SP1
-/* find supported instruction set
-    return value:
-    0           = 80386 instruction set
-    1  or above = SSE (XMM) supported by CPU (not testing for O.S. support)
-    2  or above = SSE2
-    3  or above = SSE3
-    4  or above = Supplementary SSE3 (SSSE3)
-    5  or above = SSE4.1
-    6  or above = SSE4.2
-    7  or above = AVX supported by CPU and operating system
-    8  or above = AVX2
-int instrset_detect(void)
-    static int iset = -1;                        // remember value for next call
-    if (iset >= 0)
-    {
-        return iset;                             // called before
-    }
-    iset = 0;                                    // default value
-    int abcd[4] = { 0, 0, 0, 0 };                // cpuid results
-    cpuid(abcd, 0);                              // call cpuid function 0
-    if (abcd[0] == 0) return iset; // no further cpuid function supported
-    cpuid(abcd, 1);                              // call cpuid function 1 for feature flags
-    if ((abcd[3] & (1 <<  0)) == 0) return iset; // no floating point
-    if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX
-    if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move
-    if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE
-    if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE
-    iset = 1;                                    // 1: SSE supported
-    if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2
-    iset = 2;                                    // 2: SSE2 supported
-    if ((abcd[2] & (1 <<  0)) == 0) return iset; // no SSE3
-    iset = 3;                                    // 3: SSE3 supported
-    if ((abcd[2] & (1 <<  9)) == 0) return iset; // no SSSE3
-    iset = 4;                                    // 4: SSSE3 supported
-    if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1
-    iset = 5;                                    // 5: SSE4.1 supported
-    if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT
-    if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2
-    iset = 6;                                    // 6: SSE4.2 supported
-    if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE
-    if ((xgetbv(0) & 6) != 6) return iset;       // AVX not enabled in O.S.
-    if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX
-    iset = 7;                                    // 7: AVX supported
-    cpuid(abcd, 7);                              // call cpuid leaf 7 for feature flags
-    if ((abcd[1] & (1 <<  5)) == 0) return iset; // no AVX2
-    iset = 8;                                    // 8: AVX2 supported
-    return iset;
-// detect if CPU supports the FMA3 instruction set
-bool hasFMA3(void)
-    if (instrset_detect() < 7) return false; // must have AVX
-    int abcd[4];                                 // cpuid results
-    cpuid(abcd, 1);                              // call cpuid function 1
-    return (abcd[2] & (1 << 12)) != 0;           // ecx bit 12 indicates FMA3
-// detect if CPU supports the FMA4 instruction set
-bool hasFMA4(void)
-    if (instrset_detect() < 7) return false; // must have AVX
-    int abcd[4];                                 // cpuid results
-    cpuid(abcd, 0x80000001);                     // call cpuid function 0x80000001
-    return (abcd[2] & (1 << 16)) != 0;           // ecx bit 16 indicates FMA4
-// detect if CPU supports the XOP instruction set
-bool hasXOP(void)
-    if (instrset_detect() < 7) return false; // must have AVX
-    int abcd[4];                                 // cpuid results
-    cpuid(abcd, 0x80000001);                     // call cpuid function 0x80000001
-    return (abcd[2] & (1 << 11)) != 0;           // ecx bit 11 indicates XOP
diff -r 3c97a8e40dba -r 891e90e56876 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/CMakeLists.txt	Thu Oct 10 02:25:12 2013 -0500
@@ -223,7 +223,7 @@
     primitives.cpp primitives.h
     pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp
-    ../VectorClass/instrset_detect.cpp
+    cpu.cpp cpu.h
     threading.cpp threading.h
     threadpool.cpp threadpool.h
     wavefront.h wavefront.cpp
diff -r 3c97a8e40dba -r 891e90e56876 source/common/cpu.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/cpu.cpp	Thu Oct 10 02:25:12 2013 -0500
@@ -0,0 +1,291 @@
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Loren Merritt <lorenm at u.washington.edu>
+ *          Laurent Aimar <fenrir at via.ecp.fr>
+ *          Jason Garrett-Glaser <darkshikari at gmail.com>
+ *          Steve Borho <steve at borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+#include "cpu.h"
+#include "common.h"
+#include <cstring>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+namespace x265 {
+const cpu_name_t cpu_names[] =
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
+    { "MMX2",        MMX2 },
+    { "MMXEXT",      MMX2 },
+    { "SSE",         MMX2 | X265_CPU_SSE },
+#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
+    { "SSE2Slow",    SSE2 | X265_CPU_SSE2_IS_SLOW },
+    { "SSE2",        SSE2 },
+    { "SSE2Fast",    SSE2 | X265_CPU_SSE2_IS_FAST },
+    { "SSE3",        SSE2 | X265_CPU_SSE3 },
+    { "SSSE3",       SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
+    { "SSE4.1",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
+    { "SSE4",        SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
+    { "SSE4.2",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
+#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
+    { "AVX",         AVX },
+    { "XOP",         AVX | X265_CPU_XOP },
+    { "FMA4",        AVX | X265_CPU_FMA4 },
+    { "AVX2",        AVX | X265_CPU_AVX2 },
+    { "FMA3",        AVX | X265_CPU_FMA3 },
+#undef AVX
+#undef SSE2
+#undef MMX2
+    { "Cache32",         X265_CPU_CACHELINE_32 },
+    { "Cache64",         X265_CPU_CACHELINE_64 },
+    { "LZCNT",           X265_CPU_LZCNT },
+    { "BMI1",            X265_CPU_BMI1 },
+    { "BMI2",            X265_CPU_BMI1 | X265_CPU_BMI2 },
+    { "SlowCTZ",         X265_CPU_SLOW_CTZ },
+    { "SlowAtom",        X265_CPU_SLOW_ATOM },
+    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
+    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
+    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
+    { "UnalignedStack",  X265_CPU_STACK_MOD4 },
+    { "", 0 },
+extern "C" {
+/* cpu-a.asm */
+int x265_cpu_cpuid_test(void);
+void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
+#if defined(_MSC_VER)
+#pragma warning(disable: 4309) // truncation of constant value
+uint32_t cpu_detect(void)
+    uint32_t cpu = 0;
+    uint32_t eax, ebx, ecx, edx;
+    uint32_t vendor[4] = { 0 };
+    uint32_t max_extended_cap, max_basic_cap;
+    int cache;
+#if !X86_64
+    if (!x265_cpu_cpuid_test())
+        return 0;
+    x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
+    max_basic_cap = eax;
+    if (max_basic_cap == 0)
+        return 0;
+    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+    if (edx & 0x00800000)
+        cpu |= X265_CPU_MMX;
+    else
+        return cpu;
+    if (edx & 0x02000000)
+        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
+    if (edx & 0x00008000)
+        cpu |= X265_CPU_CMOV;
+    else
+        return cpu;
+    if (edx & 0x04000000)
+        cpu |= X265_CPU_SSE2;
+    if (ecx & 0x00000001)
+        cpu |= X265_CPU_SSE3;
+    if (ecx & 0x00000200)
+        cpu |= X265_CPU_SSSE3;
+    if (ecx & 0x00080000)
+        cpu |= X265_CPU_SSE4;
+    if (ecx & 0x00100000)
+        cpu |= X265_CPU_SSE42;
+    /* Check OSXSAVE and AVX bits */
+    if ((ecx & 0x18000000) == 0x18000000)
+    {
+        /* Check for OS support */
+        x265_cpu_xgetbv(0, &eax, &edx);
+        if ((eax & 0x6) == 0x6)
+        {
+            cpu |= X265_CPU_AVX;
+            if (ecx & 0x00001000)
+                cpu |= X265_CPU_FMA3;
+        }
+    }
+    if (max_basic_cap >= 7)
+    {
+        x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
+        /* AVX2 requires OS support, but BMI1/2 don't. */
+        if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
+            cpu |= X265_CPU_AVX2;
+        if (ebx & 0x00000008)
+        {
+            cpu |= X265_CPU_BMI1;
+            if (ebx & 0x00000100)
+                cpu |= X265_CPU_BMI2;
+        }
+    }
+    if (cpu & X265_CPU_SSSE3)
+        cpu |= X265_CPU_SSE2_IS_FAST;
+    x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+    max_extended_cap = eax;
+    if (max_extended_cap >= 0x80000001)
+    {
+        x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+        if (ecx & 0x00000020)
+            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
+        if (ecx & 0x00000040) /* SSE4a, AMD only */
+        {
+            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
+            if (family == 0x14)
+            {
+                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
+                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
+                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
+            }
+            if (family == 0x16)
+            {
+                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
+                                                * compared to alternate instruction sequences that this
+                                                * is equal or faster on almost all such functions. */
+            }
+        }
+        if (cpu & X265_CPU_AVX)
+        {
+            if (ecx & 0x00000800) /* XOP */
+                cpu |= X265_CPU_XOP;
+            if (ecx & 0x00010000) /* FMA4 */
+                cpu |= X265_CPU_FMA4;
+        }
+        if (!strcmp((char*)vendor, "AuthenticAMD"))
+        {
+            if (edx & 0x00400000)
+                cpu |= X265_CPU_MMX2;
+            if (!(cpu & X265_CPU_LZCNT))
+                cpu |= X265_CPU_SLOW_CTZ;
+            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
+                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
+        }
+    }
+    if (!strcmp((char*)vendor, "GenuineIntel"))
+    {
+        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+        if (family == 6)
+        {
+            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+             * theoretically support sse2, but it's significantly slower than mmx for
+             * almost all of x264's functions, so let's just pretend they don't. */
+            if (model == 9 || model == 13 || model == 14)
+            {
+                cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
+                assert(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)));
+            }
+            /* Detect Atom CPU */
+            else if (model == 28)
+            {
+                cpu |= X265_CPU_SLOW_ATOM;
+                cpu |= X265_CPU_SLOW_CTZ;
+                cpu |= X265_CPU_SLOW_PSHUFB;
+            }
+            /* Conroe has a slow shuffle unit. Check the model number to make sure not
+             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
+            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
+                cpu |= X265_CPU_SLOW_SHUFFLE;
+        }
+    }
+    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
+    {
+        /* cacheline size is specified in 3 places, any of which may be missing */
+        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+        cache = (ebx & 0xff00) >> 5; // clflush line size (reported in 8-byte units, hence >> 5 rather than >> 8)
+        if (!cache && max_extended_cap >= 0x80000006)
+        {
+            x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+            cache = ecx & 0xff; // cacheline size
+        }
+        if (!cache && max_basic_cap >= 2)
+        {
+            // Cache and TLB Information
+            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
+            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
+                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
+            uint32_t buf[4];
+            int max, i = 0;
+            do
+            {
+                x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
+                max = buf[0] & 0xff;
+                buf[0] &= ~0xff;
+                for (int j = 0; j < 4; j++)
+                {
+                    if (!(buf[j] >> 31))
+                        while (buf[j])
+                        {
+                            if (strchr(cache32_ids, buf[j] & 0xff))
+                                cache = 32;
+                            if (strchr(cache64_ids, buf[j] & 0xff))
+                                cache = 64;
+                            buf[j] >>= 8;
+                        }
+                }
+            }
+            while (++i < max);
+        }
+        if (cache == 32)
+            cpu |= X265_CPU_CACHELINE_32;
+        else if (cache == 64)
+            cpu |= X265_CPU_CACHELINE_64;
+        else
+            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
+    }
+    cpu |= X265_CPU_STACK_MOD4;
+    return cpu;
diff -r 3c97a8e40dba -r 891e90e56876 source/common/cpu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/cpu.h	Thu Oct 10 02:25:12 2013 -0500
@@ -0,0 +1,61 @@
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Loren Merritt <lorenm at u.washington.edu>
+ *          Steve Borho <steve at borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+#ifndef X265_CPU_H
+#define X265_CPU_H
+#include <stdint.h>
+// from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp
+extern "C" void x265_cpu_emms(void);
+extern "C" void x265_safe_intel_cpu_indicator_init(void);
+#if _MSC_VER && _WIN64
+#define x265_emms() x265_cpu_emms()
+#elif _MSC_VER
+#include <mmintrin.h>
+#define x265_emms() _mm_empty()
+#elif __GNUC__
+// Cannot use _mm_empty() directly without compiling all the source with
+// a fixed CPU arch, which we would like to avoid at the moment
+#define x265_emms() x265_cpu_emms()
+#define x265_emms() x265_cpu_emms()
+namespace x265 {
+uint32_t cpu_detect(void);
+struct cpu_name_t
+    char name[16];
+    uint32_t flags;
+extern const cpu_name_t cpu_names[];
+#endif // ifndef X265_CPU_H
diff -r 3c97a8e40dba -r 891e90e56876 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/primitives.cpp	Thu Oct 10 02:25:12 2013 -0500
@@ -70,20 +70,6 @@
-static const char *CpuType[] =
-    "",
-    "",
-    "SSE2",
-    "SSE3",
-    "SSSE3",
-    "SSE4.1",
-    "SSE4.2",
-    "AVX",
-    "AVX2",
-    0
 using namespace x265;
 /* cpuid == 0 - auto-detect CPU type, else
@@ -105,22 +91,29 @@
     if (cpuid == 0)
-        cpuid = instrset_detect(); // Detect supported instruction set
+        cpuid = x265::cpu_detect();
         if (param->logLevel >= X265_LOG_INFO)
-            x265_log(param, X265_LOG_INFO, "detected SIMD: ");
-            for (int i = 2; i <= cpuid; i++)
+            char buf[1000];
+            char *p = buf + sprintf( buf, "using cpu capabilities:" );
+            for (int i = 0; x265::cpu_names[i].flags; i++)
-                fprintf(stderr, "%s ", CpuType[i]);
+                if (!strcmp(x265::cpu_names[i].name, "SSE2")
+                    && cpuid & (X265_CPU_SSE2_IS_FAST|X265_CPU_SSE2_IS_SLOW))
+                    continue;
+                if (!strcmp(x265::cpu_names[i].name, "SSE3")
+                    && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
+                    continue;
+                if (!strcmp(x265::cpu_names[i].name, "SSE4.1")
+                    && (cpuid & X265_CPU_SSE42))
+                    continue;
+                if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags
+                    && (!i || x265::cpu_names[i].flags != x265::cpu_names[i-1].flags))
+                    p += sprintf( p, " %s", x265::cpu_names[i].name );
-            if (cpuid >= 7)
-            {
-                if (hasXOP()) fprintf(stderr, "XOP ");
-                if (hasFMA3()) fprintf(stderr, "FMA3 ");
-                if (hasFMA4()) fprintf(stderr, "FMA4 ");
-            }
-            fprintf(stderr, "\n");
+            if( !cpuid )
+                p += sprintf( p, " none!" );
+            x265_log(param, X265_LOG_INFO, "%s\n", buf);
@@ -128,15 +121,12 @@
-    for (int i = 2; i <= cpuid; i++)
-    {
-        Setup_Vector_Primitives(primitives, 1 << i);
+    Setup_Vector_Primitives(primitives, cpuid);
-        Setup_Assembly_Primitives(primitives, 1 << i);
+    Setup_Assembly_Primitives(primitives, cpuid);
-    }
     primitives.sa8d_inter[PARTITION_8x8] = primitives.sa8d[BLOCK_8x8];
     primitives.sa8d_inter[PARTITION_16x16] = primitives.sa8d[BLOCK_16x16];
diff -r 3c97a8e40dba -r 891e90e56876 source/common/primitives.h
--- a/source/common/primitives.h	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/primitives.h	Thu Oct 10 02:25:12 2013 -0500
@@ -30,25 +30,10 @@
 #include <stdint.h>
 #include "x265.h"
+#include "cpu.h"
 #define FENC_STRIDE 64
-// from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp
-extern "C" void x265_cpu_emms(void);
-#if _MSC_VER && _WIN64
-#define x265_emms() x265_cpu_emms()
-#elif _MSC_VER
-#include <mmintrin.h>
-#define x265_emms() _mm_empty()
-#elif __GNUC__
-// Cannot use _mm_empty() directly without compiling all the source with
-// a fixed CPU arch, which we would like to avoid at the moment
-#define x265_emms() x265_cpu_emms()
-#define x265_emms() x265_cpu_emms()
 #if defined(__GNUC__)
 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
diff -r 3c97a8e40dba -r 891e90e56876 source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/vec/vec-primitives.cpp	Thu Oct 10 02:25:12 2013 -0500
@@ -23,7 +23,30 @@
 #include "primitives.h"
-bool hasXOP(void); // instr_detect.cpp
+#include <intrin.h>
+extern "C" {
+int x265_cpu_cpuid_test(void)
+    return 0;
+void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+    int output[4];
+    __cpuidex(output, op, 0);
+    *eax = output[0];
+    *ebx = output[1];
+    *ecx = output[2];
+    *edx = output[3];
+void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
+    uint64_t out = _xgetbv(op);
+    *eax = (uint32_t)out;
+    *edx = (uint32_t)(out >> 32);
 /* The #if logic here must match the file lists in CMakeLists.txt */
 #if defined(__INTEL_COMPILER)
@@ -74,7 +97,7 @@
 void Setup_Vector_Primitives(EncoderPrimitives &p, int cpuMask)
 #ifdef HAVE_SSE3
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSE3))
+    if (cpuMask & X265_CPU_SSE3)
@@ -83,7 +106,7 @@
 #ifdef HAVE_SSSE3
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSSE3))
+    if (cpuMask & X265_CPU_SSSE3)
@@ -91,7 +114,7 @@
 #ifdef HAVE_SSE4
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSE41))
+    if (cpuMask & X265_CPU_SSE4)
@@ -100,7 +123,7 @@
 #ifdef HAVE_AVX2
-    if (cpuMask & (1 << X265_CPU_LEVEL_AVX2))
+    if (cpuMask & X265_CPU_AVX2)
diff -r 3c97a8e40dba -r 891e90e56876 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 10 02:25:12 2013 -0500
@@ -26,7 +26,47 @@
 extern "C" {
 #include "pixel.h"
+/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
+ * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
+ * adapted to x265's cpu schema. */
+// Global variable indicating cpu
+int __intel_cpu_indicator = 0;
+// CPU dispatcher function
+void x265_intel_cpu_indicator_init(void)
+    unsigned int cpu = cpu_detect();
+    if (cpu&X265_CPU_AVX)
+        __intel_cpu_indicator = 0x20000;
+    else if (cpu&X265_CPU_SSE42)
+        __intel_cpu_indicator = 0x8000;
+    else if (cpu&X265_CPU_SSE4)
+        __intel_cpu_indicator = 0x2000;
+    else if (cpu&X265_CPU_SSSE3)
+        __intel_cpu_indicator = 0x1000;
+    else if (cpu&X265_CPU_SSE3)
+        __intel_cpu_indicator = 0x800;
+    else if (cpu&X265_CPU_SSE2 && !(cpu&X265_CPU_SSE2_IS_SLOW))
+        __intel_cpu_indicator = 0x200;
+    else if (cpu&X265_CPU_SSE)
+        __intel_cpu_indicator = 0x80;
+    else if (cpu&X265_CPU_MMX2)
+        __intel_cpu_indicator = 8;
+    else
+        __intel_cpu_indicator = 1;
+/* __intel_cpu_indicator_init appears to have a non-standard calling convention that
+ * assumes certain registers aren't preserved, so we'll route it through a function
+ * that backs up all the registers. */
+void __intel_cpu_indicator_init( void )
+    x265_safe_intel_cpu_indicator_init();
 void x265_intel_cpu_indicator_init( void ) {}
 #define LOWRES(cpu)\
     void x265_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
@@ -54,8 +94,6 @@
 void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
-bool hasXOP(void); // instr_detect.cpp
 using namespace x265;
 namespace {
@@ -175,9 +213,9 @@
 void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSE2)) p.sa8d[0] = p.sa8d[0];
+    if (cpuMask & X265_CPU_SSE2) p.sa8d[0] = p.sa8d[0];
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSE2))
+    if (cpuMask & X265_CPU_SSE2)
         INIT8_NAME( sse_pp, ssd, _mmx );
         INIT8( sad, _mmx2 );
@@ -214,7 +252,7 @@
         p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse2;
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSSE3))
+    if (cpuMask & X265_CPU_SSSE3)
         p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3;
         p.sa8d[BLOCK_8x8]   = x265_pixel_sa8d_8x8_ssse3;
@@ -228,7 +266,7 @@
         p.sad_x4[PARTITION_8x8] = x265_pixel_sad_x4_8x8_ssse3;
         p.sad_x4[PARTITION_8x16] = x265_pixel_sad_x4_8x16_ssse3;
-    if (cpuMask & (1 << X265_CPU_LEVEL_SSE41))
+    if (cpuMask & X265_CPU_SSE4)
         p.satd[PARTITION_4x16] = x265_pixel_satd_4x16_sse4;
         p.satd[PARTITION_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_sse4>;
@@ -240,7 +278,7 @@
         p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
-    if (cpuMask & (1 << X265_CPU_LEVEL_AVX))
+    if (cpuMask & X265_CPU_AVX)
         p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
         p.satd[PARTITION_4x16] = x265_pixel_satd_4x16_avx;
@@ -250,7 +288,7 @@
-    if ((cpuMask & (1 << X265_CPU_LEVEL_AVX)) && hasXOP())
+    if (cpuMask & X265_CPU_XOP)
         p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
         p.sa8d[BLOCK_8x8]   = x265_pixel_sa8d_8x8_xop;
@@ -260,7 +298,7 @@
         INIT5_NAME( sse_pp, ssd, _xop );
-    if (cpuMask & (1 << X265_CPU_LEVEL_AVX2))
+    if (cpuMask & X265_CPU_AVX2)
         INIT2( sad_x4, _avx2 );
         INIT4( satd, _avx2 );
diff -r 3c97a8e40dba -r 891e90e56876 source/test/testbench.cpp
--- a/source/test/testbench.cpp	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/test/testbench.cpp	Thu Oct 10 02:25:12 2013 -0500
@@ -28,6 +28,7 @@
 #include "mbdstharness.h"
 #include "ipfilterharness.h"
 #include "intrapredharness.h"
+#include "cpu.h"
 #include <stdio.h>
 #include <stdlib.h>
@@ -36,25 +37,9 @@
 using namespace x265;
-static const char *CpuType[] =
-    "",
-    "",
-    "SSE2",
-    "SSE3",
-    "SSSE3",
-    "SSE4.1",
-    "SSE4.2",
-    "AVX",
-    "AVX2",
-    0
-extern int instrset_detect();
 int main(int argc, char *argv[])
-    int cpuid = instrset_detect(); // Detect supported instruction set
+    int cpuid = x265::cpu_detect();
     const char *testname = 0;
     int cpuid_user = -1;
@@ -94,20 +79,32 @@
     memset(&cprim, 0, sizeof(EncoderPrimitives));
-    int cpuid_low = 2;
-    int cpuid_high = cpuid;
+    struct test_arch_t
+    {
+        char name[12];
+        int flag;
+    } test_arch[] = {
+        { "SSE2", X265_CPU_SSE2 },
+        { "SSE3", X265_CPU_SSE3 },
+        { "SSSE3", X265_CPU_SSSE3 },
+        { "SSE4", X265_CPU_SSE4 },
+        { "AVX", X265_CPU_AVX },
+        { "XOP", X265_CPU_XOP },
+        { "AVX2", X265_CPU_AVX2 },
+        { "", 0 },
+    };
-    if (cpuid_user >= 0)
+    for (int i = 0; test_arch[i].flag; i++)
-        cpuid_low = cpuid_high = cpuid_user;
-    }
-    for (int i = cpuid_low; i <= cpuid_high; i++)
-    {
+        if (test_arch[i].flag & cpuid)
+            printf("Testing primitives: %s\n", test_arch[i].name);
+        else
+            continue;
         EncoderPrimitives vecprim;
         memset(&vecprim, 0, sizeof(vecprim));
-        Setup_Vector_Primitives(vecprim, 1 << i);
-        printf("Testing intrinsic primitives: %s (%d)\n", CpuType[i], i);
+        Setup_Vector_Primitives(vecprim, test_arch[i].flag);
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
             if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -124,8 +121,7 @@
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
-        Setup_Assembly_Primitives(asmprim, 1 << i);
-        printf("Testing assembly primitives: %s (%d)\n", CpuType[i], i);
+        Setup_Assembly_Primitives(asmprim, test_arch[i].flag);
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
             if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -137,7 +133,6 @@
                 return -1;
@@ -145,15 +140,12 @@
     EncoderPrimitives optprim;
     memset(&optprim, 0, sizeof(optprim));
-    for (int i = 2; i <= cpuid; i++)
-    {
-        Setup_Vector_Primitives(optprim, 1 << i);
+    Setup_Vector_Primitives(optprim, cpuid);
-        Setup_Assembly_Primitives(optprim, 1 << i);
+    Setup_Assembly_Primitives(optprim, cpuid);
-    }
     printf("\nTest performance improvement with full optimizations\n");
diff -r 3c97a8e40dba -r 891e90e56876 source/x265.h
--- a/source/x265.h	Tue Sep 10 22:26:32 2013 -0500
+++ b/source/x265.h	Thu Oct 10 02:25:12 2013 -0500
@@ -152,6 +152,43 @@
+/* CPU flags */
+/* x86 */
+#define X265_CPU_CMOV            0x0000001
+#define X265_CPU_MMX             0x0000002
+#define X265_CPU_MMX2            0x0000004  /* MMX2 aka MMXEXT aka ISSE */
+#define X265_CPU_MMXEXT          X265_CPU_MMX2
+#define X265_CPU_SSE             0x0000008
+#define X265_CPU_SSE2            0x0000010
+#define X265_CPU_SSE3            0x0000020
+#define X265_CPU_SSSE3           0x0000040
+#define X265_CPU_SSE4            0x0000080  /* SSE4.1 */
+#define X265_CPU_SSE42           0x0000100  /* SSE4.2 */
+#define X265_CPU_LZCNT           0x0000200  /* Phenom support for "leading zero count" instruction. */
+#define X265_CPU_AVX             0x0000400  /* AVX support: requires OS support even if YMM registers aren't used. */
+#define X265_CPU_XOP             0x0000800  /* AMD XOP */
+#define X265_CPU_FMA4            0x0001000  /* AMD FMA4 */
+#define X265_CPU_AVX2            0x0002000  /* AVX2 */
+#define X265_CPU_FMA3            0x0004000  /* Intel FMA3 */
+#define X265_CPU_BMI1            0x0008000  /* BMI1 */
+#define X265_CPU_BMI2            0x0010000  /* BMI2 */
+/* x86 modifiers */
+#define X265_CPU_CACHELINE_32    0x0020000  /* avoid memory loads that span the border between two cachelines */
+#define X265_CPU_CACHELINE_64    0x0040000  /* 32/64 is the size of a cacheline in bytes */
+#define X265_CPU_SSE2_IS_SLOW    0x0080000  /* avoid most SSE2 functions on Athlon64 */
+#define X265_CPU_SSE2_IS_FAST    0x0100000  /* a few functions are only faster on Core2 and Phenom */
+#define X265_CPU_SLOW_SHUFFLE    0x0200000  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X265_CPU_STACK_MOD4      0x0400000  /* if stack is only mod4 and not mod16 */
+#define X265_CPU_SLOW_CTZ        0x0800000  /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X265_CPU_SLOW_ATOM       0x1000000  /* The Atom is terrible: slow SSE unaligned loads, slow
+                                             * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
+                                             * cacheline split penalties -- gather everything here that
+                                             * isn't shared by other CPUs to avoid making half a dozen
+                                             * new SLOW flags. */
+#define X265_CPU_SLOW_PSHUFB     0x2000000  /* such as on the Intel Atom */
+#define X265_CPU_SLOW_PALIGNR    0x4000000  /* such as on the AMD Bobcat */
 static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "full", 0 };
 #define X265_MAX_SUBPEL_LEVEL   7
@@ -299,20 +336,10 @@
-#define X265_CPU_LEVEL_AUTO  0
-#define X265_CPU_LEVEL_NONE  1 // C code only, no SIMD
-#define X265_CPU_LEVEL_SSE2  2
-#define X265_CPU_LEVEL_SSE3  3
-#define X265_CPU_LEVEL_SSSE3 4
-#define X265_CPU_LEVEL_SSE41 5
-#define X265_CPU_LEVEL_SSE42 6
-#define X265_CPU_LEVEL_AVX   7
-#define X265_CPU_LEVEL_AVX2  8
  * If not called, first encoder allocated will auto-detect the CPU and
  * initialize performance primitives, which are process global */
-void x265_setup_primitives(x265_param_t *param, int cpulevel);
+void x265_setup_primitives(x265_param_t *param, int cpu);
  * Initialize an x265_param_t structure to default values

More information about the x265-devel mailing list