[x265] [PATCH] asm: adopt x264 CPU detection and flags
Steve Borho
steve at borho.org
Fri Oct 11 03:03:50 CEST 2013
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1381389912 18000
# Thu Oct 10 02:25:12 2013 -0500
# Node ID 891e90e568768f0e9995c7d34c22dde2f3d0bbc4
# Parent 3c97a8e40dba1bc571172b39dbf0083b9c324501
asm: adopt x264 CPU detection and flags
diff -r 3c97a8e40dba -r 891e90e56876 source/VectorClass/instrset.h
--- a/source/VectorClass/instrset.h Tue Sep 10 22:26:32 2013 -0500
+++ b/source/VectorClass/instrset.h Thu Oct 10 02:25:12 2013 -0500
@@ -140,12 +140,6 @@
#include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
#endif // _MSC_VER
-// functions in instrset_detect.cpp
-int instrset_detect(void); // tells which instruction sets are supported
-bool hasFMA3(void); // true if FMA3 instructions supported
-bool hasFMA4(void); // true if FMA4 instructions supported
-bool hasXOP(void); // true if XOP instructions supported
-
// GCC version
#if defined(__GNUC__) && !defined(GCC_VERSION)
#define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
diff -r 3c97a8e40dba -r 891e90e56876 source/VectorClass/instrset_detect.cpp
--- a/source/VectorClass/instrset_detect.cpp Tue Sep 10 22:26:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,185 +0,0 @@
-/************************** instrset_detect.cpp ****************************
-| Author: Agner Fog
-| Date created: 2012-05-30
-| Last modified: 2012-07-08
-| Version: 1.01 Beta
-| Project: vector classes
-| Description:
-| Functions for checking which instruction sets are supported.
-|
-| (c) Copyright 2012 GNU General Public License http://www.gnu.org/licenses
-\*****************************************************************************/
-
-#include "instrset.h"
-
-// Define interface to cpuid instruction.
-// input: eax = functionnumber, ecx = 0
-// output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
-static inline void cpuid(int output[4], int functionnumber)
-{
-#if defined(_MSC_VER) || (_WIN32 && defined(__INTEL_COMPILER))
-
- // Microsoft or Intel compiler, intrin.h included
- __cpuidex(output, functionnumber, 0); // intrinsic function for CPUID
-
-#elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax
-
- int a, b, c, d;
- __asm("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (functionnumber), "c" (0) :);
- output[0] = a;
- output[1] = b;
- output[2] = c;
- output[3] = d;
-
-#else // unknown platform. try inline assembly with masm/intel syntax
-
- __asm {
- mov eax, functionnumber
- xor ecx, ecx
- cpuid;
- mov esi, output
- mov[esi], eax
- mov[esi + 4], ebx
- mov[esi + 8], ecx
- mov[esi + 12], edx
- }
-
-#endif // if defined(_MSC_VER) || defined(__INTEL_COMPILER)
-}
-
-// Define interface to xgetbv instruction
-static inline int64_t xgetbv(int ctr)
-{
-#if MACOS
-
- return 0 & ctr;
-
-#elif (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
-
- // MSVC 2010 SP1 or later, or similar Intel release
- return _xgetbv(ctr); // intrinsic function for XGETBV
-
-#elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax
-
- uint32_t a, d;
- __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (ctr) :);
- return a | (uint64_t(d) << 32);
-
-#elif defined(_WIN64) // On x64 with older compilers, this is impossible
-
- return 0 & ctr;
-
-#else // other compiler (x86). try inline assembly with masm/intel/MS syntax
-
- uint32_t a, d;
- __asm {
- mov ecx, ctr
- _emit 0x0f
- _emit 0x01
- _emit 0xd0; // xgetbv
- mov a, eax
- mov d, edx
- }
- return a | (uint64_t(d) << 32);
-
-#endif // if MSVC10 SP1
-}
-
-/* find supported instruction set
- return value:
- 0 = 80386 instruction set
- 1 or above = SSE (XMM) supported by CPU (not testing for O.S. support)
- 2 or above = SSE2
- 3 or above = SSE3
- 4 or above = Supplementary SSE3 (SSSE3)
- 5 or above = SSE4.1
- 6 or above = SSE4.2
- 7 or above = AVX supported by CPU and operating system
- 8 or above = AVX2
-*/
-int instrset_detect(void)
-{
- static int iset = -1; // remember value for next call
-
- if (iset >= 0)
- {
- return iset; // called before
- }
-
- iset = 0; // default value
- int abcd[4] = { 0, 0, 0, 0 }; // cpuid results
- cpuid(abcd, 0); // call cpuid function 0
- if (abcd[0] == 0) return iset; // no further cpuid function supported
-
- cpuid(abcd, 1); // call cpuid function 1 for feature flags
- if ((abcd[3] & (1 << 0)) == 0) return iset; // no floating point
-
- if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX
-
- if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move
-
- if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE
-
- if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE
-
- iset = 1; // 1: SSE supported
- if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2
-
- iset = 2; // 2: SSE2 supported
- if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3
-
- iset = 3; // 3: SSE3 supported
- if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3
-
- iset = 4; // 4: SSSE3 supported
- if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1
-
- iset = 5; // 5: SSE4.1 supported
- if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT
-
- if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2
-
- iset = 6; // 6: SSE4.2 supported
- if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE
-
- if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S.
-
- if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX
-
- iset = 7; // 7: AVX supported
- cpuid(abcd, 7); // call cpuid leaf 7 for feature flags
- if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2
-
- iset = 8; // 8: AVX2 supported
- return iset;
-}
-
-// detect if CPU supports the FMA3 instruction set
-bool hasFMA3(void)
-{
- if (instrset_detect() < 7) return false; // must have AVX
-
- int abcd[4]; // cpuid results
- cpuid(abcd, 1); // call cpuid function 1
- return (abcd[2] & (1 << 12)) != 0; // ecx bit 12 indicates FMA3
-}
-
-// detect if CPU supports the FMA4 instruction set
-bool hasFMA4(void)
-{
- if (instrset_detect() < 7) return false; // must have AVX
-
- int abcd[4]; // cpuid results
- cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
- return (abcd[2] & (1 << 16)) != 0; // ecx bit 16 indicates FMA4
-}
-
-// detect if CPU supports the XOP instruction set
-bool hasXOP(void)
-{
- if (instrset_detect() < 7) return false; // must have AVX
-
- int abcd[4]; // cpuid results
- cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
- return (abcd[2] & (1 << 11)) != 0; // ecx bit 11 indicates XOP
-}
diff -r 3c97a8e40dba -r 891e90e56876 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/CMakeLists.txt Thu Oct 10 02:25:12 2013 -0500
@@ -223,7 +223,7 @@
${ASM_PRIMITIVES} ${VEC_PRIMITIVES}
primitives.cpp primitives.h
pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp
- ../VectorClass/instrset_detect.cpp
+ cpu.cpp cpu.h
threading.cpp threading.h
threadpool.cpp threadpool.h
wavefront.h wavefront.cpp
diff -r 3c97a8e40dba -r 891e90e56876 source/common/cpu.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/cpu.cpp Thu Oct 10 02:25:12 2013 -0500
@@ -0,0 +1,291 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Loren Merritt <lorenm at u.washington.edu>
+ * Laurent Aimar <fenrir at via.ecp.fr>
+ * Jason Garrett-Glaser <darkshikari at gmail.com>
+ * Steve Borho <steve at borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#include "cpu.h"
+#include "common.h"
+
+#include <cstring>
+#include <assert.h>
+
+#if MACOS || SYS_FREEBSD
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#if SYS_OPENBSD
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
+
+namespace x265 {
+const cpu_name_t cpu_names[] =
+{
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
+ { "MMX2", MMX2 },
+ { "MMXEXT", MMX2 },
+ { "SSE", MMX2 | X265_CPU_SSE },
+#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
+ { "SSE2Slow", SSE2 | X265_CPU_SSE2_IS_SLOW },
+ { "SSE2", SSE2 },
+ { "SSE2Fast", SSE2 | X265_CPU_SSE2_IS_FAST },
+ { "SSE3", SSE2 | X265_CPU_SSE3 },
+ { "SSSE3", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
+ { "SSE4.1", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
+ { "SSE4", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
+ { "SSE4.2", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
+#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
+ { "AVX", AVX },
+ { "XOP", AVX | X265_CPU_XOP },
+ { "FMA4", AVX | X265_CPU_FMA4 },
+ { "AVX2", AVX | X265_CPU_AVX2 },
+ { "FMA3", AVX | X265_CPU_FMA3 },
+#undef AVX
+#undef SSE2
+#undef MMX2
+ { "Cache32", X265_CPU_CACHELINE_32 },
+ { "Cache64", X265_CPU_CACHELINE_64 },
+ { "LZCNT", X265_CPU_LZCNT },
+ { "BMI1", X265_CPU_BMI1 },
+ { "BMI2", X265_CPU_BMI1 | X265_CPU_BMI2 },
+ { "SlowCTZ", X265_CPU_SLOW_CTZ },
+ { "SlowAtom", X265_CPU_SLOW_ATOM },
+ { "SlowPshufb", X265_CPU_SLOW_PSHUFB },
+ { "SlowPalignr", X265_CPU_SLOW_PALIGNR },
+ { "SlowShuffle", X265_CPU_SLOW_SHUFFLE },
+ { "UnalignedStack", X265_CPU_STACK_MOD4 },
+ { "", 0 },
+};
+
+extern "C" {
+/* cpu-a.asm */
+int x265_cpu_cpuid_test(void);
+void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
+}
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4309) // truncation of constant value
+#endif
+
+uint32_t cpu_detect(void)
+{
+ uint32_t cpu = 0;
+ uint32_t eax, ebx, ecx, edx;
+ uint32_t vendor[4] = { 0 };
+ uint32_t max_extended_cap, max_basic_cap;
+ int cache;
+
+#if !X86_64
+ if (!x265_cpu_cpuid_test())
+ return 0;
+#endif
+
+ x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
+ max_basic_cap = eax;
+ if (max_basic_cap == 0)
+ return 0;
+
+ x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+ if (edx & 0x00800000)
+ cpu |= X265_CPU_MMX;
+ else
+ return cpu;
+ if (edx & 0x02000000)
+ cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
+ if (edx & 0x00008000)
+ cpu |= X265_CPU_CMOV;
+ else
+ return cpu;
+ if (edx & 0x04000000)
+ cpu |= X265_CPU_SSE2;
+ if (ecx & 0x00000001)
+ cpu |= X265_CPU_SSE3;
+ if (ecx & 0x00000200)
+ cpu |= X265_CPU_SSSE3;
+ if (ecx & 0x00080000)
+ cpu |= X265_CPU_SSE4;
+ if (ecx & 0x00100000)
+ cpu |= X265_CPU_SSE42;
+ /* Check OSXSAVE and AVX bits */
+ if ((ecx & 0x18000000) == 0x18000000)
+ {
+ /* Check for OS support */
+ x265_cpu_xgetbv(0, &eax, &edx);
+ if ((eax & 0x6) == 0x6)
+ {
+ cpu |= X265_CPU_AVX;
+ if (ecx & 0x00001000)
+ cpu |= X265_CPU_FMA3;
+ }
+ }
+
+ if (max_basic_cap >= 7)
+ {
+ x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
+ /* AVX2 requires OS support, but BMI1/2 don't. */
+ if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
+ cpu |= X265_CPU_AVX2;
+ if (ebx & 0x00000008)
+ {
+ cpu |= X265_CPU_BMI1;
+ if (ebx & 0x00000100)
+ cpu |= X265_CPU_BMI2;
+ }
+ }
+
+ if (cpu & X265_CPU_SSSE3)
+ cpu |= X265_CPU_SSE2_IS_FAST;
+
+ x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+ max_extended_cap = eax;
+
+ if (max_extended_cap >= 0x80000001)
+ {
+ x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ if (ecx & 0x00000020)
+ cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
+ if (ecx & 0x00000040) /* SSE4a, AMD only */
+ {
+ int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ cpu |= X265_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */
+ if (family == 0x14)
+ {
+ cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
+ cpu |= X265_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */
+ cpu |= X265_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */
+ }
+ if (family == 0x16)
+ {
+ cpu |= X265_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough
+ * compared to alternate instruction sequences that this
+ * is equal or faster on almost all such functions. */
+ }
+ }
+
+ if (cpu & X265_CPU_AVX)
+ {
+ if (ecx & 0x00000800) /* XOP */
+ cpu |= X265_CPU_XOP;
+ if (ecx & 0x00010000) /* FMA4 */
+ cpu |= X265_CPU_FMA4;
+ }
+
+ if (!strcmp((char*)vendor, "AuthenticAMD"))
+ {
+ if (edx & 0x00400000)
+ cpu |= X265_CPU_MMX2;
+ if (!(cpu & X265_CPU_LZCNT))
+ cpu |= X265_CPU_SLOW_CTZ;
+ if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
+ cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
+ }
+ }
+
+ if (!strcmp((char*)vendor, "GenuineIntel"))
+ {
+ x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+ int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+ if (family == 6)
+ {
+ /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+ * theoretically support sse2, but it's significantly slower than mmx for
+ * almost all of x264's functions, so let's just pretend they don't. */
+ if (model == 9 || model == 13 || model == 14)
+ {
+ cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
+ assert(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)));
+ }
+ /* Detect Atom CPU */
+ else if (model == 28)
+ {
+ cpu |= X265_CPU_SLOW_ATOM;
+ cpu |= X265_CPU_SLOW_CTZ;
+ cpu |= X265_CPU_SLOW_PSHUFB;
+ }
+
+ /* Conroe has a slow shuffle unit. Check the model number to make sure not
+ * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
+ else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
+ cpu |= X265_CPU_SLOW_SHUFFLE;
+ }
+ }
+
+ if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
+ {
+ /* cacheline size is specified in 3 places, any of which may be missing */
+ x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+ cache = (ebx & 0xff00) >> 5; // clflush size
+ if (!cache && max_extended_cap >= 0x80000006)
+ {
+ x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+ cache = ecx & 0xff; // cacheline size
+ }
+ if (!cache && max_basic_cap >= 2)
+ {
+ // Cache and TLB Information
+ static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
+ static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
+ 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
+ uint32_t buf[4];
+ int max, i = 0;
+ do
+ {
+ x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
+ max = buf[0] & 0xff;
+ buf[0] &= ~0xff;
+ for (int j = 0; j < 4; j++)
+ {
+ if (!(buf[j] >> 31))
+ while (buf[j])
+ {
+ if (strchr(cache32_ids, buf[j] & 0xff))
+ cache = 32;
+ if (strchr(cache64_ids, buf[j] & 0xff))
+ cache = 64;
+ buf[j] >>= 8;
+ }
+ }
+ }
+ while (++i < max);
+ }
+
+ if (cache == 32)
+ cpu |= X265_CPU_CACHELINE_32;
+ else if (cache == 64)
+ cpu |= X265_CPU_CACHELINE_64;
+ else
+ x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
+ }
+
+#if BROKEN_STACK_ALIGNMENT
+ cpu |= X265_CPU_STACK_MOD4;
+#endif
+
+ return cpu;
+}
+}
diff -r 3c97a8e40dba -r 891e90e56876 source/common/cpu.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/cpu.h Thu Oct 10 02:25:12 2013 -0500
@@ -0,0 +1,61 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Loren Merritt <lorenm at u.washington.edu>
+ * Steve Borho <steve at borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#ifndef X265_CPU_H
+#define X265_CPU_H
+
+#include <stdint.h>
+
+// from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp
+extern "C" void x265_cpu_emms(void);
+extern "C" void x265_safe_intel_cpu_indicator_init(void);
+
+#if _MSC_VER && _WIN64
+#define x265_emms() x265_cpu_emms()
+#elif _MSC_VER
+#include <mmintrin.h>
+#define x265_emms() _mm_empty()
+#elif __GNUC__
+// Cannot use _mm_empty() directly without compiling all the source with
+// a fixed CPU arch, which we would like to avoid at the moment
+#define x265_emms() x265_cpu_emms()
+#else
+#define x265_emms() x265_cpu_emms()
+#endif
+
+namespace x265 {
+
+uint32_t cpu_detect(void);
+
+struct cpu_name_t
+{
+ char name[16];
+ uint32_t flags;
+};
+
+extern const cpu_name_t cpu_names[];
+
+}
+
+#endif // ifndef X265_CPU_H
diff -r 3c97a8e40dba -r 891e90e56876 source/common/primitives.cpp
--- a/source/common/primitives.cpp Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/primitives.cpp Thu Oct 10 02:25:12 2013 -0500
@@ -70,20 +70,6 @@
}
}
-static const char *CpuType[] =
-{
- "",
- "",
- "SSE2",
- "SSE3",
- "SSSE3",
- "SSE4.1",
- "SSE4.2",
- "AVX",
- "AVX2",
- 0
-};
-
using namespace x265;
/* cpuid == 0 - auto-detect CPU type, else
@@ -105,22 +91,29 @@
}
if (cpuid == 0)
{
- cpuid = instrset_detect(); // Detect supported instruction set
+ cpuid = x265::cpu_detect();
if (param->logLevel >= X265_LOG_INFO)
{
- x265_log(param, X265_LOG_INFO, "detected SIMD: ");
- for (int i = 2; i <= cpuid; i++)
+ char buf[1000];
+ char *p = buf + sprintf( buf, "using cpu capabilities:" );
+ for (int i = 0; x265::cpu_names[i].flags; i++)
{
- fprintf(stderr, "%s ", CpuType[i]);
+ if (!strcmp(x265::cpu_names[i].name, "SSE2")
+ && cpuid & (X265_CPU_SSE2_IS_FAST|X265_CPU_SSE2_IS_SLOW))
+ continue;
+ if (!strcmp(x265::cpu_names[i].name, "SSE3")
+ && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
+ continue;
+ if (!strcmp(x265::cpu_names[i].name, "SSE4.1")
+ && (cpuid & X265_CPU_SSE42))
+ continue;
+ if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags
+ && (!i || x265::cpu_names[i].flags != x265::cpu_names[i-1].flags))
+ p += sprintf( p, " %s", x265::cpu_names[i].name );
}
-
- if (cpuid >= 7)
- {
- if (hasXOP()) fprintf(stderr, "XOP ");
- if (hasFMA3()) fprintf(stderr, "FMA3 ");
- if (hasFMA4()) fprintf(stderr, "FMA4 ");
- }
- fprintf(stderr, "\n");
+ if( !cpuid )
+ p += sprintf( p, " none!" );
+ x265_log(param, X265_LOG_INFO, "%s\n", buf);
}
}
@@ -128,15 +121,12 @@
Setup_C_Primitives(primitives);
- for (int i = 2; i <= cpuid; i++)
- {
#if ENABLE_VECTOR_PRIMITIVES
- Setup_Vector_Primitives(primitives, 1 << i);
+ Setup_Vector_Primitives(primitives, cpuid);
#endif
#if ENABLE_ASM_PRIMITIVES
- Setup_Assembly_Primitives(primitives, 1 << i);
+ Setup_Assembly_Primitives(primitives, cpuid);
#endif
- }
primitives.sa8d_inter[PARTITION_8x8] = primitives.sa8d[BLOCK_8x8];
primitives.sa8d_inter[PARTITION_16x16] = primitives.sa8d[BLOCK_16x16];
diff -r 3c97a8e40dba -r 891e90e56876 source/common/primitives.h
--- a/source/common/primitives.h Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/primitives.h Thu Oct 10 02:25:12 2013 -0500
@@ -30,25 +30,10 @@
#include <stdint.h>
#include "x265.h"
+#include "cpu.h"
#define FENC_STRIDE 64
-// from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp
-extern "C" void x265_cpu_emms(void);
-
-#if _MSC_VER && _WIN64
-#define x265_emms() x265_cpu_emms()
-#elif _MSC_VER
-#include <mmintrin.h>
-#define x265_emms() _mm_empty()
-#elif __GNUC__
-// Cannot use _mm_empty() directly without compiling all the source with
-// a fixed CPU arch, which we would like to avoid at the moment
-#define x265_emms() x265_cpu_emms()
-#else
-#define x265_emms() x265_cpu_emms()
-#endif
-
#if defined(__GNUC__)
#define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8)))
#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
diff -r 3c97a8e40dba -r 891e90e56876 source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/vec/vec-primitives.cpp Thu Oct 10 02:25:12 2013 -0500
@@ -23,7 +23,30 @@
#include "primitives.h"
-bool hasXOP(void); // instr_detect.cpp
+#if !ENABLE_ASM_PRIMITIVES
+#include <intrin.h>
+extern "C" {
+int x265_cpu_cpuid_test(void)
+{
+ return 0;
+}
+void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int output[4];
+ __cpuidex(output, op, 0);
+ *eax = output[0];
+ *ebx = output[1];
+ *ecx = output[2];
+ *edx = output[3];
+}
+void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
+{
+ uint64_t out = _xgetbv(op);
+ *eax = (uint32_t)out;
+ *edx = (uint32_t)(out >> 32);
+}
+}
+#endif
/* The #if logic here must match the file lists in CMakeLists.txt */
#if defined(__INTEL_COMPILER)
@@ -74,7 +97,7 @@
void Setup_Vector_Primitives(EncoderPrimitives &p, int cpuMask)
{
#ifdef HAVE_SSE3
- if (cpuMask & (1 << X265_CPU_LEVEL_SSE3))
+ if (cpuMask & X265_CPU_SSE3)
{
Setup_Vec_PixelPrimitives_sse3(p);
Setup_Vec_DCTPrimitives_sse3(p);
@@ -83,7 +106,7 @@
}
#endif
#ifdef HAVE_SSSE3
- if (cpuMask & (1 << X265_CPU_LEVEL_SSSE3))
+ if (cpuMask & X265_CPU_SSSE3)
{
Setup_Vec_PixelPrimitives_ssse3(p);
Setup_Vec_IPFilterPrimitives_ssse3(p);
@@ -91,7 +114,7 @@
}
#endif
#ifdef HAVE_SSE4
- if (cpuMask & (1 << X265_CPU_LEVEL_SSE41))
+ if (cpuMask & X265_CPU_SSE4)
{
Setup_Vec_PixelPrimitives_sse41(p);
Setup_Vec_IPredPrimitives_sse41(p);
@@ -100,7 +123,7 @@
}
#endif
#ifdef HAVE_AVX2
- if (cpuMask & (1 << X265_CPU_LEVEL_AVX2))
+ if (cpuMask & X265_CPU_AVX2)
{
Setup_Vec_PixelPrimitives_avx2(p);
Setup_Vec_BlockCopyPrimitives_avx2(p);
diff -r 3c97a8e40dba -r 891e90e56876 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 10 22:26:32 2013 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 10 02:25:12 2013 -0500
@@ -26,7 +26,47 @@
extern "C" {
#include "pixel.h"
+#ifdef __INTEL_COMPILER
+/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
+ * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
+ * adapted to x265's cpu schema. */
+
+// Global variable indicating cpu
+int __intel_cpu_indicator = 0;
+// CPU dispatcher function
+void x265_intel_cpu_indicator_init(void)
+{
+ unsigned int cpu = cpu_detect();
+ if (cpu&X265_CPU_AVX)
+ __intel_cpu_indicator = 0x20000;
+ else if (cpu&X265_CPU_SSE42)
+ __intel_cpu_indicator = 0x8000;
+ else if (cpu&X265_CPU_SSE4)
+ __intel_cpu_indicator = 0x2000;
+ else if (cpu&X265_CPU_SSSE3)
+ __intel_cpu_indicator = 0x1000;
+ else if (cpu&X265_CPU_SSE3)
+ __intel_cpu_indicator = 0x800;
+ else if (cpu&X265_CPU_SSE2 && !(cpu&X265_CPU_SSE2_IS_SLOW))
+ __intel_cpu_indicator = 0x200;
+ else if (cpu&X265_CPU_SSE)
+ __intel_cpu_indicator = 0x80;
+ else if (cpu&X265_CPU_MMX2)
+ __intel_cpu_indicator = 8;
+ else
+ __intel_cpu_indicator = 1;
+}
+
+/* __intel_cpu_indicator_init appears to have a non-standard calling convention that
+ * assumes certain registers aren't preserved, so we'll route it through a function
+ * that backs up all the registers. */
+void __intel_cpu_indicator_init( void )
+{
+ x265_safe_intel_cpu_indicator_init();
+}
+#else
void x265_intel_cpu_indicator_init( void ) {}
+#endif
#define LOWRES(cpu)\
void x265_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
@@ -54,8 +94,6 @@
void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
}
-bool hasXOP(void); // instr_detect.cpp
-
using namespace x265;
namespace {
@@ -175,9 +213,9 @@
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
{
#if HIGH_BIT_DEPTH
- if (cpuMask & (1 << X265_CPU_LEVEL_SSE2)) p.sa8d[0] = p.sa8d[0];
+ if (cpuMask & X265_CPU_SSE2) p.sa8d[0] = p.sa8d[0];
#else
- if (cpuMask & (1 << X265_CPU_LEVEL_SSE2))
+ if (cpuMask & X265_CPU_SSE2)
{
INIT8_NAME( sse_pp, ssd, _mmx );
INIT8( sad, _mmx2 );
@@ -214,7 +252,7 @@
p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse2;
SA8D_INTER_FROM_BLOCK(sse2);
}
- if (cpuMask & (1 << X265_CPU_LEVEL_SSSE3))
+ if (cpuMask & X265_CPU_SSSE3)
{
p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3;
p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_ssse3;
@@ -228,7 +266,7 @@
p.sad_x4[PARTITION_8x8] = x265_pixel_sad_x4_8x8_ssse3;
p.sad_x4[PARTITION_8x16] = x265_pixel_sad_x4_8x16_ssse3;
}
- if (cpuMask & (1 << X265_CPU_LEVEL_SSE41))
+ if (cpuMask & X265_CPU_SSE4)
{
p.satd[PARTITION_4x16] = x265_pixel_satd_4x16_sse4;
p.satd[PARTITION_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_sse4>;
@@ -240,7 +278,7 @@
p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
#endif
}
- if (cpuMask & (1 << X265_CPU_LEVEL_AVX))
+ if (cpuMask & X265_CPU_AVX)
{
p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
p.satd[PARTITION_4x16] = x265_pixel_satd_4x16_avx;
@@ -250,7 +288,7 @@
SA8D_INTER_FROM_BLOCK(avx);
ASSGN_SSE(avx);
}
- if ((cpuMask & (1 << X265_CPU_LEVEL_AVX)) && hasXOP())
+ if (cpuMask & X265_CPU_XOP)
{
p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_xop;
@@ -260,7 +298,7 @@
INIT5_NAME( sse_pp, ssd, _xop );
HEVC_SATD(xop);
}
- if (cpuMask & (1 << X265_CPU_LEVEL_AVX2))
+ if (cpuMask & X265_CPU_AVX2)
{
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
diff -r 3c97a8e40dba -r 891e90e56876 source/test/testbench.cpp
--- a/source/test/testbench.cpp Tue Sep 10 22:26:32 2013 -0500
+++ b/source/test/testbench.cpp Thu Oct 10 02:25:12 2013 -0500
@@ -28,6 +28,7 @@
#include "mbdstharness.h"
#include "ipfilterharness.h"
#include "intrapredharness.h"
+#include "cpu.h"
#include <stdio.h>
#include <stdlib.h>
@@ -36,25 +37,9 @@
using namespace x265;
-static const char *CpuType[] =
-{
- "",
- "",
- "SSE2",
- "SSE3",
- "SSSE3",
- "SSE4.1",
- "SSE4.2",
- "AVX",
- "AVX2",
- 0
-};
-
-extern int instrset_detect();
-
int main(int argc, char *argv[])
{
- int cpuid = instrset_detect(); // Detect supported instruction set
+ int cpuid = x265::cpu_detect();
const char *testname = 0;
int cpuid_user = -1;
@@ -94,20 +79,32 @@
memset(&cprim, 0, sizeof(EncoderPrimitives));
Setup_C_Primitives(cprim);
- int cpuid_low = 2;
- int cpuid_high = cpuid;
+ struct test_arch_t
+ {
+ char name[12];
+ int flag;
+ } test_arch[] = {
+ { "SSE2", X265_CPU_SSE2 },
+ { "SSE3", X265_CPU_SSE3 },
+ { "SSSE3", X265_CPU_SSSE3 },
+ { "SSE4", X265_CPU_SSE4 },
+ { "AVX", X265_CPU_AVX },
+ { "XOP", X265_CPU_XOP },
+ { "AVX2", X265_CPU_AVX2 },
+ { "", 0 },
+ };
- if (cpuid_user >= 0)
+ for (int i = 0; test_arch[i].flag; i++)
{
- cpuid_low = cpuid_high = cpuid_user;
- }
- for (int i = cpuid_low; i <= cpuid_high; i++)
- {
+ if (test_arch[i].flag & cpuid)
+ printf("Testing primitives: %s\n", test_arch[i].name);
+ else
+ continue;
+
#if ENABLE_VECTOR_PRIMITIVES
EncoderPrimitives vecprim;
memset(&vecprim, 0, sizeof(vecprim));
- Setup_Vector_Primitives(vecprim, 1 << i);
- printf("Testing intrinsic primitives: %s (%d)\n", CpuType[i], i);
+ Setup_Vector_Primitives(vecprim, test_arch[i].flag);
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
{
if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -124,8 +121,7 @@
#if ENABLE_ASM_PRIMITIVES
EncoderPrimitives asmprim;
memset(&asmprim, 0, sizeof(asmprim));
- Setup_Assembly_Primitives(asmprim, 1 << i);
- printf("Testing assembly primitives: %s (%d)\n", CpuType[i], i);
+ Setup_Assembly_Primitives(asmprim, test_arch[i].flag);
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
{
if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -137,7 +133,6 @@
return -1;
}
}
-
#endif // if ENABLE_ASM_PRIMITIVES
}
@@ -145,15 +140,12 @@
EncoderPrimitives optprim;
memset(&optprim, 0, sizeof(optprim));
- for (int i = 2; i <= cpuid; i++)
- {
#if ENABLE_VECTOR_PRIMITIVES
- Setup_Vector_Primitives(optprim, 1 << i);
+ Setup_Vector_Primitives(optprim, cpuid);
#endif
#if ENABLE_ASM_PRIMITIVES
- Setup_Assembly_Primitives(optprim, 1 << i);
+ Setup_Assembly_Primitives(optprim, cpuid);
#endif
- }
printf("\nTest performance improvement with full optimizations\n");
diff -r 3c97a8e40dba -r 891e90e56876 source/x265.h
--- a/source/x265.h Tue Sep 10 22:26:32 2013 -0500
+++ b/source/x265.h Thu Oct 10 02:25:12 2013 -0500
@@ -152,6 +152,43 @@
}
X265_ME_METHODS;
+/* CPU flags */
+
+/* x86 */
+#define X265_CPU_CMOV 0x0000001
+#define X265_CPU_MMX 0x0000002
+#define X265_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
+#define X265_CPU_MMXEXT X265_CPU_MMX2
+#define X265_CPU_SSE 0x0000008
+#define X265_CPU_SSE2 0x0000010
+#define X265_CPU_SSE3 0x0000020
+#define X265_CPU_SSSE3 0x0000040
+#define X265_CPU_SSE4 0x0000080 /* SSE4.1 */
+#define X265_CPU_SSE42 0x0000100 /* SSE4.2 */
+#define X265_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
+#define X265_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
+#define X265_CPU_XOP 0x0000800 /* AMD XOP */
+#define X265_CPU_FMA4 0x0001000 /* AMD FMA4 */
+#define X265_CPU_AVX2 0x0002000 /* AVX2 */
+#define X265_CPU_FMA3 0x0004000 /* Intel FMA3 */
+#define X265_CPU_BMI1 0x0008000 /* BMI1 */
+#define X265_CPU_BMI2 0x0010000 /* BMI2 */
+/* x86 modifiers */
+#define X265_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
+#define X265_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
+#define X265_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
+#define X265_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
+#define X265_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X265_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
+#define X265_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X265_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+ * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
+ * cacheline split penalties -- gather everything here that
+ * isn't shared by other CPUs to avoid making half a dozen
+ * new SLOW flags. */
+#define X265_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
+#define X265_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
+
static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "full", 0 };
#define X265_MAX_SUBPEL_LEVEL 7
@@ -299,20 +336,10 @@
}
x265_param_t;
-#define X265_CPU_LEVEL_AUTO 0
-#define X265_CPU_LEVEL_NONE 1 // C code only, no SIMD
-#define X265_CPU_LEVEL_SSE2 2
-#define X265_CPU_LEVEL_SSE3 3
-#define X265_CPU_LEVEL_SSSE3 4
-#define X265_CPU_LEVEL_SSE41 5
-#define X265_CPU_LEVEL_SSE42 6
-#define X265_CPU_LEVEL_AVX 7
-#define X265_CPU_LEVEL_AVX2 8
-
/***
* If not called, first encoder allocated will auto-detect the CPU and
* initialize performance primitives, which are process global */
-void x265_setup_primitives(x265_param_t *param, int cpulevel);
+void x265_setup_primitives(x265_param_t *param, int cpu);
/***
* Initialize an x265_param_t structure to default values