[x265] [PATCH 001 of 307] x86: AVX-512 support
mythreyi at multicorewareinc.com
Sat Apr 7 04:29:59 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1498107357 -19800
# Thu Jun 22 10:25:57 2017 +0530
# Node ID d7e105cac1d01fa74adc8f7f7431d33b7e261b4f
# Parent e1ed4d609b52a361e758a66f45e8c070dd245211
x86: AVX-512 support
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/cpu.cpp
--- a/source/common/cpu.cpp Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/cpu.cpp Thu Jun 22 10:25:57 2017 +0530
@@ -61,7 +61,7 @@
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
-#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
{ "MMX2", MMX2 },
{ "MMXEXT", MMX2 },
{ "SSE", MMX2 | X265_CPU_SSE },
@@ -84,13 +84,13 @@
{ "BMI2", AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
{ "AVX2", AVX2},
+ { "AVX512", AVX2 | X265_CPU_AVX512 },
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
{ "Cache32", X265_CPU_CACHELINE_32 },
{ "Cache64", X265_CPU_CACHELINE_64 },
- { "SlowCTZ", X265_CPU_SLOW_CTZ },
{ "SlowAtom", X265_CPU_SLOW_ATOM },
{ "SlowPshufb", X265_CPU_SLOW_PSHUFB },
{ "SlowPalignr", X265_CPU_SLOW_PALIGNR },
@@ -115,7 +115,7 @@
/* cpu-a.asm */
int PFX(cpu_cpuid_test)(void);
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
-void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
+uint64_t PFX(cpu_xgetbv)(int xcr);
}
#if defined(_MSC_VER)
@@ -129,14 +129,14 @@
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = { 0 };
uint32_t max_extended_cap, max_basic_cap;
+ uint64_t xcr0 = 0;
#if !X86_64
if (!PFX(cpu_cpuid_test)())
return 0;
#endif
- PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
- max_basic_cap = eax;
+ PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
if (max_basic_cap == 0)
return 0;
@@ -147,27 +147,24 @@
return cpu;
if (edx & 0x02000000)
cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
- if (edx & 0x00008000)
- cpu |= X265_CPU_CMOV;
- else
- return cpu;
if (edx & 0x04000000)
cpu |= X265_CPU_SSE2;
if (ecx & 0x00000001)
cpu |= X265_CPU_SSE3;
if (ecx & 0x00000200)
- cpu |= X265_CPU_SSSE3;
+ cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
if (ecx & 0x00080000)
cpu |= X265_CPU_SSE4;
if (ecx & 0x00100000)
cpu |= X265_CPU_SSE42;
- /* Check OXSAVE and AVX bits */
- if ((ecx & 0x18000000) == 0x18000000)
+
+ if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
{
/* Check for OS support */
- PFX(cpu_xgetbv)(0, &eax, &edx);
- if ((eax & 0x6) == 0x6)
+ xcr0 = PFX(cpu_xgetbv)(0);
+ if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
{
+ if (ecx & 0x10000000)
cpu |= X265_CPU_AVX;
if (ecx & 0x00001000)
cpu |= X265_CPU_FMA3;
@@ -178,19 +175,24 @@
{
PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
/* AVX2 requires OS support, but BMI1/2 don't. */
- if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
- cpu |= X265_CPU_AVX2;
if (ebx & 0x00000008)
+ cpu |= X265_CPU_BMI1;
+ if (ebx & 0x00000100)
+ cpu |= X265_CPU_BMI2;
+
+ if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
{
- cpu |= X265_CPU_BMI1;
- if (ebx & 0x00000100)
- cpu |= X265_CPU_BMI2;
+ if (ebx & 0x00000020)
+ cpu |= X265_CPU_AVX2;
+
+ if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
+ {
+ if ((ebx & 0xD0030000) == 0xD0030000)
+ cpu |= X265_CPU_AVX512;
+ }
}
}
- if (cpu & X265_CPU_SSSE3)
- cpu |= X265_CPU_SSE2_IS_FAST;
-
PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
max_extended_cap = eax;
@@ -230,8 +232,6 @@
{
if (edx & 0x00400000)
cpu |= X265_CPU_MMX2;
- if (!(cpu & X265_CPU_LZCNT))
- cpu |= X265_CPU_SLOW_CTZ;
if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
@@ -256,7 +256,6 @@
else if (model == 28)
{
cpu |= X265_CPU_SLOW_ATOM;
- cpu |= X265_CPU_SLOW_CTZ;
cpu |= X265_CPU_SLOW_PSHUFB;
}
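
The cpu.cpp hunks above gate the new X265_CPU_AVX512 flag on three things: the OSXSAVE bit in CPUID.1:ECX, the XMM/YMM (0x6) and opmask/ZMM (0xE0) state bits in XCR0, and the F/DQ/CD/BW/VL feature bits (mask 0xD0030000) in CPUID.7:EBX. A minimal standalone sketch of that chain, using GCC/Clang inline assembly instead of the PFX(cpu_cpuid)/PFX(cpu_xgetbv) stubs from cpu-a.asm (the helper names below are illustrative, not part of the patch):

/* Standalone sketch of the detection chain (GCC/Clang inline asm assumed;
 * the patch itself goes through PFX(cpu_cpuid)/PFX(cpu_xgetbv) in cpu-a.asm). */
#include <cstdint>

static void cpuid(uint32_t leaf, uint32_t subleaf,
                  uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
{
    __asm__("cpuid" : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                    : "a"(leaf), "c"(subleaf));
}

static uint64_t xgetbv(uint32_t xcr)
{
    uint32_t lo, hi;
    __asm__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));
    return ((uint64_t)hi << 32) | lo;
}

static bool hasAVX512()
{
    uint32_t a, b, c, d;
    cpuid(1, 0, &a, &b, &c, &d);
    if (!(c & (1u << 27)))                    /* OSXSAVE: XGETBV usable */
        return false;
    uint64_t xcr0 = xgetbv(0);
    if ((xcr0 & 0x6) != 0x6)                  /* XMM/YMM state enabled by OS */
        return false;
    if ((xcr0 & 0xE0) != 0xE0)                /* opmask/ZMM state enabled by OS */
        return false;
    cpuid(7, 0, &a, &b, &c, &d);
    return (b & 0xD0030000) == 0xD0030000;    /* AVX-512 F, DQ, CD, BW, VL */
}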
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/cpu-a.asm
--- a/source/common/x86/cpu-a.asm Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/x86/cpu-a.asm Thu Jun 22 10:25:57 2017 +0530
@@ -54,18 +54,16 @@
RET
;-----------------------------------------------------------------------------
-; void cpu_xgetbv( int op, int *eax, int *edx )
+; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
-cglobal cpu_xgetbv, 3,7
- push r2
- push r1
- mov ecx, r0d
+cglobal cpu_xgetbv
+ movifnidn ecx, r0m
xgetbv
- pop r4
- mov [r4], eax
- pop r4
- mov [r4], edx
- RET
+%if ARCH_X86_64
+ shl rdx, 32
+ or rax, rdx
+%endif
+ ret
%if ARCH_X86_64
@@ -78,7 +76,7 @@
%if WIN64
sub rsp, 32 ; shadow space
%endif
- and rsp, ~31
+ and rsp, ~(STACK_ALIGNMENT - 1)
mov rax, r0
mov r0, r1
mov r1, r2
@@ -119,7 +117,7 @@
push ebp
mov ebp, esp
sub esp, 12
- and esp, ~31
+ and esp, ~(STACK_ALIGNMENT - 1)
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
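
Note the cpu_xgetbv rewrite above: the routine now takes the XCR index as its first argument and returns the full 64-bit register value. On x86-64 it shifts EDX into the upper half of RAX; on x86-32 no combining is needed because the EDX:EAX pair left by xgetbv already matches the cdecl convention for a 64-bit return. A sketch of how the C side consumes it (the declaration is the one added to cpu.cpp; readXCR0() is just an illustrative wrapper):

/* The caller in cpu.cpp keeps the 64-bit value around so both the 0x6
 * (XMM/YMM) and 0xE0 (opmask/ZMM) tests can reuse it. */
extern "C" uint64_t PFX(cpu_xgetbv)(int xcr);

static uint64_t readXCR0(void)
{
    return PFX(cpu_xgetbv)(0);  /* XCR0: OS-enabled state-component bitmap */
}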
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/x86inc.asm
--- a/source/common/x86/x86inc.asm Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/x86/x86inc.asm Thu Jun 22 10:25:57 2017 +0530
@@ -325,6 +325,8 @@
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@@ -438,15 +440,16 @@
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
- %if xmm_regs_used > 8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
%assign %%i 8
- %rep xmm_regs_used-8
+ %rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@@ -455,8 +458,9 @@
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 8
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
%assign %%pad (xmm_regs_used-8)*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
@@ -467,9 +471,10 @@
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
- %if xmm_regs_used > 8
- %assign %%i xmm_regs_used
- %rep xmm_regs_used-8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
@@ -482,10 +487,10 @@
%assign %%pad_size stack_size_padded
%endif
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
@@ -497,12 +502,12 @@
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6 + high_mm_regs
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -526,9 +531,10 @@
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
+ %assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@@ -538,7 +544,7 @@
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -549,7 +555,7 @@
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -594,7 +600,7 @@
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -605,7 +611,7 @@
%endif
%endif
POP_IF_USED 6, 5, 4, 3
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -710,7 +716,7 @@
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
@@ -768,10 +774,10 @@
%assign cpuflags_bmi1 (1<<16)| cpuflags_avx | cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<18)| cpuflags_fma3 | cpuflags_bmi2
+%assign cpuflags_avx512 (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
-%assign cpuflags_cache32 (1<<19)
-%assign cpuflags_cache64 (1<<20)
-%assign cpuflags_slowctz (1<<21)
+%assign cpuflags_cache32 (1<<20)
+%assign cpuflags_cache64 (1<<21)
%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<23)
@@ -829,11 +835,12 @@
%endif
%endmacro
-; Merge mmx and sse*
+; Merge mmx and sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@@ -843,6 +850,18 @@
%undef %1%2
%endmacro
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
@@ -858,7 +877,7 @@
CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1
%endrep
- %rep 8
+ %rep 24
CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i
%assign %%i %%i+1
@@ -872,7 +891,7 @@
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -885,6 +904,10 @@
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
+ %if WIN64
+ ; Swap callee-saved registers with volatile registers
+ AVX512_MM_PERMUTATION 6
+ %endif
%endmacro
%macro INIT_YMM 0-1+
@@ -893,7 +916,7 @@
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -906,6 +929,29 @@
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
+ AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 32
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, zmm %+ %%i
+ CAT_XDEFINE nnzmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+ AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
@@ -914,18 +960,26 @@
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
%endmacro
%assign i 0
-%rep 16
+%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@@ -1060,12 +1114,17 @@
;=============================================================================
%assign i 0
-%rep 16
+%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@@ -1182,7 +1241,7 @@
%endmacro
%endmacro
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@@ -1513,3 +1572,49 @@
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
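
The EVEX_INSTR wrappers added above only switch an instruction to its EVEX form when that is actually required, since the VEX encoding is shorter. A conceptual C++ model of the %%evex_required decision (illustrative only; the real logic is the NASM macro):

/* Conceptual model of %%evex_required in EVEX_INSTR (names illustrative). */
static bool needsEVEX(int dstRegNum, int srcRegNum, int operandBytes,
                      bool preferEVEX /* third macro argument */, bool avx512)
{
    if (avx512 && preferEVEX)                  /* e.g. vrcp14ps: higher precision */
        return true;
    if (dstRegNum >= 16 || srcRegNum >= 16)    /* xmm16-xmm31 have no VEX form */
        return true;
    if (operandBytes > 32)                     /* ZMM operands are EVEX-only */
        return true;
    return false;                              /* otherwise keep the shorter VEX */
}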
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/x86util.asm
--- a/source/common/x86/x86util.asm Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/x86/x86util.asm Thu Jun 22 10:25:57 2017 +0530
@@ -299,6 +299,18 @@
pminsw %2, %4
%endmacro
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+ punpckhqdq %1, %2
+%elif cpuflag(avx)
+ punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
@@ -324,7 +336,7 @@
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
vphaddwq %1, %1
- movhlps %2, %1
+ MOVHL %2, %1
paddd %1, %2
%else
pmaddwd %1, [pw_1]
@@ -346,7 +358,7 @@
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwq %1, %1
- movhlps %2, %1
+ MOVHL %2, %1
paddd %1, %2
%else
HADDUWD %1, %2
@@ -739,25 +751,25 @@
%if %6 ; %5 aligned?
mova %1, %4
psubw %1, %5
+%elif cpuflag(avx)
+ movu %1, %4
+ psubw %1, %5
%else
movu %1, %4
movu %2, %5
psubw %1, %2
%endif
%else ; !HIGH_BIT_DEPTH
-%ifidn %3, none
movh %1, %4
movh %2, %5
+%ifidn %3, none
punpcklbw %1, %2
punpcklbw %2, %2
+%else
+ punpcklbw %1, %3
+ punpcklbw %2, %3
+%endif
psubw %1, %2
-%else
- movh %1, %4
- punpcklbw %1, %3
- movh %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endif
%endif ; HIGH_BIT_DEPTH
%endmacro
diff -r e1ed4d609b52 -r d7e105cac1d0 source/test/testbench.cpp
--- a/source/test/testbench.cpp Tue Apr 03 13:49:25 2018 +0530
+++ b/source/test/testbench.cpp Thu Jun 22 10:25:57 2017 +0530
@@ -169,6 +169,7 @@
{ "XOP", X265_CPU_XOP },
{ "AVX2", X265_CPU_AVX2 },
{ "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
+ { "AVX512", X265_CPU_AVX512 },
{ "ARMv6", X265_CPU_ARMV6 },
{ "NEON", X265_CPU_NEON },
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
diff -r e1ed4d609b52 -r d7e105cac1d0 source/test/testharness.h
--- a/source/test/testharness.h Tue Apr 03 13:49:25 2018 +0530
+++ b/source/test/testharness.h Thu Jun 22 10:25:57 2017 +0530
@@ -91,7 +91,7 @@
}
#endif // ifdef _MSC_VER
-#define BENCH_RUNS 1000
+#define BENCH_RUNS 2000
// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
// and discards invalid times. Repeats 1000 times to get a good average. Then measures
diff -r e1ed4d609b52 -r d7e105cac1d0 source/x265.h
--- a/source/x265.h Tue Apr 03 13:49:25 2018 +0530
+++ b/source/x265.h Thu Jun 22 10:25:57 2017 +0530
@@ -382,39 +382,38 @@
/* CPU flags */
/* x86 */
-#define X265_CPU_CMOV 0x0000001
-#define X265_CPU_MMX 0x0000002
-#define X265_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
+#define X265_CPU_MMX (1 << 0)
+#define X265_CPU_MMX2 (1 << 1) /* MMX2 aka MMXEXT aka ISSE */
#define X265_CPU_MMXEXT X265_CPU_MMX2
-#define X265_CPU_SSE 0x0000008
-#define X265_CPU_SSE2 0x0000010
-#define X265_CPU_SSE3 0x0000020
-#define X265_CPU_SSSE3 0x0000040
-#define X265_CPU_SSE4 0x0000080 /* SSE4.1 */
-#define X265_CPU_SSE42 0x0000100 /* SSE4.2 */
-#define X265_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
-#define X265_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X265_CPU_XOP 0x0000800 /* AMD XOP */
-#define X265_CPU_FMA4 0x0001000 /* AMD FMA4 */
-#define X265_CPU_AVX2 0x0002000 /* AVX2 */
-#define X265_CPU_FMA3 0x0004000 /* Intel FMA3 */
-#define X265_CPU_BMI1 0x0008000 /* BMI1 */
-#define X265_CPU_BMI2 0x0010000 /* BMI2 */
+#define X265_CPU_SSE (1 << 2)
+#define X265_CPU_SSE2 (1 << 3)
+#define X265_CPU_LZCNT (1 << 4)
+#define X265_CPU_SSE3 (1 << 5)
+#define X265_CPU_SSSE3 (1 << 6)
+#define X265_CPU_SSE4 (1 << 7) /* SSE4.1 */
+#define X265_CPU_SSE42 (1 << 8) /* SSE4.2 */
+#define X265_CPU_AVX (1 << 9) /* Requires OS support even if YMM registers aren't used. */
+#define X265_CPU_XOP (1 << 10) /* AMD XOP */
+#define X265_CPU_FMA4 (1 << 11) /* AMD FMA4 */
+#define X265_CPU_FMA3 (1 << 12) /* Intel FMA3 */
+#define X265_CPU_BMI1 (1 << 13) /* BMI1 */
+#define X265_CPU_BMI2 (1 << 14) /* BMI2 */
+#define X265_CPU_AVX2 (1 << 15) /* AVX2 */
+#define X265_CPU_AVX512 (1 << 16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
/* x86 modifiers */
-#define X265_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
-#define X265_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
-#define X265_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
-#define X265_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
-#define X265_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X265_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
-#define X265_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X265_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X265_CPU_CACHELINE_32 (1 << 17) /* avoid memory loads that span the border between two cachelines */
+#define X265_CPU_CACHELINE_64 (1 << 18) /* 32/64 is the size of a cacheline in bytes */
+#define X265_CPU_SSE2_IS_SLOW (1 << 19) /* avoid most SSE2 functions on Athlon64 */
+#define X265_CPU_SSE2_IS_FAST (1 << 20) /* a few functions are only faster on Core2 and Phenom */
+#define X265_CPU_SLOW_SHUFFLE (1 << 21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X265_CPU_STACK_MOD4 (1 << 22) /* if stack is only mod4 and not mod16 */
+#define X265_CPU_SLOW_ATOM (1 << 23) /* The Atom is terrible: slow SSE unaligned loads, slow
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
* cacheline split penalties -- gather everything here that
* isn't shared by other CPUs to avoid making half a dozen
* new SLOW flags. */
-#define X265_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
-#define X265_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
+#define X265_CPU_SLOW_PSHUFB (1 << 24) /* such as on the Intel Atom */
+#define X265_CPU_SLOW_PALIGNR (1 << 25) /* such as on the AMD Bobcat */
/* ARM */
#define X265_CPU_ARMV6 0x0000001
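
With the x86 flags renumbered as (1 << n) and X265_CPU_CMOV/X265_CPU_SLOW_CTZ retired, callers simply test the new bit. A minimal usage sketch (canUseAVX512() is illustrative; the AVX2 check mirrors the cpu_names entry that maps "AVX512" onto the full AVX2 set):

/* Minimal usage sketch of the new flag (helper name is illustrative). */
static int canUseAVX512(uint32_t cpuFlags)
{
    /* The "AVX512" entry in cpu_names implies the whole AVX2 feature set,
     * so AVX-512 code paths may assume both bits are set together. */
    return (cpuFlags & X265_CPU_AVX512) && (cpuFlags & X265_CPU_AVX2);
}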