[x265] [PATCH 001 of 307] x86: AVX-512 support

mythreyi at multicorewareinc.com
Sat Apr 7 04:29:59 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1498107357 -19800
#      Thu Jun 22 10:25:57 2017 +0530
# Node ID d7e105cac1d01fa74adc8f7f7431d33b7e261b4f
# Parent  e1ed4d609b52a361e758a66f45e8c070dd245211
x86: AVX-512 support

diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/cpu.cpp
--- a/source/common/cpu.cpp	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/cpu.cpp	Thu Jun 22 10:25:57 2017 +0530
@@ -61,7 +61,7 @@
 const cpu_name_t cpu_names[] =
 {
 #if X265_ARCH_X86
-#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
     { "MMX2",        MMX2 },
     { "MMXEXT",      MMX2 },
     { "SSE",         MMX2 | X265_CPU_SSE },
@@ -84,13 +84,13 @@
     { "BMI2",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
 #define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
     { "AVX2", AVX2},
+    { "AVX512", AVX2 | X265_CPU_AVX512 },
 #undef AVX2
 #undef AVX
 #undef SSE2
 #undef MMX2
     { "Cache32",         X265_CPU_CACHELINE_32 },
     { "Cache64",         X265_CPU_CACHELINE_64 },
-    { "SlowCTZ",         X265_CPU_SLOW_CTZ },
     { "SlowAtom",        X265_CPU_SLOW_ATOM },
     { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
     { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
@@ -115,7 +115,7 @@
 /* cpu-a.asm */
 int PFX(cpu_cpuid_test)(void);
 void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
-void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
+uint64_t PFX(cpu_xgetbv)(int xcr);
 }
 
 #if defined(_MSC_VER)
@@ -129,14 +129,14 @@
     uint32_t eax, ebx, ecx, edx;
     uint32_t vendor[4] = { 0 };
     uint32_t max_extended_cap, max_basic_cap;
+    uint64_t xcr0 = 0;
 
 #if !X86_64
     if (!PFX(cpu_cpuid_test)())
         return 0;
 #endif
 
-    PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
-    max_basic_cap = eax;
+    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
     if (max_basic_cap == 0)
         return 0;
 
@@ -147,27 +147,24 @@
         return cpu;
     if (edx & 0x02000000)
         cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
-    if (edx & 0x00008000)
-        cpu |= X265_CPU_CMOV;
-    else
-        return cpu;
     if (edx & 0x04000000)
         cpu |= X265_CPU_SSE2;
     if (ecx & 0x00000001)
         cpu |= X265_CPU_SSE3;
     if (ecx & 0x00000200)
-        cpu |= X265_CPU_SSSE3;
+        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
     if (ecx & 0x00080000)
         cpu |= X265_CPU_SSE4;
     if (ecx & 0x00100000)
         cpu |= X265_CPU_SSE42;
-    /* Check OXSAVE and AVX bits */
-    if ((ecx & 0x18000000) == 0x18000000)
+
+    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
     {
         /* Check for OS support */
-        PFX(cpu_xgetbv)(0, &eax, &edx);
-        if ((eax & 0x6) == 0x6)
+        xcr0 = PFX(cpu_xgetbv)(0);
+        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
         {
+            if (ecx & 0x10000000)
             cpu |= X265_CPU_AVX;
             if (ecx & 0x00001000)
                 cpu |= X265_CPU_FMA3;
@@ -178,19 +175,24 @@
     {
         PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
         /* AVX2 requires OS support, but BMI1/2 don't. */
-        if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
-            cpu |= X265_CPU_AVX2;
         if (ebx & 0x00000008)
+            cpu |= X265_CPU_BMI1;
+        if (ebx & 0x00000100)
+            cpu |= X265_CPU_BMI2;
+
+        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
         {
-            cpu |= X265_CPU_BMI1;
-            if (ebx & 0x00000100)
-                cpu |= X265_CPU_BMI2;
+            if (ebx & 0x00000020)
+                cpu |= X265_CPU_AVX2;
+
+            if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
+            {
+                if ((ebx & 0xD0030000) == 0xD0030000)
+                    cpu |= X265_CPU_AVX512;
+            }
         }
     }
 
-    if (cpu & X265_CPU_SSSE3)
-        cpu |= X265_CPU_SSE2_IS_FAST;
-
     PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
     max_extended_cap = eax;
 
@@ -230,8 +232,6 @@
         {
             if (edx & 0x00400000)
                 cpu |= X265_CPU_MMX2;
-            if (!(cpu & X265_CPU_LZCNT))
-                cpu |= X265_CPU_SLOW_CTZ;
             if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
                 cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
         }
@@ -256,7 +256,6 @@
             else if (model == 28)
             {
                 cpu |= X265_CPU_SLOW_ATOM;
-                cpu |= X265_CPU_SLOW_CTZ;
                 cpu |= X265_CPU_SLOW_PSHUFB;
             }
 
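For reference, the detection flow the hunk above implements (CPUID leaf 1 OSXSAVE bit, XGETBV to read XCR0, then CPUID leaf 7 feature bits) can be sketched as standalone C++ with compiler intrinsics. This is only an illustration of the same checks, not code from the patch; it assumes GCC/Clang headers (<cpuid.h>, and _xgetbv needs -mxsave), and the helper name is hypothetical.

    #include <cpuid.h>      // __get_cpuid, __get_cpuid_count (GCC/Clang)
    #include <immintrin.h>  // _xgetbv (assumes -mxsave or equivalent)
    #include <cstdint>

    static bool hasAvx512()
    {
        unsigned eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return false;
        if (!(ecx & 0x08000000))            // OSXSAVE: XGETBV usable, XSAVE enabled by the OS
            return false;
        uint64_t xcr0 = _xgetbv(0);
        if ((xcr0 & 0x6) != 0x6)            // OS saves XMM/YMM state
            return false;
        if ((xcr0 & 0xE0) != 0xE0)          // OS saves opmask/ZMM state
            return false;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return false;
        return (ebx & 0xD0030000) == 0xD0030000;  // AVX512 F+DQ (bits 16/17), CD, BW, VL (bits 28/30/31)
    }
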
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/cpu-a.asm
--- a/source/common/x86/cpu-a.asm	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/x86/cpu-a.asm	Thu Jun 22 10:25:57 2017 +0530
@@ -54,18 +54,16 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void cpu_xgetbv( int op, int *eax, int *edx )
+; uint64_t cpu_xgetbv( int xcr )
 ;-----------------------------------------------------------------------------
-cglobal cpu_xgetbv, 3,7
-    push  r2
-    push  r1
-    mov  ecx, r0d
+cglobal cpu_xgetbv
+    movifnidn ecx, r0m
     xgetbv
-    pop   r4
-    mov [r4], eax
-    pop   r4
-    mov [r4], edx
-    RET
+%if ARCH_X86_64
+    shl       rdx, 32
+    or        rax, rdx
+%endif
+    ret
 
 %if ARCH_X86_64
 
@@ -78,7 +76,7 @@
 %if WIN64
     sub  rsp, 32 ; shadow space
 %endif
-    and  rsp, ~31
+    and  rsp, ~(STACK_ALIGNMENT - 1)
     mov  rax, r0
     mov   r0, r1
     mov   r1, r2
@@ -119,7 +117,7 @@
     push ebp
     mov  ebp, esp
     sub  esp, 12
-    and  esp, ~31
+    and  esp, ~(STACK_ALIGNMENT - 1)
     mov  ecx, [ebp+8]
     mov  edx, [ebp+12]
     mov  [esp], edx
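
A short note on the new cpu_xgetbv contract, since the merge above is easy to misread: XGETBV leaves its result in EDX:EAX, which is already where a 64-bit cdecl return value lives on x86-32, so only the x86-64 path needs the explicit shl/or into RAX. The caller-side view, mirroring the cpu.cpp hunk above (PFX() is x265's existing symbol-prefix macro):

    extern "C" uint64_t PFX(cpu_xgetbv)(int xcr);

    uint64_t xcr0 = PFX(cpu_xgetbv)(0);    // XCR0: which register states the OS saves/restores
    bool ymmOk = (xcr0 & 0x6)  == 0x6;     // XMM + YMM state
    bool zmmOk = (xcr0 & 0xE0) == 0xE0;    // opmask + ZMM_Hi256 + Hi16_ZMM state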
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/x86inc.asm
--- a/source/common/x86/x86inc.asm	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/x86/x86inc.asm	Thu Jun 22 10:25:57 2017 +0530
@@ -325,6 +325,8 @@
 %endmacro
 
 %define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
 
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
@@ -438,15 +440,16 @@
 
 %macro WIN64_PUSH_XMM 0
     ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps [rstk + stack_offset +  8], xmm6
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps [rstk + stack_offset + 24], xmm7
     %endif
-    %if xmm_regs_used > 8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         %assign %%i 8
-        %rep xmm_regs_used-8
+        %rep %%xmm_regs_on_stack
             movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
             %assign %%i %%i+1
         %endrep
@@ -455,8 +458,9 @@
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
         %assign %%pad (xmm_regs_used-8)*16 + 32
         %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
@@ -467,9 +471,10 @@
 
 %macro WIN64_RESTORE_XMM_INTERNAL 0
     %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
             %assign %%i %%i-1
             movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
         %endrep
@@ -482,10 +487,10 @@
             %assign %%pad_size stack_size_padded
         %endif
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     %endif
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
     %endif
 %endmacro
@@ -497,12 +502,12 @@
     %assign xmm_regs_used 0
 %endmacro
 
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6 + high_mm_regs
 
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -526,9 +531,10 @@
 DECLARE_REG 13, R12, 64
 DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    %assign xmm_regs_used %3
     ASSERT regs_used >= num_args
     SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
@@ -538,7 +544,7 @@
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -549,7 +555,7 @@
         %endif
     %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -594,7 +600,7 @@
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -605,7 +611,7 @@
         %endif
     %endif
     POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -710,7 +716,7 @@
     %assign stack_offset 0      ; stack pointer offset relative to the return address
     %assign stack_size 0        ; amount of stack space that can be freely used inside a function
     %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
@@ -768,10 +774,10 @@
 %assign cpuflags_bmi1     (1<<16)| cpuflags_avx | cpuflags_lzcnt
 %assign cpuflags_bmi2     (1<<17)| cpuflags_bmi1
 %assign cpuflags_avx2     (1<<18)| cpuflags_fma3 | cpuflags_bmi2
+%assign cpuflags_avx512   (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
 
-%assign cpuflags_cache32  (1<<19)
-%assign cpuflags_cache64  (1<<20)
-%assign cpuflags_slowctz  (1<<21)
+%assign cpuflags_cache32  (1<<20)
+%assign cpuflags_cache64  (1<<21)
 %assign cpuflags_aligned  (1<<22) ; not a cpu feature, but a function variant
 %assign cpuflags_atom     (1<<23)
 
@@ -829,11 +835,12 @@
     %endif
 %endmacro
 
-; Merge mmx and sse*
+; Merge mmx and sse*, and avx*
 ; m# is a simd register of the currently selected size
 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
 
 %macro CAT_XDEFINE 3
     %xdefine %1%2 %3
@@ -843,6 +850,18 @@
     %undef %1%2
 %endmacro
 
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
 %macro INIT_MMX 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_MMX %1
@@ -858,7 +877,7 @@
         CAT_XDEFINE nnmm, %%i, %%i
         %assign %%i %%i+1
     %endrep
-    %rep 8
+    %rep 24
         CAT_UNDEF m, %%i
         CAT_UNDEF nnmm, %%i
         %assign %%i %%i+1
@@ -872,7 +891,7 @@
     %define mmsize 16
     %define num_mmregs 8
     %if ARCH_X86_64
-        %define num_mmregs 16
+        %define num_mmregs 32
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -885,6 +904,10 @@
         %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
+    %if WIN64
+        ; Swap callee-saved registers with volatile registers
+        AVX512_MM_PERMUTATION 6
+    %endif
 %endmacro
 
 %macro INIT_YMM 0-1+
@@ -893,7 +916,7 @@
     %define mmsize 32
     %define num_mmregs 8
     %if ARCH_X86_64
-        %define num_mmregs 16
+        %define num_mmregs 32
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -906,6 +929,29 @@
         %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define num_mmregs 8
+    %if ARCH_X86_64
+        %define num_mmregs 32
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, zmm %+ %%i
+        CAT_XDEFINE nnzmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+    AVX512_MM_PERMUTATION
 %endmacro
 
 INIT_XMM
@@ -914,18 +960,26 @@
     %define  mmmm%1   mm%1
     %define  mmxmm%1  mm%1
     %define  mmymm%1  mm%1
+    %define  mmzmm%1  mm%1
     %define xmmmm%1   mm%1
     %define xmmxmm%1 xmm%1
     %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
     %define ymmmm%1   mm%1
     %define ymmxmm%1 xmm%1
     %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1   mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
     %define xm%1 xmm %+ m%1
     %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
 %endmacro
 
 %assign i 0
-%rep 16
+%rep 32
     DECLARE_MMCAST i
     %assign i i+1
 %endrep
@@ -1060,12 +1114,17 @@
 ;=============================================================================
 
 %assign i 0
-%rep 16
+%rep 32
     %if i < 8
         CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
     %assign i i+1
 %endrep
 %undef i
@@ -1182,7 +1241,7 @@
     %endmacro
 %endmacro
 
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
 ; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, sse2, 1, 0, 1
 AVX_INSTR addps, sse, 1, 0, 1
@@ -1513,3 +1572,49 @@
 FMA4_INSTR fmsubadd, pd, ps
 FMA4_INSTR fnmadd,   pd, ps, sd, ss
 FMA4_INSTR fnmsub,   pd, ps, sd, ss
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128,   vextractf32x4
+EVEX_INSTR vextracti128,   vextracti32x4
+EVEX_INSTR vinsertf128,    vinsertf32x4
+EVEX_INSTR vinserti128,    vinserti32x4
+EVEX_INSTR vmovdqa,        vmovdqa32
+EVEX_INSTR vmovdqu,        vmovdqu32
+EVEX_INSTR vpand,          vpandd
+EVEX_INSTR vpandn,         vpandnd
+EVEX_INSTR vpor,           vpord
+EVEX_INSTR vpxor,          vpxord
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss,         vrcp14ss,   1
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
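
The EVEX_INSTR wrappers keep the shorter VEX encoding whenever it is legal and only fall back to EVEX when an operand forces it (a zmm register, or one of the new registers 16-31) or when the EVEX form is explicitly preferred (the vrcp14/vrsqrt14 variants, for precision). The selection rule can be modelled roughly like this; an illustrative sketch only, with hypothetical names standing in for the macro's regnumof*/sizeof* symbols:

    struct Operand { int regnum; int sizeBytes; };   // mirrors regnumof* / sizeof*

    static bool needsEvex(bool avx512Enabled, bool preferEvex,
                          const Operand& a, const Operand& b)
    {
        if (avx512Enabled && preferEvex)             // e.g. vrcp14ps: higher precision than vrcpps
            return true;
        if (a.regnum >= 16 || a.sizeBytes > 32)      // xmm16-31/ymm16-31 or any zmm operand
            return true;
        if (b.regnum >= 16 || b.sizeBytes > 32)
            return true;
        return false;                                // otherwise VEX: shorter instruction length
    }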
diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/x86util.asm
--- a/source/common/x86/x86util.asm	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/common/x86/x86util.asm	Thu Jun 22 10:25:57 2017 +0530
@@ -299,6 +299,18 @@
     pminsw %2, %4
 %endmacro
 
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+    punpckhqdq %1, %2
+%elif cpuflag(avx)
+    punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+    pshufd     %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+    movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
 %macro HADDD 2 ; sum junk
 %if sizeof%1 == 32
 %define %2 xmm%2
@@ -324,7 +336,7 @@
 %macro HADDW 2 ; reg, tmp
 %if cpuflag(xop) && sizeof%1 == 16
     vphaddwq  %1, %1
-    movhlps   %2, %1
+    MOVHL     %2, %1
     paddd     %1, %2
 %else
     pmaddwd %1, [pw_1]
@@ -346,7 +358,7 @@
 %macro HADDUW 2
 %if cpuflag(xop) && sizeof%1 == 16
     vphadduwq %1, %1
-    movhlps   %2, %1
+    MOVHL     %2, %1
     paddd     %1, %2
 %else
     HADDUWD   %1, %2
@@ -739,25 +751,25 @@
 %if %6 ; %5 aligned?
     mova       %1, %4
     psubw      %1, %5
+%elif cpuflag(avx)
+    movu       %1, %4
+    psubw      %1, %5
 %else
     movu       %1, %4
     movu       %2, %5
     psubw      %1, %2
 %endif
 %else ; !HIGH_BIT_DEPTH
-%ifidn %3, none
     movh       %1, %4
     movh       %2, %5
+%ifidn %3, none
     punpcklbw  %1, %2
     punpcklbw  %2, %2
+%else
+    punpcklbw  %1, %3
+    punpcklbw  %2, %3
+%endif
     psubw      %1, %2
-%else
-    movh       %1, %4
-    punpcklbw  %1, %3
-    movh       %2, %5
-    punpcklbw  %2, %3
-    psubw      %1, %2
-%endif
 %endif ; HIGH_BIT_DEPTH
 %endmacro
 
diff -r e1ed4d609b52 -r d7e105cac1d0 source/test/testbench.cpp
--- a/source/test/testbench.cpp	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/test/testbench.cpp	Thu Jun 22 10:25:57 2017 +0530
@@ -169,6 +169,7 @@
         { "XOP", X265_CPU_XOP },
         { "AVX2", X265_CPU_AVX2 },
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
+        { "AVX512", X265_CPU_AVX512 },
         { "ARMv6", X265_CPU_ARMV6 },
         { "NEON", X265_CPU_NEON },
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
diff -r e1ed4d609b52 -r d7e105cac1d0 source/test/testharness.h
--- a/source/test/testharness.h	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/test/testharness.h	Thu Jun 22 10:25:57 2017 +0530
@@ -91,7 +91,7 @@
 }
 #endif // ifdef _MSC_VER
 
-#define BENCH_RUNS 1000
+#define BENCH_RUNS 2000
 
 // Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
 // and discards invalid times.  Repeats 1000 times to get a good average.  Then measures
diff -r e1ed4d609b52 -r d7e105cac1d0 source/x265.h
--- a/source/x265.h	Tue Apr 03 13:49:25 2018 +0530
+++ b/source/x265.h	Thu Jun 22 10:25:57 2017 +0530
@@ -382,39 +382,38 @@
 /* CPU flags */
 
 /* x86 */
-#define X265_CPU_CMOV            0x0000001
-#define X265_CPU_MMX             0x0000002
-#define X265_CPU_MMX2            0x0000004  /* MMX2 aka MMXEXT aka ISSE */
+#define X265_CPU_MMX             (1 << 0)
+#define X265_CPU_MMX2            (1 << 1)  /* MMX2 aka MMXEXT aka ISSE */
 #define X265_CPU_MMXEXT          X265_CPU_MMX2
-#define X265_CPU_SSE             0x0000008
-#define X265_CPU_SSE2            0x0000010
-#define X265_CPU_SSE3            0x0000020
-#define X265_CPU_SSSE3           0x0000040
-#define X265_CPU_SSE4            0x0000080  /* SSE4.1 */
-#define X265_CPU_SSE42           0x0000100  /* SSE4.2 */
-#define X265_CPU_LZCNT           0x0000200  /* Phenom support for "leading zero count" instruction. */
-#define X265_CPU_AVX             0x0000400  /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X265_CPU_XOP             0x0000800  /* AMD XOP */
-#define X265_CPU_FMA4            0x0001000  /* AMD FMA4 */
-#define X265_CPU_AVX2            0x0002000  /* AVX2 */
-#define X265_CPU_FMA3            0x0004000  /* Intel FMA3 */
-#define X265_CPU_BMI1            0x0008000  /* BMI1 */
-#define X265_CPU_BMI2            0x0010000  /* BMI2 */
+#define X265_CPU_SSE             (1 << 2)
+#define X265_CPU_SSE2            (1 << 3)
+#define X265_CPU_LZCNT           (1 << 4)
+#define X265_CPU_SSE3            (1 << 5)
+#define X265_CPU_SSSE3           (1 << 6)
+#define X265_CPU_SSE4            (1 << 7)  /* SSE4.1 */
+#define X265_CPU_SSE42           (1 << 8)  /* SSE4.2 */
+#define X265_CPU_AVX             (1 << 9)  /* Requires OS support even if YMM registers aren't used. */
+#define X265_CPU_XOP             (1 << 10)  /* AMD XOP */
+#define X265_CPU_FMA4            (1 << 11)  /* AMD FMA4 */
+#define X265_CPU_FMA3            (1 << 12)  /* Intel FMA3 */
+#define X265_CPU_BMI1            (1 << 13)  /* BMI1 */
+#define X265_CPU_BMI2            (1 << 14)  /* BMI2 */
+#define X265_CPU_AVX2            (1 << 15)  /* AVX2 */
+#define X265_CPU_AVX512          (1 << 16)  /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
 /* x86 modifiers */
-#define X265_CPU_CACHELINE_32    0x0020000  /* avoid memory loads that span the border between two cachelines */
-#define X265_CPU_CACHELINE_64    0x0040000  /* 32/64 is the size of a cacheline in bytes */
-#define X265_CPU_SSE2_IS_SLOW    0x0080000  /* avoid most SSE2 functions on Athlon64 */
-#define X265_CPU_SSE2_IS_FAST    0x0100000  /* a few functions are only faster on Core2 and Phenom */
-#define X265_CPU_SLOW_SHUFFLE    0x0200000  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X265_CPU_STACK_MOD4      0x0400000  /* if stack is only mod4 and not mod16 */
-#define X265_CPU_SLOW_CTZ        0x0800000  /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X265_CPU_SLOW_ATOM       0x1000000  /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X265_CPU_CACHELINE_32    (1 << 17)  /* avoid memory loads that span the border between two cachelines */
+#define X265_CPU_CACHELINE_64    (1 << 18)  /* 32/64 is the size of a cacheline in bytes */
+#define X265_CPU_SSE2_IS_SLOW    (1 << 19)  /* avoid most SSE2 functions on Athlon64 */
+#define X265_CPU_SSE2_IS_FAST    (1 << 20)  /* a few functions are only faster on Core2 and Phenom */
+#define X265_CPU_SLOW_SHUFFLE    (1 << 21)  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X265_CPU_STACK_MOD4      (1 << 22)  /* if stack is only mod4 and not mod16 */
+#define X265_CPU_SLOW_ATOM       (1 << 23)  /* The Atom is terrible: slow SSE unaligned loads, slow
                                              * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
                                              * cacheline split penalties -- gather everything here that
                                              * isn't shared by other CPUs to avoid making half a dozen
                                              * new SLOW flags. */
-#define X265_CPU_SLOW_PSHUFB     0x2000000  /* such as on the Intel Atom */
-#define X265_CPU_SLOW_PALIGNR    0x4000000  /* such as on the AMD Bobcat */
+#define X265_CPU_SLOW_PSHUFB     (1 << 24)  /* such as on the Intel Atom */
+#define X265_CPU_SLOW_PALIGNR    (1 << 25)  /* such as on the AMD Bobcat */
 
 /* ARM */
 #define X265_CPU_ARMV6           0x0000001
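
On the API side, the new "AVX512" entry in cpu_names is what lets the level be selected by name at runtime; the bit itself is X265_CPU_AVX512 above. A hypothetical usage sketch via the existing public param API (the "asm" key and the exact value spelling are assumptions, shown only for illustration):

    x265_param* p = x265_param_alloc();
    x265_param_default(p);
    x265_param_parse(p, "asm", "avx512");   // resolves through cpu_names: full AVX2 chain + X265_CPU_AVX512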

