[x265] [PATCH 251 of 307] x86: AVX512 pixel_var_64x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:09 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512715363 -19800
#      Fri Dec 08 12:12:43 2017 +0530
# Node ID fa954ed4a1e7ce2741f3cac14006f78c3199191b
# Parent  86d3d34de566d7696028b5e798a79b9de3a6e62b
x86: AVX512 pixel_var_64x64

AVX2 performance    : 8.84x
AVX512 performance : 19.93x

diff -r 86d3d34de566 -r fa954ed4a1e7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Dec 07 17:32:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 08 12:12:43 2017 +0530
@@ -4650,6 +4650,7 @@
         p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
         p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
+        p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx512);
         p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
         p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
         p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
diff -r 86d3d34de566 -r fa954ed4a1e7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Dec 07 17:32:55 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Dec 08 12:12:43 2017 +0530
@@ -7934,8 +7934,7 @@
     movd           edx, xm5
 %endif
 %endmacro
-
-%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
 ;-----------------------------------------------------------------------------
 ; int pixel_var_wxh( uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
@@ -7954,8 +7953,55 @@
     PROCESS_VAR_32x8_AVX512
     PROCESS_VAR_AVX512_END
     RET
+
+INIT_ZMM avx512                          ; x86inc: m# are 512-bit zmm registers here, mmsize = 64
+cglobal pixel_var_64x64, 2,4,7           ; r0 = uint8_t* pixels, r1 = stride in bytes; packs sum and sum of squares into rax
+    pxor            m5, m5    ; running sum of pixels, kept as 16-bit lanes inside the loop
+    pxor            m6, m6    ; running sum of squared pixels, kept as 32-bit lanes
+    mov             r2d, 32              ; 32 iterations, two rows each = 64 rows
+
+.loop:
+    pmovzxbw        m0, [r0]                    ; row 0, pixels  0..31, zero-extended bytes -> words
+    pmovzxbw        m3, [r0 + mmsize/2]         ; row 0, pixels 32..63 (mmsize/2 = 32 bytes)
+    pmovzxbw        m1, [r0 + r1]               ; row 1, pixels  0..31
+    pmovzxbw        m4, [r0 + r1 + mmsize/2]    ; row 1, pixels 32..63
+
+    lea             r0, [r0 + 2 * r1]    ; advance two rows; lea leaves the flags untouched
+
+    paddw           m5, m0               ; each word lane gains 4 pixels per iteration:
+    paddw           m5, m3               ;   4 * 32 * 255 = 32640 at most, so the
+    paddw           m5, m1               ;   16-bit lanes cannot overflow for
+    paddw           m5, m4               ;   8-bit input
+    pmaddwd         m0, m0               ; square every word and add adjacent pairs -> dwords
+    pmaddwd         m3, m3
+    pmaddwd         m1, m1
+    pmaddwd         m4, m4
+    paddd           m6, m0               ; accumulate the squared values in 32-bit lanes
+    paddd           m6, m3
+    paddd           m6, m1
+    paddd           m6, m4
+
+    dec             r2d
+    jg              .loop                ; repeat while the row-pair counter is still positive
+
+    pxor            m1, m1               ; zero register used to widen the word sums
+    punpcklwd       m0, m5, m1           ; low words of each 128-bit lane -> zero-extended dwords
+    punpckhwd       m5, m1               ; high words of each 128-bit lane -> zero-extended dwords
+    paddd           m5, m0               ; m5 = pixel sums as 32-bit lanes
+    vextracti32x8  ym2, m5, 1            ; fold 512 -> 256 bits: take the upper halves ...
+    vextracti32x8  ym1, m6, 1
+    paddd          ym5, ym2              ; ... and add them onto the lower halves
+    paddd          ym6, ym1
+    vextracti32x4  xm2, m5, 1            ; fold 256 -> 128 bits: take bits 128..255 ...
+    vextracti32x4  xm1, m6, 1
+    paddd          xm5, xm2              ; ... and add them onto bits 0..127
+    paddd          xm6, xm1
+    HADDD          xm5, xm2              ; project macro (defined elsewhere): presumably sums the dwords into lane 0, 2nd arg scratch
+    HADDD          xm6, xm1
+    punpckldq      xm5, xm6              ; dword 0 = pixel sum, dword 1 = sum of squares
+    movq           rax, xm5              ; return (sum of squares << 32) | sum; needs a 64-bit GPR
+    RET
 %endif
-
 %macro VAR_AVX512_CORE 1 ; accum
 %if %1
     paddw    m0, m2


More information about the x265-devel mailing list