[x265] [PATCH 251 of 307] x86: AVX512 pixel_var_64x64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:09 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512715363 -19800
# Fri Dec 08 12:12:43 2017 +0530
# Node ID fa954ed4a1e7ce2741f3cac14006f78c3199191b
# Parent 86d3d34de566d7696028b5e798a79b9de3a6e62b
x86: AVX512 pixel_var_64x64
AVX2 performance : 8.84x
AVX512 performance : 19.93x
diff -r 86d3d34de566 -r fa954ed4a1e7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Dec 07 17:32:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 08 12:12:43 2017 +0530
@@ -4650,6 +4650,7 @@
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
+ p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx512);
p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
diff -r 86d3d34de566 -r fa954ed4a1e7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Dec 07 17:32:55 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Dec 08 12:12:43 2017 +0530
@@ -7934,8 +7934,7 @@
movd edx, xm5
%endif
%endmacro
-
-%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
@@ -7954,8 +7953,55 @@
PROCESS_VAR_32x8_AVX512
PROCESS_VAR_AVX512_END
RET
+
+INIT_ZMM avx512 ; x86inc setup: mN name ZMM registers below (mmsize presumably 64 -- confirm against x86inc)
+cglobal pixel_var_64x64, 2,4,7 ; r0 = pixel pointer, r1 = stride (per the pixel_var_wxh prototype above); result is returned packed in rax
+ pxor m5, m5 ; m5 = running 16-bit pixel sums, one per word lane
+ pxor m6, m6 ; m6 = running 32-bit sums of squared pixels, one per dword lane
+ mov r2d, 32 ; loop counter: two rows per pass, 32 passes = 64 rows
+; Each pass reads two rows; every row is loaded as two halves of mmsize/2 bytes, widened to words.
+.loop:
+ pmovzxbw m0, [r0] ; row 0, first half: bytes zero-extended to words
+ pmovzxbw m3, [r0 + mmsize/2] ; row 0, second half
+ pmovzxbw m1, [r0 + r1] ; row 1, first half
+ pmovzxbw m4, [r0 + r1 + mmsize/2] ; row 1, second half
+
+ lea r0, [r0 + 2 * r1] ; step the pointer down two rows
+; A word lane of m5 receives 4 values per pass: at most 4 * 32 * 255 = 32640 in total, so it cannot wrap.
+ paddw m5, m0
+ paddw m5, m3
+ paddw m5, m1
+ paddw m5, m4
+ pmaddwd m0, m0 ; square each word and add adjacent pairs -> dwords
+ pmaddwd m3, m3
+ pmaddwd m1, m1
+ pmaddwd m4, m4
+ paddd m6, m0 ; a dword lane gains at most 4 * 2 * 255^2 per pass, far below 2^32 over 32 passes
+ paddd m6, m3
+ paddd m6, m1
+ paddd m6, m4
+
+ dec r2d
+ jg .loop ; keep going while the counter is still positive
+; Reduction: widen the word sums to dwords, then fold both accumulators 512 -> 256 -> 128 bits.
+ pxor m1, m1 ; zero register, supplies the high half when widening
+ punpcklwd m0, m5, m1 ; low four words of every 128-bit lane -> dwords
+ punpckhwd m5, m1 ; high four words of every 128-bit lane -> dwords
+ paddd m5, m0 ; m5 = 32-bit partial pixel sums
+ vextracti32x8 ym2, m5, 1 ; upper 256 bits of the pixel sums (AVX512DQ instruction)
+ vextracti32x8 ym1, m6, 1 ; upper 256 bits of the squared sums
+ paddd ym5, ym2
+ paddd ym6, ym1
+ vextracti32x4 xm2, m5, 1 ; bits 128..255 of the folded pixel sums
+ vextracti32x4 xm1, m6, 1 ; bits 128..255 of the folded squared sums
+ paddd xm5, xm2
+ paddd xm6, xm1
+ HADDD xm5, xm2 ; macro defined outside this hunk; presumably a horizontal dword add using xm2 as scratch -- confirm
+ HADDD xm6, xm1 ; same for the squared sums, with xm1 as scratch
+ punpckldq xm5, xm6 ; interleave low dwords: dword 0 from xm5 (sum), dword 1 from xm6 (sum of squares)
+ movq rax, xm5 ; 64-bit return: low 32 bits from xm5, high 32 bits from xm6; needs a 64-bit GPR, hence the ARCH_X86_64 guard
+ RET
%endif
-
%macro VAR_AVX512_CORE 1 ; accum
%if %1
paddw m0, m2
More information about the x265-devel
mailing list