[x265] [PATCH 069 of 307] x86: AVX512 pixel_var_32x32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:07 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501843838 -19800
# Fri Aug 04 16:20:38 2017 +0530
# Node ID 039ed71e123c3e14bfaabbe3aada944157784b36
# Parent c5b5b7cb9bbef4365692bfaf05a2a83796d5f1b0
x86: AVX512 pixel_var_32x32
AVX2 performance : 9.15x
AVX512 performance : 13.49x
diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Aug 04 16:20:38 2017 +0530
@@ -3929,6 +3929,7 @@
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+ p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Aug 04 14:27:51 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Aug 04 16:20:38 2017 +0530
@@ -7105,6 +7105,82 @@
RET
%endif ; !HIGH_BIT_DEPTH
+%macro PROCESS_VAR_32x8_AVX512 0 ; accumulate 8 rows: m4 += pixels (word lanes), m5 += pixels^2 (dword lanes); clobbers m0-m3; advances r0 by 4 rows only
+ pmovzxbw m0, [r0]                 ; row 0: zero-extend each byte to a word
+ pmovzxbw m1, [r0 + r1]            ; row 1 (r1 is used as the row stride)
+ pmovzxbw m2, [r0 + 2 * r1]        ; row 2
+ pmovzxbw m3, [r0 + r2]            ; row 3 (expects r2 = 3 * r1, set up by the caller)
+
+ paddw m4, m0                      ; m4: running per-lane word sums of the pixels
+ paddw m4, m1
+ paddw m4, m2
+ paddw m4, m3
+ pmaddwd m0, m0                    ; square each word and add adjacent pairs -> dword lanes
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m5, m0                      ; m5: running per-lane dword sums of squares
+ paddd m5, m1
+ paddd m5, m2
+ paddd m5, m3
+
+ lea r0, [r0 + r1 * 4]             ; step to the second group of 4 rows
+
+ pmovzxbw m0, [r0]                 ; row 4
+ pmovzxbw m1, [r0 + r1]            ; row 5
+ pmovzxbw m2, [r0 + 2 * r1]        ; row 6
+ pmovzxbw m3, [r0 + r2]            ; row 7
+
+ paddw m4, m0                      ; same accumulation as for rows 0-3
+ paddw m4, m1
+ paddw m4, m2
+ paddw m4, m3
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m5, m0
+ paddd m5, m1
+ paddd m5, m2
+ paddd m5, m3
+%endmacro ; on exit r0 still points at row 4 of this group; the caller adds the remaining 4 rows
+
+%macro PROCESS_VAR_AVX512_END 0 ; reduce accumulators m4 (word sums) and m5 (dword square sums) into rax; clobbers m0-m2
+ vextracti32x8 ym0, m4, 1          ; ym0 = upper 256 bits of the sum accumulator
+ vextracti32x8 ym1, m5, 1          ; ym1 = upper 256 bits of the square-sum accumulator
+ paddw ym4, ym0                    ; fold 512 -> 256 bits (sums)
+ paddd ym5, ym1                    ; fold 512 -> 256 bits (square sums)
+ vextracti32x4 xm0, m4, 1          ; xm0 = bits 128..255 of the folded sums
+ vextracti32x4 xm1, m5, 1          ; xm1 = bits 128..255 of the folded square sums
+ paddw xm4, xm0                    ; fold 256 -> 128 bits (sums)
+ paddd xm5, xm1                    ; fold 256 -> 128 bits (square sums)
+ HADDW xm4, xm2                    ; helper macro defined elsewhere; presumably leaves the horizontal word total in the low dword (xm2 = scratch) - confirm
+ HADDD xm5, xm1                    ; helper macro defined elsewhere; presumably leaves the horizontal dword total in the low dword (xm1 = scratch) - confirm
+ punpckldq xm4, xm5                ; low qword = { dword 0 of xm4, dword 0 of xm5 }
+ movq rax, xm4                     ; rax = low dword from m4 (sum) | low dword from m5 (sum of squares) << 32
+%endmacro
+
+%if HIGH_BIT_DEPTH==0
+;-----------------------------------------------------------------------------
+; uint64_t pixel_var_32x32( uint8_t *pix, intptr_t stride ) - NOTE(review): 64-bit packed result in rax (was documented as int); confirm the C prototype
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_var_32x32, 2,4,6 ; r0 = pixel pointer, r1 = row stride (per the signature above); only r0-r2 and m0-m5 are touched below
+ pxor m4, m4 ; running pixel sum, word lanes (32 rows of byte pixels: at most 32 * 255 = 8160 per lane)
+ pxor m5, m5 ; running sum of squared pixels, dword lanes
+ lea r2, [3 * r1]                  ; r2 = 3 * stride, used by the macro to address every 4th row
+
+ PROCESS_VAR_32x8_AVX512           ; rows 0-7
+ lea r0, [r0 + r1 * 4]             ; the macro advanced r0 by only 4 rows; skip the other 4
+ PROCESS_VAR_32x8_AVX512           ; rows 8-15
+ lea r0, [r0 + r1 * 4]
+ PROCESS_VAR_32x8_AVX512           ; rows 16-23
+ lea r0, [r0 + r1 * 4]
+ PROCESS_VAR_32x8_AVX512           ; rows 24-31
+ PROCESS_VAR_AVX512_END            ; reduce m4/m5 and pack the two totals into rax
+ RET
+%endif ; HIGH_BIT_DEPTH==0
+
%macro VAR_AVX512_CORE 1 ; accum
%if %1
paddw m0, m2
More information about the x265-devel
mailing list