[x265] [PATCH 042 of 307] x86:AVX512 ssd_s_32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:40 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500629149 -19800
# Fri Jul 21 14:55:49 2017 +0530
# Node ID 6b3b8ef0f37e0f7860f4f43c99e581674b19f9e3
# Parent 156acfb1bbb3cee56ed7b3337850a1fc9e4429ee
x86:AVX512 ssd_s_32
AVX2 performance : 7.37x
AVX512 performance : 13.06x
diff -r 156acfb1bbb3 -r 6b3b8ef0f37e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 24 14:55:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jul 21 14:55:49 2017 +0530
@@ -3858,6 +3858,7 @@
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
diff -r 156acfb1bbb3 -r 6b3b8ef0f37e source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Mon Jul 24 14:55:38 2017 +0530
+++ b/source/common/x86/ssd-a.asm Fri Jul 21 14:55:49 2017 +0530
@@ -3389,3 +3389,65 @@
movd eax, xm0
%endif
RET
+
+;-----------------------------------------------------------------------------
+; ssd_s avx512 code start
+;-----------------------------------------------------------------------------
+%macro PROCESS_SSD_S_32x8_AVX512 0
+ ; m0 += squares of the 16-bit words in 8 rows (one vector load per row).
+ ; In: r0 = first row, r1 = row pitch in bytes, r3 = 3 * r1. Scratch: m1-m4.
+ ; Out: r0 is left pointing at row 4; the caller steps past rows 4-7.
+ movu m1, [r0] ; row 0
+ pmaddwd m1, m1 ; square each word, add adjacent pairs into dwords
+ movu m2, [r0 + r1] ; row 1
+ pmaddwd m2, m2
+ paddd m1, m2 ; m1 = rows 0 + 1
+ movu m3, [r0 + 2 * r1] ; row 2
+ pmaddwd m3, m3
+ movu m4, [r0 + r3] ; row 3
+ pmaddwd m4, m4
+ paddd m3, m4 ; m3 = rows 2 + 3
+ paddd m1, m3 ; m1 = rows 0..3
+ paddd m0, m1 ; fold into the running total
+
+ lea r0, [r0 + 4 * r1] ; move on to rows 4..7
+ movu m1, [r0] ; row 4
+ pmaddwd m1, m1
+ movu m2, [r0 + r1] ; row 5
+ pmaddwd m2, m2
+ paddd m1, m2 ; m1 = rows 4 + 5
+ movu m3, [r0 + 2 * r1] ; row 6
+ pmaddwd m3, m3
+ movu m4, [r0 + r3] ; row 7
+ pmaddwd m4, m4
+ paddd m3, m4 ; m3 = rows 6 + 7
+ paddd m1, m3 ; m1 = rows 4..7
+ paddd m0, m1 ; fold into the running total
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_ssd_s_32( int16_t *ref, intptr_t i_stride )
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH==0
+INIT_ZMM avx512
+cglobal pixel_ssd_s_32, 2,4,5
+ ; r0 = pointer to the int16_t block, r1 = its stride (first and second argument)
+ pxor m0, m0 ; m0 = running dword sums of squares, starts at zero
+ add r1, r1 ; double the stride so it can be used as a byte offset
+ lea r3, [r1 * 3] ; r3 = byte offset of the fourth row in each group of four
+
+ ; 4 macro calls x 8 rows; each call stops at its row 4, so skip 4 more rows between calls
+%rep 3
+ PROCESS_SSD_S_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+%endrep
+ PROCESS_SSD_S_32x8_AVX512
+
+ ; reduce the dword lanes of m0 with HADDD (m1 is its scratch) and return the low dword
+ HADDD m0, m1
+ movd eax, xm0
+ RET
+%endif
+;-----------------------------------------------------------------------------
+; ssd_s avx512 code end
+;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list