[x265] [PATCH 050 of 307] x86: AVX512 ssd_s_16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:48 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500967696 -19800
# Tue Jul 25 12:58:16 2017 +0530
# Node ID 09159f73f47b7eda15c8d0294774fe6eafdadea7
# Parent a75dd880817adddafac5e1105e512ea79c7a089b
x86: AVX512 ssd_s_16
This patch also reworks ssd_s_32 to support high bit depth
ssd_s_16
AVX2 performance : 14.11x
AVX512 performance : 16.14x
ssd_s_32 for high bit depth
AVX2 performance : 14.78x
AVX512 performance : 20.54x
diff -r a75dd880817a -r 09159f73f47b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 26 10:04:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 12:58:16 2017 +0530
@@ -2249,6 +2249,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
+ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
@@ -3919,6 +3921,7 @@
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
+ p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
diff -r a75dd880817a -r 09159f73f47b source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Wed Jul 26 10:04:24 2017 +0530
+++ b/source/common/x86/ssd-a.asm Tue Jul 25 12:58:16 2017 +0530
@@ -3425,10 +3425,28 @@
paddd m0, m1
%endmacro
+%macro PROCESS_SSD_S_16x8_AVX512 0 ; add the squares of 8 rows x 16 int16 values at r0 into the dword lanes of m0
+ movu ym1, [r0]                    ; row 0 (32 bytes = 16 words) -> low half of m1
+ vinserti32x8 m1, [r0 + r1], 1     ; row 1 -> high half of m1 (r1 = row pitch in bytes)
+ movu ym2, [r0 + 2 * r1]           ; row 2 -> low half of m2
+ vinserti32x8 m2, [r0 + r3], 1     ; row 3 -> high half of m2 (caller must preload r3 = 3 * r1)
+ lea r0, [r0 + 4 * r1]             ; step down four rows; r0 is NOT restored on exit
+ movu ym3, [r0]                    ; row 4 -> low half of m3
+ vinserti32x8 m3, [r0 + r1], 1     ; row 5 -> high half of m3
+ movu ym4, [r0 + 2 * r1]           ; row 6 -> low half of m4
+ vinserti32x8 m4, [r0 + r3], 1     ; row 7 -> high half of m4
+ pmaddwd m1, m1                    ; square every word and add adjacent pairs -> 16 dword partial sums
+ pmaddwd m2, m2                    ; same for rows 2-3
+ pmaddwd m3, m3                    ; same for rows 4-5
+ pmaddwd m4, m4                    ; same for rows 6-7
+ paddd m1, m2                      ; combine rows 0-3
+ paddd m3, m4                      ; combine rows 4-7
+ paddd m1, m3                      ; combine all eight rows
+ paddd m0, m1                      ; fold into the running accumulator m0 (clobbers m1-m4)
+%endmacro
;-----------------------------------------------------------------------------
; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH==0
INIT_ZMM avx512
cglobal pixel_ssd_s_32, 2,4,5
add r1, r1
@@ -3444,10 +3462,39 @@
PROCESS_SSD_S_32x8_AVX512
; calculate sum and return
+%if BIT_DEPTH >= 10
+ movu m1, m0
+ pxor m2, m2
+ punpckldq m0, m2
+ punpckhdq m1, m2
+ paddq m0, m1
+ vextracti32x8 ym2, m0, 1
+ paddq ym0, ym2
+ vextracti32x4 xm2, m0, 1
+ paddq xm2, xm0
+ movhlps xm1, xm2
+ paddq xm2, xm1
+ movq rax, xm2
+%else
+ HADDD m0, m1
+ movd eax, xm0
+%endif
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_ssd_s_16, 2,4,5      ; x86inc: 2 args (r0 = ref, r1 = stride), 4 GPRs, 5 vector registers (m0-m4)
+ add r1, r1                        ; double the stride: presumably int16_t elements -> bytes
+ lea r3, [r1 * 3]                  ; r3 = 3 * byte stride, required by PROCESS_SSD_S_16x8_AVX512
+ pxor m0, m0                       ; clear the dword accumulator
+
+ PROCESS_SSD_S_16x8_AVX512         ; rows 0-7; the macro leaves r0 pointing at row 4
+ lea r0, [r0 + 4 * r1]             ; advance r0 from row 4 to row 8
+ PROCESS_SSD_S_16x8_AVX512         ; rows 8-15
+
+ ; calculate sum and return -- NOTE(review): only a 32-bit total is returned in eax, unlike the BIT_DEPTH >= 10 path of pixel_ssd_s_32; confirm this is never used for high bit depth
HADDD m0, m1
movd eax, xm0
RET
-%endif
;-----------------------------------------------------------------------------
; ssd_s avx512 code end
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list