[x265] [PATCH 278 of 307] x86: AVX512 ssd_ss and ssd_pp for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:36 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515047865 -19800
# Thu Jan 04 12:07:45 2018 +0530
# Node ID c225f79142184cb78ded05ba791edc9b3a40b3da
# Parent 4e9f2efdfd097910aa5bf704a4bbf38b0a28f2a5
x86: AVX512 ssd_ss and ssd_pp for high bit depth
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x32 | 6.83x | 12.15x
64x64 | 14.36x | 20.24x
diff -r 4e9f2efdfd09 -r c225f7914218 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 29 09:52:27 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jan 04 12:07:45 2018 +0530
@@ -3124,6 +3124,14 @@
p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+
+ p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx512);
+ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx512);
+ p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx512);
+ p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512);
+
}
#endif
}
diff -r 4e9f2efdfd09 -r c225f7914218 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Fri Dec 29 09:52:27 2017 +0530
+++ b/source/common/x86/ssd-a.asm Thu Jan 04 12:07:45 2018 +0530
@@ -703,8 +703,109 @@
paddq xm3, xm4
movq rax, xm3
RET
+
+INIT_ZMM avx512                 ; select ZMM-wide registers for the routine below
+cglobal pixel_ssd_32x2          ; internal helper (no arg/reg counts, so no prologue): adds the squared differences of two rows into m3
+ pxor m0, m0                    ; m0 = per-call dword accumulator
+ movu m1, [r0]                  ; row 0: one full vector of 16-bit samples from the first block
+ psubw m1, [r2]                 ; word-wise difference against the second block
+ pmaddwd m1, m1                 ; square each difference and add adjacent pairs -> dwords
+ paddd m0, m1
+ movu m1, [r0 + r1]             ; row 1: same sequence one stride further (r1/r3 are used as byte offsets)
+ psubw m1, [r2 + r3]
+ pmaddwd m1, m1
+ paddd m0, m1
+ lea r0, [r0 + r1 * 2]          ; advance both pointers by two rows for the next call
+ lea r2, [r2 + r3 * 2]
+
+ mova m1, m0                    ; copy the dword sums so both halves can be widened
+ pxor m2, m2                    ; zero source for the interleave below
+ punpckldq m0, m2               ; interleave with zero: low dwords of each 128-bit lane become zero-extended qwords
+ punpckhdq m1, m2               ; same for the high dwords of each lane
+
+ paddq m3, m0                   ; m3 is the running 64-bit total; it is NOT cleared here, the caller must zero it
+ paddq m3, m1
+ret                             ; lowercase ret; NOTE(review): presumably a bare return that leaves vector state (m3) intact, confirm against x86inc
+
+INIT_ZMM avx512
+cglobal pixel_ssd_32x32, 4,5,5  ; in: r0/r2 = block pointers, r1/r3 = strides in samples; out: rax = sum of squared differences
+ ; Double the strides at full register width: a 32-bit shl would zero the upper half of a negative stride.
+ FIX_STRIDES r1, r3             ; samples -> bytes, same macro pixel_ssd_64x64 uses in this file
+ pxor m3, m3                    ; m3 = 64-bit accumulator that pixel_ssd_32x2 adds into
+ mov r4d, 16                    ; 16 helper calls x 2 rows each = 32 rows
+.iterate:
+ call pixel_ssd_32x2            ; accumulates two rows into m3 and advances r0/r2 by two rows
+ dec r4d
+ jne .iterate
+
+ vextracti32x8 ym4, m3, 1       ; horizontal reduction: fold the upper 256 bits onto the lower 256
+ paddq ym3, ym4
+ vextracti32x4 xm4, m3, 1       ; fold bits 128..255 onto bits 0..127
+ paddq xm3, xm4
+ movhlps xm4, xm3               ; bring the high qword down next to the low qword
+ paddq xm3, xm4
+ movq rax, xm3                  ; return the 64-bit total
+RET
+
+INIT_ZMM avx512
+cglobal pixel_ssd_32x64, 4,5,5  ; in: r0/r2 = block pointers, r1/r3 = strides in samples; out: rax = sum of squared differences
+ ; Double the strides at full register width: a 32-bit shl would zero the upper half of a negative stride.
+ FIX_STRIDES r1, r3             ; samples -> bytes, same macro pixel_ssd_64x64 uses in this file
+ pxor m3, m3                    ; m3 = 64-bit accumulator that pixel_ssd_32x2 adds into
+ mov r4d, 32                    ; 32 helper calls x 2 rows each = 64 rows
+.iterate:
+ call pixel_ssd_32x2            ; accumulates two rows into m3 and advances r0/r2 by two rows
+ dec r4d
+ jne .iterate
+
+ vextracti32x8 ym4, m3, 1       ; horizontal reduction: fold the upper 256 bits onto the lower 256
+ paddq ym3, ym4
+ vextracti32x4 xm4, m3, 1       ; fold bits 128..255 onto bits 0..127
+ paddq xm3, xm4
+ movhlps xm4, xm3               ; bring the high qword down next to the low qword
+ paddq xm3, xm4
+ movq rax, xm3                  ; return the 64-bit total
+RET
+
+INIT_ZMM avx512
+cglobal pixel_ssd_64x64, 4,5,5  ; in: r0/r2 = block pointers, r1/r3 = strides in samples; out: rax = sum of squared differences
+ FIX_STRIDES r1, r3             ; convert the strides to byte offsets
+ mov r4d, 64                    ; row counter: one row per loop iteration
+ pxor m3, m3                    ; m3 = running 64-bit total
+ pxor m2, m2                    ; constant zero for the widening interleave; nothing in the loop writes m2
+
+.loop:
+ pxor m0, m0                    ; m0 = dword sums for this row
+ movu m1, [r0]                  ; first vector of the row
+ psubw m1, [r2]                 ; word-wise difference
+ pmaddwd m1, m1                 ; square and add adjacent pairs -> dwords
+ paddd m0, m1
+ movu m1, [r0 + mmsize]         ; second vector of the row, one register width further
+ psubw m1, [r2 + mmsize]
+ pmaddwd m1, m1
+ paddd m0, m1
+
+ lea r0, [r0 + r1]              ; step both pointers to the next row
+ lea r2, [r2 + r3]
+
+ mova m1, m0                    ; copy so both dword halves of each lane can be widened
+ punpckldq m0, m2               ; low dwords of each 128-bit lane -> zero-extended qwords
+ punpckhdq m1, m2               ; high dwords of each 128-bit lane -> zero-extended qwords
+ paddq m3, m0                   ; accumulate in 64 bits so the total cannot wrap a dword
+ paddq m3, m1
+
+ dec r4d
+ jg .loop
+
+ vextracti32x8 ym4, m3, 1       ; horizontal reduction: fold the upper 256 bits onto the lower 256
+ paddq ym3, ym4
+ vextracti32x4 xm4, m3, 1       ; fold bits 128..255 onto bits 0..127
+ paddq xm3, xm4
+ movhlps xm4, xm3               ; bring the high qword down next to the low qword
+ paddq xm3, xm4
+ movq rax, xm3                  ; return the 64-bit total
+ RET
%endif
-
INIT_MMX mmx2
SSD_ONE 4, 4
SSD_ONE 4, 8
More information about the x265-devel mailing list