[x265] [PATCH 047 of 307] x86: AVX512 convert_p2s_64xN for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:45 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500987523 -19800
# Tue Jul 25 18:28:43 2017 +0530
# Node ID d05b920865e7c9e8cc9441e77df888b48acb50d1
# Parent 9e1401dcdfc3c9fb633d81b7b39321ac5969a245
x86: AVX512 convert_p2s_64xN for high bit depth
Size | AVX2 performance | AVX512 performance
----------------------------------------------
64x16 | 10.53x | 18.40x
64x32 | 11.10x | 19.51x
64x48 | 11.14x | 19.07x
64x64 | 11.26x | 20.25x
diff -r 9e1401dcdfc3 -r d05b920865e7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 25 16:23:42 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 18:28:43 2017 +0530
@@ -2230,6 +2230,10 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512);
+ p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512);
+ p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
+ p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
+ p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
}
}
#else // if HIGH_BIT_DEPTH
diff -r 9e1401dcdfc3 -r d05b920865e7 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Jul 25 16:23:42 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Tue Jul 25 18:28:43 2017 +0530
@@ -301,6 +301,183 @@
FILTER_VER_LUMA_sse2 ps, 64, 16
FILTER_VER_LUMA_sse2 ps, 16, 64
+;-----------------------------------------------------------------------------
+; p2s avx512 code start
+;
+; P2S_64x8_AVX512 — convert an 8-row x 64-column tile of 16-bit pixels to the
+; signed "short" coefficient domain:  dst = (src << (14 - BIT_DEPTH)) - 0x2000
+; Register contract (set up by every caller before the first invocation):
+;   r0 = src pointer              r1 = srcStride in bytes
+;   r2 = dst pointer              r3 = dstStride in bytes
+;   r4 = dstStride * 3            r5 = srcStride * 3
+;   m4 = pw_2000 broadcast into every word lane (the subtracted offset)
+; One zmm register (mmsize = 64 bytes) holds 32 words, so each 64-pixel row
+; needs two load/store pairs: [base] and [base + mmsize].
+; NOTE: the macro advances r0/r2 by 4 rows once internally (between the two
+; 4-row groups) but NOT after the second group; callers must add the final
+; 4-row step themselves before the next invocation.
+;-----------------------------------------------------------------------------
+%macro P2S_64x8_AVX512 0
+ movu m0, [r0]                  ; rows 0..3, left 32 pixels of each row
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r5]             ; r5 = 3 * srcStride
+ psllw m0, (14 - BIT_DEPTH)     ; scale pixels up into the 14-bit domain
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4                   ; recenter: subtract 0x2000 per word
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [r2], m0
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3             ; r4 = 3 * dstStride
+
+ movu m0, [r0 + mmsize]         ; rows 0..3, right 32 pixels of each row
+ movu m1, [r0 + r1 + mmsize]
+ movu m2, [r0 + r1 * 2 + mmsize]
+ movu m3, [r0 + r5 + mmsize]
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [r2 + mmsize], m0
+ movu [r2 + r3 + mmsize], m1
+ movu [r2 + r3 * 2 + mmsize], m2
+ movu [r2 + r4 + mmsize], m3
+
+ lea r0, [r0 + r1 * 4]          ; advance src/dst to rows 4..7
+ lea r2, [r2 + r3 * 4]
+
+ movu m0, [r0]                  ; rows 4..7, left 32 pixels
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r5]
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [r2], m0
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3
+
+ movu m0, [r0 + mmsize]         ; rows 4..7, right 32 pixels
+ movu m1, [r0 + r1 + mmsize]
+ movu m2, [r0 + r1 * 2 + mmsize]
+ movu m3, [r0 + r5 + mmsize]
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [r2 + mmsize], m0         ; pointers now rest at row 4 of this tile
+ movu [r2 + r3 + mmsize], m1
+ movu [r2 + r3 * 2 + mmsize], m2
+ movu [r2 + r4 + mmsize], m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+; 64x16 variant: two P2S_64x8_AVX512 tiles (8 rows each). The macro leaves
+; the pointers 4 rows short of the next tile, so one explicit 4-row step
+; goes between invocations.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x16, 4, 6, 5
+ add r1d, r1d                   ; srcStride: 16-bit pixel units -> bytes
+ add r3d, r3d                   ; dstStride: int16 units -> bytes
+ lea r4, [r3 * 3]               ; r4 = 3 * dstStride (macro contract)
+ lea r5, [r1 * 3]               ; r5 = 3 * srcStride (macro contract)
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]  ; m4 = 0x2000 in every word lane
+ P2S_64x8_AVX512                ; rows 0-7
+ lea r0, [r0 + r1 * 4]          ; finish the macro's pending 4-row advance
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 8-15
+ RET
+
+; 64x32 variant: four 8-row P2S_64x8_AVX512 tiles, with the explicit 4-row
+; pointer step between tiles (see macro contract).
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x32, 4, 6, 5
+ add r1d, r1d                   ; srcStride in bytes
+ add r3d, r3d                   ; dstStride in bytes
+ lea r4, [r3 * 3]               ; 3 * dstStride
+ lea r5, [r1 * 3]               ; 3 * srcStride
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]  ; 0x2000 per word lane
+ P2S_64x8_AVX512                ; rows 0-7
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 8-15
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 16-23
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 24-31
+ RET
+
+; 64x48 variant: six 8-row P2S_64x8_AVX512 tiles.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x48, 4, 6, 5
+ add r1d, r1d                   ; srcStride in bytes
+ add r3d, r3d                   ; dstStride in bytes
+ lea r4, [r3 * 3]               ; 3 * dstStride
+ lea r5, [r1 * 3]               ; 3 * srcStride
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]  ; 0x2000 per word lane
+ P2S_64x8_AVX512                ; rows 0-7
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 8-15
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 16-23
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 24-31
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 32-39
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 40-47
+ RET
+
+; 64x64 variant: eight 8-row P2S_64x8_AVX512 tiles.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x64, 4, 6, 5
+ add r1d, r1d                   ; srcStride in bytes
+ add r3d, r3d                   ; dstStride in bytes
+ lea r4, [r3 * 3]               ; 3 * dstStride
+ lea r5, [r1 * 3]               ; 3 * srcStride
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]  ; 0x2000 per word lane
+ P2S_64x8_AVX512                ; rows 0-7
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 8-15
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 16-23
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 24-31
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 32-39
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 40-47
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 48-55
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_64x8_AVX512                ; rows 56-63
+ RET
+;-----------------------------------------------------------------------------------------------------------------------------
+;p2s avx512 code end
+;-----------------------------------------------------------------------------------------------------------------------------
%macro PROCESS_LUMA_VER_W4_4R 0
movq m0, [r0]
More information about the x265-devel mailing list