[x265] [PATCH 031 of 307] x86: AVX512 convert_p2s_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:29 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500445753 -19800
# Wed Jul 19 11:59:13 2017 +0530
# Node ID 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
# Parent a77082ebfa67b40f3dbb8cd45b54c17e710a104c
x86: AVX512 convert_p2s_32xN
Size | AVX2 performance | AVX512 performance
------------------------------------------------
32x8 | 1.51x | 1.54x
32x16 | 2.18x | 3.62x
32x24 | 2.26x | 3.58x
32x32 | 2.28x | 3.94x
32x64 | 2.20x | 4.06x
diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530
@@ -3836,6 +3836,19 @@
p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+ p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+ p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
+ p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
+ p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
+ p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
}
#endif
diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530
@@ -1956,6 +1956,184 @@
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
+; PROCESS_P2S_32x8_AVX512: converts 8 rows of 8-bit pixels into 16-bit words,
+; writing (pixel << 6) - m4 for every pixel.  When expanded under INIT_ZMM each
+; mN is a ZMM register, so one pmovzxbw widens 32 source bytes to 32 words.
+; Register contract (established by the callers below):
+;   r0 = source pointer        r1 = source stride        r5 = r1 * 3
+;   r2 = destination pointer   r3 = destination stride after the caller doubled it
+;                                   (presumably elements -> bytes; confirm)
+;   r6 = r3 * 3                m4 = pw_2000 broadcast across the register
+; Clobbers m0-m3.  On exit r0/r2 have advanced by only 4 rows (the single lea
+; pair in the middle), so a caller must step both pointers by another 4 rows
+; before expanding the macro again.
+%macro PROCESS_P2S_32x8_AVX512 0
+ ; rows 0-3: load and zero-extend bytes to words
+ pmovzxbw m0, [r0]
+ pmovzxbw m1, [r0 + r1]
+ pmovzxbw m2, [r0 + r1 * 2]
+ pmovzxbw m3, [r0 + r5]
+
+ ; scale by 64, then subtract the constant held in m4
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ ; unaligned stores of rows 0-3
+ movu [r2], m0
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r6], m3
+
+ ; step both pointers to row 4 (the only pointer update in this macro)
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+
+ ; rows 4-7: same load / shift / subtract / store sequence
+ pmovzxbw m0, [r0]
+ pmovzxbw m1, [r0 + r1]
+ pmovzxbw m2, [r0 + r1 * 2]
+ pmovzxbw m3, [r0 + r5]
+
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ movu [r2], m0
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r6], m3
+%endmacro
+
+; filterPixelToShort_32x8, AVX-512: a single expansion of
+; PROCESS_P2S_32x8_AVX512 covers all 8 rows.
+; r0 = src, r1 = srcStride, r2 = dst (loaded by cglobal); dstStride is read
+; from r3m by hand.  Scratch: r3, r5, r6, m0-m4.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x8, 3, 7, 5
+    vpbroadcastd    m4, [pw_2000]       ; constant the macro subtracts from every shifted row
+    lea             r5, [r1 * 3]        ; r5 = 3 * srcStride (address of row 3)
+    mov             r3d, r3m            ; fetch dstStride (fourth argument)
+    add             r3d, r3d            ; double it before using it as a store offset
+    lea             r6, [r3 * 3]        ; r6 = 3 * doubled dstStride
+
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+; filterPixelToShort_32x16, AVX-512: two expansions of
+; PROCESS_P2S_32x8_AVX512 (8 rows each).
+; r0 = src, r1 = srcStride, r2 = dst (loaded by cglobal); dstStride is read
+; from r3m by hand.  Scratch: r3, r5, r6, m0-m4.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x16, 3, 7, 5
+    vpbroadcastd    m4, [pw_2000]       ; constant the macro subtracts from every shifted row
+    lea             r5, [r1 * 3]        ; r5 = 3 * srcStride (address of row 3)
+    mov             r3d, r3m            ; fetch dstStride (fourth argument)
+    add             r3d, r3d            ; double it before using it as a store offset
+    lea             r6, [r3 * 3]        ; r6 = 3 * doubled dstStride
+
+    ; rows 0-7
+    PROCESS_P2S_32x8_AVX512
+
+    ; the macro leaves r0/r2 on its fifth row; skip 4 more to reach row 8
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    ; rows 8-15
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+; filterPixelToShort_32x24, AVX-512: three expansions of
+; PROCESS_P2S_32x8_AVX512 (8 rows each).
+; r0 = src, r1 = srcStride, r2 = dst (loaded by cglobal); dstStride is read
+; from r3m by hand.  Scratch: r3, r5, r6, m0-m4.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x24, 3, 7, 5
+    vpbroadcastd    m4, [pw_2000]       ; constant the macro subtracts from every shifted row
+    lea             r5, [r1 * 3]        ; r5 = 3 * srcStride (address of row 3)
+    mov             r3d, r3m            ; fetch dstStride (fourth argument)
+    add             r3d, r3d            ; double it before using it as a store offset
+    lea             r6, [r3 * 3]        ; r6 = 3 * doubled dstStride
+
+    ; rows 0-7
+    PROCESS_P2S_32x8_AVX512
+
+    ; remaining 16 rows: the macro leaves r0/r2 on its fifth row, so each
+    ; further block first skips 4 more rows and then converts 8
+%rep 2
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512
+%endrep
+    RET
+
+; filterPixelToShort_32x32, AVX-512: four expansions of
+; PROCESS_P2S_32x8_AVX512 (8 rows each).
+; r0 = src, r1 = srcStride, r2 = dst (loaded by cglobal); dstStride is read
+; from r3m by hand.  Scratch: r3, r5, r6, m0-m4.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x32, 3, 7, 5
+    vpbroadcastd    m4, [pw_2000]       ; constant the macro subtracts from every shifted row
+    lea             r5, [r1 * 3]        ; r5 = 3 * srcStride (address of row 3)
+    mov             r3d, r3m            ; fetch dstStride (fourth argument)
+    add             r3d, r3d            ; double it before using it as a store offset
+    lea             r6, [r3 * 3]        ; r6 = 3 * doubled dstStride
+
+    ; rows 0-7
+    PROCESS_P2S_32x8_AVX512
+
+    ; remaining 24 rows: the macro leaves r0/r2 on its fifth row, so each
+    ; further block first skips 4 more rows and then converts 8
+%rep 3
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512
+%endrep
+    RET
+
+; filterPixelToShort_32x48, AVX-512: six expansions of
+; PROCESS_P2S_32x8_AVX512 (8 rows each).
+; r0 = src, r1 = srcStride, r2 = dst (loaded by cglobal); dstStride is read
+; from r3m by hand.  Scratch: r3, r5, r6, m0-m4.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x48, 3, 7, 5
+    vpbroadcastd    m4, [pw_2000]       ; constant the macro subtracts from every shifted row
+    lea             r5, [r1 * 3]        ; r5 = 3 * srcStride (address of row 3)
+    mov             r3d, r3m            ; fetch dstStride (fourth argument)
+    add             r3d, r3d            ; double it before using it as a store offset
+    lea             r6, [r3 * 3]        ; r6 = 3 * doubled dstStride
+
+    ; rows 0-7
+    PROCESS_P2S_32x8_AVX512
+
+    ; remaining 40 rows: the macro leaves r0/r2 on its fifth row, so each
+    ; further block first skips 4 more rows and then converts 8
+%rep 5
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512
+%endrep
+    RET
+
+; filterPixelToShort_32x64, AVX-512: eight expansions of
+; PROCESS_P2S_32x8_AVX512 (8 rows each).
+; r0 = src, r1 = srcStride, r2 = dst (loaded by cglobal); dstStride is read
+; from r3m by hand.  Scratch: r3, r5, r6, m0-m4.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x64, 3, 7, 5
+    vpbroadcastd    m4, [pw_2000]       ; constant the macro subtracts from every shifted row
+    lea             r5, [r1 * 3]        ; r5 = 3 * srcStride (address of row 3)
+    mov             r3d, r3m            ; fetch dstStride (fourth argument)
+    add             r3d, r3d            ; double it before using it as a store offset
+    lea             r6, [r3 * 3]        ; r6 = 3 * doubled dstStride
+
+    ; rows 0-7
+    PROCESS_P2S_32x8_AVX512
+
+    ; remaining 56 rows: the macro leaves r0/r2 on its fifth row, so each
+    ; further block first skips 4 more rows and then converts 8
+%rep 7
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512
+%endrep
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
%macro P2S_H_64xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_64x%1, 3, 7, 6
More information about the x265-devel
mailing list