[x265] [PATCH 030 of 307] x86: AVX512 convert_p2s 64xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:28 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499858302 -19800
# Wed Jul 12 16:48:22 2017 +0530
# Node ID a77082ebfa67b40f3dbb8cd45b54c17e710a104c
# Parent 576a93cba7d189fddba3466a21188f0ece3ed278
x86: AVX512 convert_p2s 64xN
Size | AVX2 performance | AVX512 performance
------------------------------------------------
64x16 | 2.05x | 3.77x
64x32 | 2.16x | 3.88x
64x48 | 2.13x | 3.91x
64x64 | 2.16x | 4.00x
diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530
@@ -3832,6 +3832,11 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
+ p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512);
+ p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
+ p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
+ p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+
}
#endif
}
diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530
@@ -2269,6 +2269,186 @@
P2S_H_64xN_avx2 48
;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+; Helper for PROCESS_P2S_64x8_AVX512: convert two rows of pixels to 16-bit values.
+;   %1, %2 = source address expressions of the first and second row
+;   %3, %4 = destination address expressions of the first and second row
+; Per row, two loads of mmsize/2 bytes are zero-extended to words, shifted left
+; by 6 and reduced by m4, then written as two full-register stores.
+; Clobbers m0-m3. m4 is read only and must be set up by the caller.
+%macro P2S_64x2_AVX512 4
+ pmovzxbw m0, [%1]
+ pmovzxbw m1, [%1 + mmsize/2]
+ pmovzxbw m2, [%2]
+ pmovzxbw m3, [%2 + mmsize/2]
+
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [%3], m0
+ movu [%3 + mmsize], m1
+ movu [%4], m2
+ movu [%4 + mmsize], m3
+%endmacro
+
+; Convert 8 consecutive rows starting at r0 (source) / r2 (destination).
+; Register contract, as established by the callers in this file:
+;   r1 = source stride, r5 = r1 * 3
+;   r3 = destination stride in bytes, r6 = r3 * 3
+;   m4 = word constant subtracted from every shifted pixel
+; Side effect: r0 and r2 are advanced by 4 rows (once, between the two halves),
+; so a caller chaining invocations must advance them by another 4 rows.
+%macro PROCESS_P2S_64x8_AVX512 0
+ ; rows 0-3
+ P2S_64x2_AVX512 r0, r0 + r1, r2, r2 + r3
+ P2S_64x2_AVX512 r0 + r1 * 2, r0 + r5, r2 + r3 * 2, r2 + r6
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+
+ ; rows 4-7
+ P2S_64x2_AVX512 r0, r0 + r1, r2, r2 + r3
+ P2S_64x2_AVX512 r0 + r1 * 2, r0 + r5, r2 + r3 * 2, r2 + r6
+%endmacro
+
+; filterPixelToShort_64x64 (AVX-512): 64 rows handled as 8 groups of 8 rows.
+;   r0 = src, r1 = srcStride, r2 = dst; dstStride is fetched from r3m below
+;   because only three arguments are auto-loaded by cglobal.
+; Uses m0-m3 as scratch (inside the macro) and m4 for the constant.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x64, 3, 7, 5
+ mov r3d, r3m ; r3 = dstStride (4th argument)
+ add r3d, r3d ; doubled: dst elements are int16_t, so stride becomes bytes
+ lea r5, [r1 * 3] ; r5 = 3 * srcStride, used by the macro for every 4th row
+ lea r6, [r3 * 3] ; r6 = 3 * dst stride in bytes
+
+ ; m4 = pw_2000 replicated across the register; the macro subtracts it per word
+ vpbroadcastd m4, [pw_2000]
+
+ ; Each macro call leaves r0/r2 four rows into the 8 rows it processed,
+ ; so step another four rows before the next group.
+%rep 7
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_64x8_AVX512 ; final group: no pointer advance needed
+ RET
+
+; filterPixelToShort_64x48 (AVX-512): 48 rows handled as 6 groups of 8 rows.
+;   r0 = src, r1 = srcStride, r2 = dst; dstStride is fetched from r3m below
+;   because only three arguments are auto-loaded by cglobal.
+; The constant must live in m4: PROCESS_P2S_64x8_AVX512 subtracts m4 from every
+; word. Only m0-m4 are touched, hence 5 vector registers are declared.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x48, 3, 7, 5
+ mov r3d, r3m ; r3 = dstStride (4th argument)
+ add r3d, r3d ; doubled: dst elements are int16_t, so stride becomes bytes
+ lea r5, [r1 * 3] ; r5 = 3 * srcStride, used by the macro
+ lea r6, [r3 * 3] ; r6 = 3 * dst stride in bytes, used by the macro
+
+ ; load constant into the register the macro actually reads
+ vpbroadcastd m4, [pw_2000]
+
+ ; Each macro call leaves r0/r2 four rows into the 8 rows it processed,
+ ; so step another four rows before the next group.
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ RET
+
+; filterPixelToShort_64x32 (AVX-512): 32 rows handled as 4 groups of 8 rows.
+;   r0 = src, r1 = srcStride, r2 = dst; dstStride is fetched from r3m below
+;   because only three arguments are auto-loaded by cglobal.
+; The constant must live in m4: PROCESS_P2S_64x8_AVX512 subtracts m4 from every
+; word. Only m0-m4 are touched, hence 5 vector registers are declared.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x32, 3, 7, 5
+ mov r3d, r3m ; r3 = dstStride (4th argument)
+ add r3d, r3d ; doubled: dst elements are int16_t, so stride becomes bytes
+ lea r5, [r1 * 3] ; r5 = 3 * srcStride, used by the macro
+ lea r6, [r3 * 3] ; r6 = 3 * dst stride in bytes, used by the macro
+
+ ; load constant into the register the macro actually reads
+ vpbroadcastd m4, [pw_2000]
+
+ ; Each macro call leaves r0/r2 four rows into the 8 rows it processed,
+ ; so step another four rows before the next group.
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ RET
+
+; filterPixelToShort_64x16 (AVX-512): 16 rows handled as 2 groups of 8 rows.
+;   r0 = src, r1 = srcStride, r2 = dst; dstStride is fetched from r3m below
+;   because only three arguments are auto-loaded by cglobal.
+; The constant must live in m4: PROCESS_P2S_64x8_AVX512 subtracts m4 from every
+; word. Only m0-m4 are touched, hence 5 vector registers are declared.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x16, 3, 7, 5
+ mov r3d, r3m ; r3 = dstStride (4th argument)
+ add r3d, r3d ; doubled: dst elements are int16_t, so stride becomes bytes
+ lea r5, [r1 * 3] ; r5 = 3 * srcStride, used by the macro
+ lea r6, [r3 * 3] ; r6 = 3 * dst stride in bytes, used by the macro
+
+ ; load constant into the register the macro actually reads
+ vpbroadcastd m4, [pw_2000]
+
+ ; The macro leaves r0/r2 four rows into the 8 rows it processed,
+ ; so step another four rows before the second group.
+ PROCESS_P2S_64x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_64x8_AVX512
+ RET
+
+;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_12xN 1
diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/ipfilter8.h Wed Jul 12 16:48:22 2017 +0530
@@ -45,5 +45,6 @@
SETUP_FUNC_DEF(sse3);
SETUP_FUNC_DEF(sse4);
SETUP_FUNC_DEF(avx2);
+SETUP_FUNC_DEF(avx512);
#endif // ifndef X265_IPFILTER8_H
More information about the x265-devel
mailing list