[x265] [PATCH 032 of 307] x86: AVX512 convert_p2s 48x64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:30 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500447343 -19800
# Wed Jul 19 12:25:43 2017 +0530
# Node ID 97d5ab44b6da2db69584875c2dde97aef5533d9b
# Parent 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
x86: AVX512 convert_p2s 48x64
AVX2 performance : 2.22x
AVX512 performance: 3.01x
diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 12:25:43 2017 +0530
@@ -3841,6 +3841,7 @@
p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+ p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Jul 19 12:25:43 2017 +0530
@@ -3047,6 +3047,115 @@
jnz .loop
RET
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+; PROCESS_P2S_48x8_AVX512
+; Converts a tile of 8 rows x 48 columns from 8-bit pixels to 16-bit words:
+; every byte is zero-extended, shifted left by 6 and has the word lanes of m4
+; subtracted from it.
+;   in : r0 = source pointer        r1 = source row offset
+;        r2 = destination pointer   r3 = destination row offset (used as bytes)
+;        r5 / r6 = source / destination offset of the 4th row of a group
+;                  (the caller sets them to 3 * r1 and 3 * r3)
+;        m4 = value subtracted from every 16-bit lane
+;   out: r0 and r2 have moved down by FOUR rows only (start of the second row
+;        group); the caller must add another four rows before the next tile.
+;   clobbers: m0-m3
+; The 8 rows are handled as two identical groups of 4 rows; %rep emits exactly
+; the same instruction stream as writing both groups out by hand.
+%macro PROCESS_P2S_48x8_AVX512 0
+%assign %%group 0
+%rep 2
+%if %%group
+    ; second group: step both pointers down to rows 4-7
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+
+%endif
+    ; columns 0-31 of four rows: 32 source bytes -> one full vector of words
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2], m0
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r6], m3
+
+    ; columns 32-47 of the same rows: 16 source bytes -> half-width (ym) vector,
+    ; stored 64 bytes (32 words) into the destination row
+    pmovzxbw    ym0, [r0 + 32]
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    movu        [r2 + 64], ym0
+    movu        [r2 + r3 + 64], ym1
+    movu        [r2 + r3 * 2 + 64], ym2
+    movu        [r2 + r6 + 64], ym3
+%assign %%group %%group + 1
+%endrep
+%endmacro
+
+; filterPixelToShort_48x64, AVX-512 version.
+; Converts a 48x64 block of 8-bit pixels at r0 (row offset r1) into 16-bit
+; words at r2: each output word is (pixel << 6) minus the broadcast pw_2000
+; constant. The work is done by eight PROCESS_P2S_48x8_AVX512 tiles of 8 rows.
+; Register roles while the tiles run:
+;   r0 = src, r1 = srcStride, r2 = dst, r3 = 2 * dstStride (byte offset per row),
+;   r5 = 3 * r1, r6 = 3 * r3, m4 = broadcast constant, m0-m3 = scratch.
+INIT_ZMM avx512
+cglobal filterPixelToShort_48x64, 3,7,5
+    mov         r3d, r3m                    ; dstStride is fetched by hand (only 3 args are auto-loaded)
+    add         r3d, r3d                    ; doubled: destination elements are 2 bytes wide
+    lea         r5, [r1 * 3]                ; source offset of the 4th row in a group
+    lea         r6, [r3 * 3]                ; destination offset of the 4th row in a group
+
+    ; load constant
+    ; It has to live in m4: that is the register the tile macro subtracts, and
+    ; it is the last of the five vector registers (m0-m4) declared by cglobal.
+    vpbroadcastd m4, [pw_2000]
+
+    ; The macro leaves r0/r2 at the start of its second 4-row group, so another
+    ; 4-row step is needed between consecutive 8-row tiles.
+    PROCESS_P2S_48x8_AVX512                 ; rows  0- 7
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows  8-15
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows 16-23
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows 24-31
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows 32-39
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows 40-47
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows 48-55
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                 ; rows 56-63
+    RET
%macro PROCESS_LUMA_W4_4R 0
movd m0, [r0]
More information about the x265-devel
mailing list