[x265] [PATCH 049 of 307] x86: AVX512 convert_p2s_48x64 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:47 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501043664 -19800
# Wed Jul 26 10:04:24 2017 +0530
# Node ID a75dd880817adddafac5e1105e512ea79c7a089b
# Parent b4c2149e9bb1119857363094492b50e85593fb74
x86: AVX512 convert_p2s_48x64 for high bit depth
AVX2 performance : 9.77x
AVX512 performance : 14.64x
diff -r b4c2149e9bb1 -r a75dd880817a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 25 18:50:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 26 10:04:24 2017 +0530
@@ -2239,6 +2239,7 @@
p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+ p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
diff -r b4c2149e9bb1 -r a75dd880817a source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Jul 25 18:50:51 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Wed Jul 26 10:04:24 2017 +0530
@@ -416,6 +416,79 @@
movu [r2 + r4], m3
%endmacro
+%macro P2S_48x8_AVX512 0 ; pixel-to-short for 8 rows: dst = (src << (14 - BIT_DEPTH)) - m4; each row is handled as one full m-register part plus one ymm part at byte offset mmsize; clobbers m0-m3, r0, r2
+ movu m0, [r0] ; rows 0-3: first mmsize bytes of each source row
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r5] ; caller must preload r5 = r1 * 3
+ psllw m0, (14 - BIT_DEPTH) ; scale every 16-bit sample up by (14 - BIT_DEPTH) bits
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4 ; subtract the per-word constant the caller keeps in m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [r2], m0 ; store to the matching destination rows
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3 ; caller must preload r4 = r3 * 3
+
+ movu ym0, [r0 + mmsize] ; rows 0-3: ymm-wide remainder of each row, starting at byte offset mmsize
+ movu ym1, [r0 + r1 + mmsize]
+ movu ym2, [r0 + r1 * 2 + mmsize]
+ movu ym3, [r0 + r5 + mmsize]
+ psllw ym0, (14 - BIT_DEPTH)
+ psllw ym1, (14 - BIT_DEPTH)
+ psllw ym2, (14 - BIT_DEPTH)
+ psllw ym3, (14 - BIT_DEPTH)
+ psubw ym0, ym4 ; ym4 is the low ymm part of the same m4 constant
+ psubw ym1, ym4
+ psubw ym2, ym4
+ psubw ym3, ym4
+ movu [r2 + mmsize], ym0
+ movu [r2 + r3 + mmsize], ym1
+ movu [r2 + r3 * 2 + mmsize], ym2
+ movu [r2 + r4 + mmsize], ym3
+
+ lea r0, [r0 + r1 * 4] ; step the source pointer down 4 rows
+ lea r2, [r2 + r3 * 4] ; step the destination pointer down 4 rows
+
+ movu m0, [r0] ; rows 4-7: same load/shift/subtract/store sequence as rows 0-3
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r5]
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ movu [r2], m0
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3
+
+ movu ym0, [r0 + mmsize] ; rows 4-7: ymm-wide remainder of each row
+ movu ym1, [r0 + r1 + mmsize]
+ movu ym2, [r0 + r1 * 2 + mmsize]
+ movu ym3, [r0 + r5 + mmsize]
+ psllw ym0, (14 - BIT_DEPTH)
+ psllw ym1, (14 - BIT_DEPTH)
+ psllw ym2, (14 - BIT_DEPTH)
+ psllw ym3, (14 - BIT_DEPTH)
+ psubw ym0, ym4
+ psubw ym1, ym4
+ psubw ym2, ym4
+ psubw ym3, ym4
+ movu [r2 + mmsize], ym0
+ movu [r2 + r3 + mmsize], ym1
+ movu [r2 + r3 * 2 + mmsize], ym2
+ movu [r2 + r4 + mmsize], ym3 ; on exit r0/r2 still point at row 4 of this group; the caller must advance the remaining 4 rows
+%endmacro
+
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
@@ -640,6 +713,39 @@
lea r2, [r2 + r3 * 4]
P2S_32x8_AVX512
RET
+
+INIT_ZMM avx512 ; select zmm-wide m registers and the avx512 name suffix for the function below
+cglobal filterPixelToShort_48x64, 4, 6, 5 ; 4 args (r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride), 6 GPRs, 5 vector registers
+ add r1d, r1d ; double srcStride (presumably elements -> bytes for 16-bit pixels); NOTE(review): a 32-bit add zeroes the upper half of r1 on x86-64, so this assumes a non-negative stride - confirm
+ add r3d, r3d ; double dstStride the same way; same non-negative assumption
+ lea r4, [r3 * 3] ; r4 = 3 * dst stride, used by the macro for the 4th row of each group
+ lea r5, [r1 * 3] ; r5 = 3 * src stride, used by the macro for the 4th row of each group
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000] ; replicate the 32 bytes at pw_2000 into both halves of m4; this is the value subtracted from every sample (presumably 0x2000 per word - confirm against the constant's definition)
+ P2S_48x8_AVX512 ; rows 0-7
+ lea r0, [r0 + r1 * 4] ; the macro advanced only 4 of its 8 rows; skip the other 4
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 8-15
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 16-23
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 24-31
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 32-39
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 40-47
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 48-55
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_48x8_AVX512 ; rows 56-63; no pointer advance needed after the final group
+ RET
;-----------------------------------------------------------------------------------------------------------------------------
;p2s avx512 code end
;-----------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list