[x265] [PATCH 169 of 307] x86: AVX512 interp_4tap_vert_sp_32xN and interp_4tap_vert_ss_32xN for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:32:47 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1510659366 -19800
# Tue Nov 14 17:06:06 2017 +0530
# Node ID a6c12a9c8cba58df74e482e580840984991b31c9
# Parent f1a6372b01dbb342ce82f23f5e37d71b3126ac06
x86: AVX512 interp_4tap_vert_sp_32xN and interp_4tap_vert_ss_32xN for high bit depth
i444 chroma_vsp
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x8 | 22.89x | 40.73x
32x16 | 23.63x | 42.78x
32x24 | 23.87x | 40.49x
32x32 | 23.96x | 44.02x
32x64 | 24.28x | 41.92x
i444 chroma_vss
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x8 | 24.49x | 36.64x
32x16 | 25.25x | 38.51x
32x24 | 26.40x | 39.21x
32x32 | 26.45x | 39.68x
32x64 | 26.80x | 40.26x
diff -r f1a6372b01db -r a6c12a9c8cba source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Nov 14 13:49:11 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 14 17:06:06 2017 +0530
@@ -2656,6 +2656,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512);
+
p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
@@ -2666,6 +2667,16 @@
p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
@@ -2693,6 +2704,14 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
@@ -2720,6 +2739,14 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
diff -r f1a6372b01db -r a6c12a9c8cba source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Nov 14 13:49:11 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Tue Nov 14 17:06:06 2017 +0530
@@ -8272,6 +8272,105 @@
FILTER_VER_PS_CHROMA_64xN_AVX512 64
%endif
+;-----------------------------------------------------------------------------------------------------------------
+; PROCESS_CHROMA_VERT_S_32x2_AVX512 <ss|sp>
+; Produces two output rows of 32 16-bit samples with the 4-tap vertical filter.
+;
+; In:    r0       = first of the five consecutive source rows that are read
+;        r1, r3   = source / destination stride in bytes
+;        r2       = destination of the first output row
+;        m9       = coefficient words multiplied with the interleaved row pair (n, n+1)
+;        m10      = coefficient words multiplied with the interleaved row pair (n+2, n+3)
+;        sp only:   m7 = rounding offset (dwords), m8 = zero, m11 = maximum pixel value (words)
+; Out:   rows stored to [r2] and [r2 + r3]; r0 advanced by two source rows
+; Clobb: m0-m6
+;-----------------------------------------------------------------------------------------------------------------
+%macro PROCESS_CHROMA_VERT_S_32x2_AVX512 1
+    ; rows 0/1 -> first tap pair of output row 0
+    movu                  m1,                 [r0]
+    movu                  m3,                 [r0 + r1]
+    punpcklwd             m0,                 m1,                     m3
+    pmaddwd               m0,                 m9
+    punpckhwd             m1,                 m3
+    pmaddwd               m1,                 m9
+
+    ; rows 1/2 -> first tap pair of output row 1
+    movu                  m4,                 [r0 + 2 * r1]
+    punpcklwd             m2,                 m3,                     m4
+    pmaddwd               m2,                 m9
+    punpckhwd             m3,                 m4
+    pmaddwd               m3,                 m9
+
+    ; rows 2/3 -> second tap pair, accumulated into output row 0
+    lea                   r0,                 [r0 + 2 * r1]
+    movu                  m5,                 [r0 + r1]
+    punpcklwd             m6,                 m4,                     m5
+    pmaddwd               m6,                 m10
+    paddd                 m0,                 m6
+    punpckhwd             m4,                 m5
+    pmaddwd               m4,                 m10
+    paddd                 m1,                 m4
+
+    ; rows 3/4 -> second tap pair, accumulated into output row 1
+    movu                  m4,                 [r0 + 2 * r1]
+    punpcklwd             m6,                 m5,                     m4
+    pmaddwd               m6,                 m10
+    paddd                 m2,                 m6
+    punpckhwd             m5,                 m4
+    pmaddwd               m5,                 m10
+    paddd                 m3,                 m5
+
+%ifidn %1,sp
+    ; sp: add the rounding offset, then scale down by INTERP_SHIFT_SP
+    paddd                 m0,                 m7
+    paddd                 m1,                 m7
+    paddd                 m2,                 m7
+    paddd                 m3,                 m7
+    psrad                 m0,                 INTERP_SHIFT_SP
+    psrad                 m1,                 INTERP_SHIFT_SP
+    psrad                 m2,                 INTERP_SHIFT_SP
+    psrad                 m3,                 INTERP_SHIFT_SP
+%else
+    ; ss: plain arithmetic shift, no rounding offset
+    psrad                 m0,                 6
+    psrad                 m1,                 6
+    psrad                 m2,                 6
+    psrad                 m3,                 6
+%endif
+    packssdw              m0,                 m1
+    packssdw              m2,                 m3
+%ifidn %1,sp
+    ; sp output is pixel data: packssdw only saturates to the int16 range, so
+    ; filter overshoot/undershoot must still be clamped to [0, pixel max].
+    pmaxsw                m0,                 m8
+    pmaxsw                m2,                 m8
+    pminsw                m0,                 m11
+    pminsw                m2,                 m11
+%endif
+    movu                  [r2],               m0
+    movu                  [r2 + r3],          m2
+%endmacro
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; FILTER_VER_S_CHROMA_32xN_AVX512 <ss|sp>, <height>
+; Emits interp_4tap_vert_<ss|sp>_32x<height>; <height> must be even (two rows per step).
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx
+; Note: strides are doubled with 32-bit adds (upper halves are zeroed), so they are
+;       treated as non-negative element counts converted to byte offsets.
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_S_CHROMA_32xN_AVX512 2
+INIT_ZMM avx512
+cglobal interp_4tap_vert_%1_32x%2, 5, 7, 12
+    add                   r1d,                r1d                     ; strides: 16-bit samples -> bytes
+    add                   r3d,                r3d
+    sub                   r0,                 r1                      ; start one row above the output row
+    shl                   r4d,                7                       ; 128 bytes (two zmm vectors) per coeffIdx
+
+%ifdef PIC
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
+    lea                   r5,                 [r5 + r4]
+%else
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
+%endif
+
+%ifidn %1, sp
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
+    pxor                  m8,                 m8                      ; lower clamp bound (0)
+    ; NOTE(review): pw_pixel_max is assumed to be the shared (1 << BIT_DEPTH) - 1 word
+    ; constant already declared (cextern) in this file - confirm.
+    vbroadcasti32x4       m11,                [pw_pixel_max]          ; upper clamp bound
+%endif
+    mova                  m9,                 [r5]
+    mova                  m10,                [r5 + mmsize]
+
+%rep %2/2 - 1
+    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
+    lea                   r2,                 [r2 + 2 * r3]
+%endrep
+    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
+    RET
+%endmacro
+
+; The 32xN kernels reference vector registers above m7 (m9, m10), so they are
+; only assembled for 64-bit targets. One ss and one sp kernel per block height.
+%if ARCH_X86_64
+    ; 32x8
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss,8
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp,8
+    ; 32x16
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss,16
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp,16
+    ; 32x24
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss,24
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp,24
+    ; 32x32
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss,32
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp,32
+    ; 32x48
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss,48
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp,48
+    ; 32x64
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss,64
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp,64
+%endif
+
%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
movu m1, [r0]
lea r6, [r0 + 2 * r1]
More information about the x265-devel
mailing list