[x265] [PATCH 232 of 307] x86: AVX512 optimise interp_4tap_vert_pp_16xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:50 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512377610 -19800
# Mon Dec 04 14:23:30 2017 +0530
# Node ID 3e8615bc86537e07754a1c023ade702a837042a8
# Parent 465682e66d91ecf207feae78c33e32f0eaaf45c4
x86: AVX512 optimise interp_4tap_vert_pp_16xN
Performance for i444:
Size | AVX2 performance | AVX512 performance
----------------------------------------------
16x4 | 26.22x | 32.07x
16x12 | 30.95x | 40.01x
diff -r 465682e66d91 -r 3e8615bc8653 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 04 12:33:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 14:23:30 2017 +0530
@@ -4893,9 +4893,11 @@
p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
@@ -4927,6 +4929,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
@@ -4960,7 +4963,9 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
diff -r 465682e66d91 -r 3e8615bc8653 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Dec 04 12:33:32 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Mon Dec 04 14:23:30 2017 +0530
@@ -10866,96 +10866,50 @@
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vpp code start
;-------------------------------------------------------------------------------------------------------------
-%macro PROCESS_CHROMA_VERT_PP_16x8_AVX512 0
+%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
+ lea r5, [r0 + 4 * r1]
movu xm1, [r0]
- lea r8, [r0 + 4 * r1]
- lea r9, [r8 + 2 * r1]
- vinserti32x4 m1, [r0 + 2 * r1], 1
- vinserti32x4 m1, [r8], 2
- vinserti32x4 m1, [r9], 3
movu xm3, [r0 + r1]
- vinserti32x4 m3, [r0 + r6], 1
- vinserti32x4 m3, [r8 + r1], 2
- vinserti32x4 m3, [r9 + r1], 3
+ vinserti32x4 m1, [r0 + r1], 1
+ vinserti32x4 m3, [r0 + 2 * r1], 1
+ vinserti32x4 m1, [r0 + 2 * r1], 2
+ vinserti32x4 m3, [r0 + r6], 2
+ vinserti32x4 m1, [r0 + r6], 3
+ vinserti32x4 m3, [r0 + 4 * r1], 3
+
punpcklbw m0, m1, m3
pmaddubsw m0, m8
punpckhbw m1, m3
pmaddubsw m1, m8
movu xm4, [r0 + 2 * r1]
- vinserti32x4 m4, [r0 + 4 * r1], 1
- vinserti32x4 m4, [r8 + 2 * r1], 2
- vinserti32x4 m4, [r9 + 2 * r1], 3
- punpcklbw m2, m3, m4
- pmaddubsw m2, m8
- punpckhbw m3, m4
- pmaddubsw m3, m8
-
movu xm5, [r0 + r6]
- vinserti32x4 m5, [r8 + r1], 1
- vinserti32x4 m5, [r8 + r6], 2
- vinserti32x4 m5, [r9 + r6], 3
- punpcklbw m6, m4, m5
- pmaddubsw m6, m9
- paddw m0, m6
+ vinserti32x4 m4, [r0 + r6], 1
+ vinserti32x4 m5, [r5], 1
+ vinserti32x4 m4, [r5], 2
+ vinserti32x4 m5, [r5 + r1], 2
+ vinserti32x4 m4, [r5 + r1], 3
+ vinserti32x4 m5, [r5 + 2 * r1], 3
+
+ punpcklbw m3, m4, m5
+ pmaddubsw m3, m9
punpckhbw m4, m5
pmaddubsw m4, m9
+
+ paddw m0, m3
paddw m1, m4
-
- movu xm4, [r0 + 4 * r1]
- vinserti32x4 m4, [r8 + 2 * r1], 1
- vinserti32x4 m4, [r8 + 4 * r1], 2
- vinserti32x4 m4, [r9 + 4 * r1], 3
- punpcklbw m6, m5, m4
- pmaddubsw m6, m9
- paddw m2, m6
- punpckhbw m5, m4
- pmaddubsw m5, m9
- paddw m3, m5
-
pmulhrsw m0, m7
pmulhrsw m1, m7
- pmulhrsw m2, m7
- pmulhrsw m3, m7
-
packuswb m0, m1
- packuswb m2, m3
movu [r2], xm0
- movu [r2 + r3], xm2
- vextracti32x4 [r2 + 2 * r3], m0, 1
- vextracti32x4 [r2 + r7], m2, 1
- lea r2, [r2 + 4 * r3]
- vextracti32x4 [r2], m0, 2
- vextracti32x4 [r2 + r3], m2, 2
- vextracti32x4 [r2 + 2 * r3], m0, 3
- vextracti32x4 [r2 + r7], m2, 3
+ vextracti32x4 [r2 + r3], m0, 1
+ vextracti32x4 [r2 + 2 * r3], m0, 2
+ vextracti32x4 [r2 + r7], m0, 3
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
-%if ARCH_X86_64
-INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_16x8, 4, 10, 10
- mov r4d, r4m
- shl r4d, 7
- sub r0, r1
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffVer_32_avx512]
- mova m8, [r5 + r4]
- mova m9, [r5 + r4 + mmsize]
-%else
- mova m8, [tab_ChromaCoeffVer_32_avx512 + r4]
- mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
-%endif
- vbroadcasti32x8 m7, [pw_512]
- lea r6, [3 * r1]
- lea r7, [3 * r3]
- PROCESS_CHROMA_VERT_PP_16x8_AVX512
- RET
-%endif
-
%macro FILTER_VER_PP_CHROMA_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_pp_16x%1, 4, 10, 10
@@ -10975,20 +10929,23 @@
lea r6, [3 * r1]
lea r7, [3 * r3]
-%rep %1/8 - 1
- PROCESS_CHROMA_VERT_PP_16x8_AVX512
- lea r0, [r8 + 4 * r1]
+%rep %1/4 - 1
+ PROCESS_CHROMA_VERT_PP_16x4_AVX512
+ lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
%endrep
- PROCESS_CHROMA_VERT_PP_16x8_AVX512
+ PROCESS_CHROMA_VERT_PP_16x4_AVX512
RET
%endmacro
%if ARCH_X86_64
-FILTER_VER_PP_CHROMA_16xN_AVX512 16
-FILTER_VER_PP_CHROMA_16xN_AVX512 24
-FILTER_VER_PP_CHROMA_16xN_AVX512 32
-FILTER_VER_PP_CHROMA_16xN_AVX512 64
+ FILTER_VER_PP_CHROMA_16xN_AVX512 4
+ FILTER_VER_PP_CHROMA_16xN_AVX512 8
+ FILTER_VER_PP_CHROMA_16xN_AVX512 12
+ FILTER_VER_PP_CHROMA_16xN_AVX512 16
+ FILTER_VER_PP_CHROMA_16xN_AVX512 24
+ FILTER_VER_PP_CHROMA_16xN_AVX512 32
+ FILTER_VER_PP_CHROMA_16xN_AVX512 64
%endif
%macro PROCESS_CHROMA_VERT_PP_32x4_AVX512 0
More information about the x265-devel
mailing list