[x265] [PATCH 233 of 307] x86: AVX512 optimise interp_4tap_vert_ss_8xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:51 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
# Date 1512380104 -19800
# Mon Dec 04 15:05:04 2017 +0530
# Node ID ae75b2d09d10f28391d573507c13512360593386
# Parent 3e8615bc86537e07754a1c023ade702a837042a8
x86: AVX512 optimise interp_4tap_vert_ss_8xN
i444 8x4
AVX2 performance : 10.61x
AVX512 performance : 18.93x
diff -r 3e8615bc8653 -r ae75b2d09d10 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 04 14:23:30 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 15:05:04 2017 +0530
@@ -4903,6 +4903,7 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
@@ -4938,7 +4939,9 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = PFX(interp_4tap_vert_ss_8x12_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx512);
@@ -4979,6 +4982,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
diff -r 3e8615bc8653 -r ae75b2d09d10 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Dec 04 14:23:30 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Mon Dec 04 15:05:04 2017 +0530
@@ -11146,78 +11146,54 @@
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vss code start
;-------------------------------------------------------------------------------------------------------------
-%macro PROCESS_CHROMA_VERT_SS_8x8_AVX512 0
+%macro PROCESS_CHROMA_VERT_SS_8x4_AVX512 0
+ lea r5, [r0 + 4 * r1]
movu xm1, [r0]
- lea r6, [r0 + 2 * r1]
- lea r8, [r0 + 4 * r1]
- lea r9, [r8 + 2 * r1]
- vinserti32x4 m1, [r6], 1
- vinserti32x4 m1, [r8], 2
- vinserti32x4 m1, [r9], 3
movu xm3, [r0 + r1]
- vinserti32x4 m3, [r6 + r1], 1
- vinserti32x4 m3, [r8 + r1], 2
- vinserti32x4 m3, [r9 + r1], 3
+ vinserti32x4 m1, [r0 + r1], 1
+ vinserti32x4 m3, [r0 + 2 * r1], 1
+ vinserti32x4 m1, [r0 + 2 * r1], 2
+ vinserti32x4 m3, [r0 + r6], 2
+ vinserti32x4 m1, [r0 + r6], 3
+ vinserti32x4 m3, [r0 + 4 * r1], 3
+
punpcklwd m0, m1, m3
pmaddwd m0, m8
punpckhwd m1, m3
pmaddwd m1, m8
movu xm4, [r0 + 2 * r1]
- vinserti32x4 m4, [r6 + 2 * r1], 1
- vinserti32x4 m4, [r8 + 2 * r1], 2
- vinserti32x4 m4, [r9 + 2 * r1], 3
- punpcklwd m2, m3, m4
- pmaddwd m2, m8
- punpckhwd m3, m4
- pmaddwd m3, m8
-
- movu xm5, [r0 + r10]
- vinserti32x4 m5, [r6 + r10], 1
- vinserti32x4 m5, [r8 + r10], 2
- vinserti32x4 m5, [r9 + r10], 3
- punpcklwd m6, m4, m5
- pmaddwd m6, m9
- paddd m0, m6
+ movu xm5, [r0 + r6]
+ vinserti32x4 m4, [r0 + r6], 1
+ vinserti32x4 m5, [r5], 1
+ vinserti32x4 m4, [r5], 2
+ vinserti32x4 m5, [r5 + r1], 2
+ vinserti32x4 m4, [r5 + r1], 3
+ vinserti32x4 m5, [r5 + 2 * r1], 3
+
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m9
punpckhwd m4, m5
pmaddwd m4, m9
+
+ paddd m0, m3
paddd m1, m4
- movu xm4, [r0 + 4 * r1]
- vinserti32x4 m4, [r6 + 4 * r1], 1
- vinserti32x4 m4, [r8 + 4 * r1], 2
- vinserti32x4 m4, [r9 + 4 * r1], 3
- punpcklwd m6, m5, m4
- pmaddwd m6, m9
- paddd m2, m6
- punpckhwd m5, m4
- pmaddwd m5, m9
- paddd m3, m5
-
psrad m0, 6
psrad m1, 6
- psrad m2, 6
- psrad m3, 6
packssdw m0, m1
- packssdw m2, m3
-
movu [r2], xm0
- movu [r2 + r3], xm2
- vextracti32x4 [r2 + 2 * r3], m0, 1
- vextracti32x4 [r2 + r7], m2, 1
- lea r2, [r2 + 4 * r3]
- vextracti32x4 [r2], m0, 2
- vextracti32x4 [r2 + r3], m2, 2
- vextracti32x4 [r2 + 2 * r3], m0, 3
- vextracti32x4 [r2 + r7], m2, 3
+ vextracti32x4 [r2 + r3], m0, 1
+ vextracti32x4 [r2 + 2 * r3], m0, 2
+ vextracti32x4 [r2 + r7], m0, 3
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
-%if ARCH_X86_64
+%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1
INIT_ZMM avx512
-cglobal interp_4tap_vert_ss_8x8, 5, 11, 10
+cglobal interp_4tap_vert_ss_8x%1, 5, 8, 10
add r1d, r1d
add r3d, r3d
sub r0, r1
@@ -11231,41 +11207,22 @@
mova m8, [r5]
mova m9, [r5 + mmsize]
%endif
- lea r10, [3 * r1]
+ lea r6, [3 * r1]
lea r7, [3 * r3]
- PROCESS_CHROMA_VERT_SS_8x8_AVX512
- RET
-%endif
-
-%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1
-INIT_ZMM avx512
-cglobal interp_4tap_vert_ss_8x%1, 5, 11, 10
- add r1d, r1d
- add r3d, r3d
- sub r0, r1
- shl r4d, 7
-%ifdef PIC
- lea r5, [pw_ChromaCoeffVer_32_avx512]
- mova m8, [r5 + r4]
- mova m9, [r5 + r4 + mmsize]
-%else
- lea r5, [pw_ChromaCoeffVer_32_avx512 + r4]
- mova m8, [r5]
- mova m9, [r5 + mmsize]
-%endif
- lea r10, [3 * r1]
- lea r7, [3 * r3]
-
-%rep %1/8 - 1
- PROCESS_CHROMA_VERT_SS_8x8_AVX512
- lea r0, [r8 + 4 * r1]
+
+%rep %1/4 - 1
+ PROCESS_CHROMA_VERT_SS_8x4_AVX512
+ lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
%endrep
- PROCESS_CHROMA_VERT_SS_8x8_AVX512
+ PROCESS_CHROMA_VERT_SS_8x4_AVX512
RET
%endmacro
%if ARCH_X86_64
+ FILTER_VER_SS_CHROMA_8xN_AVX512 4
+ FILTER_VER_SS_CHROMA_8xN_AVX512 8
+ FILTER_VER_SS_CHROMA_8xN_AVX512 12
FILTER_VER_SS_CHROMA_8xN_AVX512 16
FILTER_VER_SS_CHROMA_8xN_AVX512 32
FILTER_VER_SS_CHROMA_8xN_AVX512 64
More information about the x265-devel
mailing list