[x265] [PATCH 246 of 307] x86: AVX512 interp_4tap_vert_ps_16xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:04 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1512627824 -19800
# Thu Dec 07 11:53:44 2017 +0530
# Node ID 862e490b929710720e0066335c95a6e8951b2194
# Parent c335a7ca4304001e245dea7977cde1c2e0c0a8ee
x86: AVX512 interp_4tap_vert_ps_16xN
AVX512 kernels that measured slower than their AVX2 counterparts (see the tables below) are left disabled, i.e. commented out in asm-primitives.cpp.
i420
Size | AVX2 performance | AVX512 performance
----------------------------------------------
16x4 | 28.72x | 29.44x
16x8 | 29.46x | 33.70x
16x12 | 29.53x | 30.64x
16x16 | 33.21x | 31.39x
16x32 | 32.57x | 33.48x
i422
Size | AVX2 performance | AVX512 performance
----------------------------------------------
16x8 | 29.99x | 28.65x
16x16 | 32.81x | 31.16x
16x24 | 36.03x | 33.30x
16x32 | 33.62x | 32.78x
16x64 | 34.77x | 32.14x
i444
Size | AVX2 performance | AVX512 performance
----------------------------------------------
16x4 | 28.04x | 30.08x
16x8 | 31.86x | 33.69x
16x12 | 32.99x | 31.16x
16x16 | 32.89x | 29.70x
16x32 | 32.85x | 33.23x
16x64 | 31.80x | 32.18x
diff -r c335a7ca4304 -r 862e490b9297 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 05 12:01:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 11:53:44 2017 +0530
@@ -5175,6 +5175,25 @@
p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
+ //p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
+
+ /*p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx512);*/
+
+ //p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
+ //p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
+
}
#endif
}
diff -r c335a7ca4304 -r 862e490b9297 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Dec 05 12:01:18 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Dec 07 11:53:44 2017 +0530
@@ -10869,7 +10869,7 @@
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vpp and chroma_vps code start
;-------------------------------------------------------------------------------------------------------------
-%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_16x4_AVX512 1
lea r5, [r0 + 4 * r1]
movu xm1, [r0]
movu xm3, [r0 + r1]
@@ -10901,6 +10901,7 @@
paddw m0, m3
paddw m1, m4
+%ifidn %1,pp
pmulhrsw m0, m7
pmulhrsw m1, m7
packuswb m0, m1
@@ -10908,14 +10909,28 @@
vextracti32x4 [r2 + r3], m0, 1
vextracti32x4 [r2 + 2 * r3], m0, 2
vextracti32x4 [r2 + r7], m0, 3
+%else
+ psubw m0, m7
+ psubw m1, m7
+ mova m2, m10
+ mova m3, m11
+
+ vpermi2q m2, m0, m1
+ vpermi2q m3, m0, m1
+
+ movu [r2], ym2
+ vextracti32x8 [r2 + r3], m2, 1
+ movu [r2 + 2 * r3], ym3
+ vextracti32x8 [r2 + r7], m3, 1
+%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_PP_CHROMA_16xN_AVX512 1
+%macro FILTER_VERT_CHROMA_16xN_AVX512 2
INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_16x%1, 4, 10, 10
+cglobal interp_4tap_vert_%1_16x%2, 4, 10, 12
mov r4d, r4m
shl r4d, 7
sub r0, r1
@@ -10928,29 +10943,44 @@
mova m8, [tab_ChromaCoeffVer_32_avx512 + r4]
mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif
+
+%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
+%else
+ add r3d, r3d
+ vbroadcasti32x8 m7, [pw_2000]
+ mova m10, [interp4_vps_store1_avx512]
+ mova m11, [interp4_vps_store2_avx512]
+%endif
lea r6, [3 * r1]
lea r7, [3 * r3]
-%rep %1/4 - 1
- PROCESS_CHROMA_VERT_PP_16x4_AVX512
+%rep %2/4 - 1
+ PROCESS_CHROMA_VERT_16x4_AVX512 %1
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
%endrep
- PROCESS_CHROMA_VERT_PP_16x4_AVX512
+ PROCESS_CHROMA_VERT_16x4_AVX512 %1
RET
%endmacro
%if ARCH_X86_64
- FILTER_VER_PP_CHROMA_16xN_AVX512 4
- FILTER_VER_PP_CHROMA_16xN_AVX512 8
- FILTER_VER_PP_CHROMA_16xN_AVX512 12
- FILTER_VER_PP_CHROMA_16xN_AVX512 16
- FILTER_VER_PP_CHROMA_16xN_AVX512 24
- FILTER_VER_PP_CHROMA_16xN_AVX512 32
- FILTER_VER_PP_CHROMA_16xN_AVX512 64
-%endif
-
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 4
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 8
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 12
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 16
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 24
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 32
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 64
+
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 4
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 8
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 12
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 16
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 24
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 32
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 64
+%endif
%macro PROCESS_CHROMA_VERT_32x4_AVX512 1
movu ym1, [r0]
movu ym3, [r0 + r1]
More information about the x265-devel
mailing list