[x265] [PATCH 245 of 307] x86: AVX512 interp_4tap_vert_ps_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:03 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1512455478 -19800
# Tue Dec 05 12:01:18 2017 +0530
# Node ID c335a7ca4304001e245dea7977cde1c2e0c0a8ee
# Parent 81a870948ac446b36c248325e0c7264cf8f3f09e
x86: AVX512 interp_4tap_vert_ps_32xN
i420
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x8 | 36.28x | 47.86x
32x16 | 40.43x | 51.57x
32x24 | 40.96x | 54.05x
32x32 | 40.12x | 54.27x
i422
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x16 | 39.84x | 51.35x
32x32 | 39.86x | 54.17x
32x48 | 41.14x | 54.85x
32x64 | 42.00x | 56.50x
i444
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x8 | 36.08x | 47.61x
32x16 | 39.96x | 51.41x
32x24 | 40.38x | 54.51x
32x32 | 40.07x | 54.56x
32x64 | 41.94x | 56.59x
diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Dec 07 15:31:54 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 05 12:01:18 2017 +0530
@@ -5158,6 +5158,23 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512);
+
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
+
}
#endif
}
diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Dec 07 15:31:54 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Dec 05 12:01:18 2017 +0530
@@ -10951,7 +10951,7 @@
FILTER_VER_PP_CHROMA_16xN_AVX512 64
%endif
-%macro PROCESS_CHROMA_VERT_PP_32x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_32x4_AVX512 1
movu ym1, [r0]
movu ym3, [r0 + r1]
vinserti32x8 m1, [r0 + 2 * r1], 1
@@ -10988,25 +10988,45 @@
pmaddubsw m5, m9
paddw m3, m5
+%ifidn %1,pp
pmulhrsw m0, m7
pmulhrsw m1, m7
pmulhrsw m2, m7
pmulhrsw m3, m7
-
packuswb m0, m1
packuswb m2, m3
movu [r2], ym0
movu [r2 + r3], ym2
vextracti32x8 [r2 + 2 * r3], m0, 1
vextracti32x8 [r2 + r7], m2, 1
+%else
+ psubw m0, m7
+ psubw m1, m7
+ psubw m2, m7
+ psubw m3, m7
+
+ mova m4, m10
+ mova m5, m11
+ vpermi2q m4, m0, m1
+ vpermi2q m5, m0, m1
+ mova m6, m10
+ mova m12, m11
+ vpermi2q m6, m2, m3
+ vpermi2q m12, m2, m3
+
+ movu [r2], m4
+ movu [r2 + r3], m6
+ movu [r2 + 2 * r3], m5
+ movu [r2 + r7], m12
+%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_PP_CHROMA_32xN_AVX512 1
+%macro FILTER_VERT_CHROMA_32xN_AVX512 2
INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_32x%1, 4, 10, 8
+cglobal interp_4tap_vert_%1_32x%2, 4, 8, 13
mov r4d, r4m
shl r4d, 7
sub r0, r1
@@ -11019,26 +11039,42 @@
mova m8, [tab_ChromaCoeffVer_32_avx512 + r4]
mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif
- vbroadcasti32x8 m7, [pw_512]
+
+%ifidn %1,pp
+ vbroadcasti32x8 m7, [pw_512]
+%else
+ add r3d, r3d
+ vbroadcasti32x8 m7, [pw_2000]
+ mova m10, [interp4_vps_store1_avx512]
+ mova m11, [interp4_vps_store2_avx512]
+%endif
+
lea r6, [3 * r1]
lea r7, [3 * r3]
-%rep %1/4 - 1
- PROCESS_CHROMA_VERT_PP_32x4_AVX512
+%rep %2/4 - 1
+ PROCESS_CHROMA_VERT_32x4_AVX512 %1
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 4 * r3]
%endrep
- PROCESS_CHROMA_VERT_PP_32x4_AVX512
+ PROCESS_CHROMA_VERT_32x4_AVX512 %1
RET
%endmacro
%if ARCH_X86_64
- FILTER_VER_PP_CHROMA_32xN_AVX512 8
- FILTER_VER_PP_CHROMA_32xN_AVX512 16
- FILTER_VER_PP_CHROMA_32xN_AVX512 24
- FILTER_VER_PP_CHROMA_32xN_AVX512 32
- FILTER_VER_PP_CHROMA_32xN_AVX512 48
- FILTER_VER_PP_CHROMA_32xN_AVX512 64
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 8
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 16
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 24
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 32
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 48
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 64
+
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 8
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 16
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 24
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 32
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 48
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 64
%endif
%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0
More information about the x265-devel mailing list