[x265] [PATCH] asm: interp_4tap_vert_X[32xN] avx2 10bit code for i420
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Tue Jun 9 13:37:34 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1433840941 -19800
# Tue Jun 09 14:39:01 2015 +0530
# Node ID 5994f794ff36e881e3f992e78b2167a0e4ad4768
# Parent b252468dde7ffca57da27575388d95ce538945d2
asm: interp_4tap_vert_X[32xN] avx2 10bit code for i420
avx2:
chroma_vpp[32x32] 17.04x 7760.78 132244.98
chroma_vps[32x32] 11.50x 5777.58 66423.06
chroma_vsp[32x32] 17.02x 8268.52 140768.66
chroma_vss[32x32] 16.44x 6397.38 105184.93
sse4:
chroma_vpp[32x32] 6.77x 20222.99 136859.22
chroma_vps[32x32] 5.08x 21605.75 109672.22
chroma_vsp[32x32] 7.03x 20796.80 146263.52
chroma_vss[32x32] 6.09x 17754.47 108065.09
diff -r b252468dde7f -r 5994f794ff36 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 09 14:39:01 2015 +0530
@@ -1734,6 +1734,22 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = x265_interp_4tap_vert_sp_16x12_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = x265_interp_4tap_vert_sp_16x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = x265_interp_4tap_vert_pp_32x8_avx2; // vpp: 4:2:0 chroma 32xN vertical 4-tap, AVX2 (heights 8/16/24/32)
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = x265_interp_4tap_vert_pp_32x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = x265_interp_4tap_vert_pp_32x24_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = x265_interp_4tap_vert_ps_32x8_avx2; // vps: same block sizes, "ps" kernels
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = x265_interp_4tap_vert_ps_32x24_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2; // vss: same block sizes, "ss" kernels
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = x265_interp_4tap_vert_ss_32x24_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = x265_interp_4tap_vert_ss_32x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = x265_interp_4tap_vert_sp_32x8_avx2; // vsp: same block sizes, "sp" kernels
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = x265_interp_4tap_vert_sp_32x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = x265_interp_4tap_vert_sp_32x24_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = x265_scanPosLast_avx2_bmi2;
diff -r b252468dde7f -r 5994f794ff36 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Tue Jun 09 14:39:01 2015 +0530
@@ -4920,6 +4920,202 @@
FILTER_VER_CHROMA_W16_16xN_avx2 16, sp, 8
FILTER_VER_CHROMA_W16_16xN_avx2 32, sp, 8
+%macro PROCESS_CHROMA_VERT_W32_2R 0 ; 4-tap vertical sums for 2 output rows x 2 vectors. In: r0 = src, r1 = stride in bytes, r5 = coefficient vectors. Out (dword sums): m0/m1 + m8/m9 = output row 0, m2/m3 + m10/m11 = output row 1. Clobbers m4-m6, m12-m14, r6
+ movu m1, [r0] ; input row 0, first vector
+ movu m3, [r0 + r1] ; input row 1, first vector
+ punpcklwd m0, m1, m3 ; interleave row 0/1 words (low half of each 128-bit lane)
+ pmaddwd m0, [r5 + 0 * mmsize] ; pairwise multiply-add with the first coefficient vector (presumably taps 0/1; table not visible here)
+ punpckhwd m1, m3 ; same for the high half of each lane
+ pmaddwd m1, [r5 + 0 * mmsize] ; m0/m1 = partial sums for output row 0
+
+ movu m9, [r0 + mmsize] ; input row 0, second vector
+ movu m11, [r0 + r1 + mmsize] ; input row 1, second vector
+ punpcklwd m8, m9, m11
+ pmaddwd m8, [r5 + 0 * mmsize]
+ punpckhwd m9, m11
+ pmaddwd m9, [r5 + 0 * mmsize] ; m8/m9 = partial sums for output row 0, second vector
+
+ movu m4, [r0 + 2 * r1] ; input row 2, first vector
+ punpcklwd m2, m3, m4 ; rows 1/2 start output row 1
+ pmaddwd m2, [r5 + 0 * mmsize]
+ punpckhwd m3, m4
+ pmaddwd m3, [r5 + 0 * mmsize] ; m2/m3 = partial sums for output row 1
+
+ movu m12, [r0 + 2 * r1 + mmsize] ; input row 2, second vector
+ punpcklwd m10, m11, m12
+ pmaddwd m10, [r5 + 0 * mmsize]
+ punpckhwd m11, m12
+ pmaddwd m11, [r5 + 0 * mmsize] ; m10/m11 = partial sums for output row 1, second vector
+
+ lea r6, [r0 + 2 * r1] ; r6 = address of input row 2, base for rows 3 and 4
+ movu m5, [r6 + r1] ; input row 3, first vector
+ punpcklwd m6, m4, m5 ; rows 2/3 complete output row 0
+ pmaddwd m6, [r5 + 1 * mmsize] ; second coefficient vector (presumably taps 2/3)
+ paddd m0, m6
+ punpckhwd m4, m5 ; overwrites the row 2 copy in m4; m4 is reloaded below
+ pmaddwd m4, [r5 + 1 * mmsize]
+ paddd m1, m4 ; m0/m1 now hold the full 4-tap sum for output row 0
+
+ movu m13, [r6 + r1 + mmsize] ; input row 3, second vector
+ punpcklwd m14, m12, m13
+ pmaddwd m14, [r5 + 1 * mmsize]
+ paddd m8, m14
+ punpckhwd m12, m13 ; overwrites the row 2 copy in m12; m12 is reloaded below
+ pmaddwd m12, [r5 + 1 * mmsize]
+ paddd m9, m12 ; m8/m9 now hold the full sum for output row 0, second vector
+
+ movu m4, [r6 + 2 * r1] ; input row 4, first vector
+ punpcklwd m6, m5, m4 ; rows 3/4 complete output row 1
+ pmaddwd m6, [r5 + 1 * mmsize]
+ paddd m2, m6
+ punpckhwd m5, m4
+ pmaddwd m5, [r5 + 1 * mmsize]
+ paddd m3, m5 ; m2/m3 now hold the full sum for output row 1
+
+ movu m12, [r6 + 2 * r1 + mmsize] ; input row 4, second vector
+ punpcklwd m14, m13, m12
+ pmaddwd m14, [r5 + 1 * mmsize]
+ paddd m10, m14
+ punpckhwd m13, m12
+ pmaddwd m13, [r5 + 1 * mmsize]
+ paddd m11, m13 ; m10/m11 now hold the full sum for output row 1, second vector
+%endmacro
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_W16_32xN_avx2 3 ; %1 = block height, %2 = variant (pp/ps/sp/ss), %3 = vector register count passed to cglobal
+INIT_YMM avx2
+%if ARCH_X86_64
+cglobal interp_4tap_vert_%2_32x%1, 5, 7, %3 ; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx; only emitted for x86-64 (uses m8-m14)
+ add r1d, r1d ; source stride doubled: elements -> bytes. NOTE(review): 32-bit add zero-extends, so a negative stride would not survive - confirm callers
+ add r3d, r3d ; destination stride doubled likewise
+ sub r0, r1 ; start one row above the addressed row
+ shl r4d, 6 ; coeffIdx * 64 = byte offset into the coefficient table
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV] ; PIC: take the table address first, then add the index
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4] ; r5 = coefficient vectors for this coeffIdx
+%endif
+ mov r4d, %1/2 ; loop counter: each iteration produces two rows
+
+%ifidn %2, pp
+ mova m7, [tab_c_32] ; pp: constant added to the sums before the >> 6
+%elifidn %2, sp
+ mova m7, [pd_524800] ; sp: constant added to the sums before the >> 10
+%elifidn %2, ps
+ mova m7, [tab_c_n32768] ; ps: constant added to the sums before the >> 2
+%endif
+
+.loopH:
+ PROCESS_CHROMA_VERT_W32_2R ; m0-m3, m8-m11 = dword filter sums for two rows
+%ifidn %2, ss
+ psrad m0, 6 ; ss: shift only, no offset and no clipping
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ psrad m8, 6
+ psrad m9, 6
+ psrad m10, 6
+ psrad m11, 6
+
+ packssdw m0, m1 ; per-lane pack restores column order after the per-lane unpacks
+ packssdw m2, m3
+ packssdw m8, m9
+ packssdw m10, m11
+%elifidn %2, ps
+ paddd m0, m7 ; ps: add offset, then >> 2; no clipping
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+ psrad m0, 2
+ psrad m1, 2
+ psrad m2, 2
+ psrad m3, 2
+ paddd m8, m7
+ paddd m9, m7
+ paddd m10, m7
+ paddd m11, m7
+ psrad m8, 2
+ psrad m9, 2
+ psrad m10, 2
+ psrad m11, 2
+
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m8, m9
+ packssdw m10, m11
+%else
+ paddd m0, m7 ; pp and sp share this path: add offset, shift, pack, clip
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+ paddd m8, m7
+ paddd m9, m7
+ paddd m10, m7
+ paddd m11, m7
+ %ifidn %2, pp
+ psrad m0, 6 ; pp shift
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+ psrad m8, 6
+ psrad m9, 6
+ psrad m10, 6
+ psrad m11, 6
+%else
+ psrad m0, 10 ; sp shift
+ psrad m1, 10
+ psrad m2, 10
+ psrad m3, 10
+ psrad m8, 10
+ psrad m9, 10
+ psrad m10, 10
+ psrad m11, 10
+%endif
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m8, m9
+ packssdw m10, m11
+ pxor m5, m5 ; zero vector passed to CLIPW2
+ CLIPW2 m0, m2, m5, [pw_pixel_max] ; presumably clamps to [0, pw_pixel_max]; CLIPW2 is defined outside this hunk
+ CLIPW2 m8, m10, m5, [pw_pixel_max]
+%endif
+
+ movu [r2], m0 ; output row 0, first vector
+ movu [r2 + r3], m2 ; output row 1, first vector
+ movu [r2 + mmsize], m8 ; output row 0, second vector
+ movu [r2 + r3 + mmsize], m10 ; output row 1, second vector
+ lea r2, [r2 + 2 * r3] ; advance dst by two rows
+ lea r0, [r0 + 2 * r1] ; advance src by two rows
+ dec r4d
+ jnz .loopH
+ RET
+%endif
+%endmacro
+ FILTER_VER_CHROMA_W16_32xN_avx2 8, pp, 15 ; pp kernels for heights 8/16/24/32; 15 vector registers (m0-m14 are used)
+ FILTER_VER_CHROMA_W16_32xN_avx2 16, pp, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 24, pp, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 32, pp, 15
+
+ FILTER_VER_CHROMA_W16_32xN_avx2 8, ps, 15 ; ps kernels, same heights
+ FILTER_VER_CHROMA_W16_32xN_avx2 16, ps, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 24, ps, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 32, ps, 15
+
+ FILTER_VER_CHROMA_W16_32xN_avx2 8, ss, 15 ; ss kernels, same heights
+ FILTER_VER_CHROMA_W16_32xN_avx2 16, ss, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 24, ss, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 32, ss, 15
+
+ FILTER_VER_CHROMA_W16_32xN_avx2 8, sp, 15 ; sp kernels, same heights
+ FILTER_VER_CHROMA_W16_32xN_avx2 16, sp, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 24, sp, 15
+ FILTER_VER_CHROMA_W16_32xN_avx2 32, sp, 15
+
INIT_XMM sse2
cglobal chroma_p2s, 3, 7, 3
More information about the x265-devel
mailing list