[x265] [PATCH] asm: filter_vpp, filter_vps for 4x32 in avx2
Divya Manivannan
divya at multicorewareinc.com
Thu May 14 07:18:51 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1431579857 -19800
# Thu May 14 10:34:17 2015 +0530
# Node ID 68ebb4bdda79543be758a31e4308e3f4e23ff274
# Parent 479087422e29a672d6e9bc8d0cd2a65649d71fe2
asm: filter_vpp, filter_vps for 4x32 in avx2
filter_vpp[4x32]: 1564c->1172c
filter_vps[4x32]: 1283c->1035c
diff -r 479087422e29 -r 68ebb4bdda79 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Thu May 14 10:34:17 2015 +0530
@@ -2683,6 +2683,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = x265_interp_4tap_vert_ps_16x24_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vps = x265_interp_4tap_vert_ps_2x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = x265_interp_4tap_vert_ps_4x32_avx2;
//i444 for chroma_vps
p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
@@ -2729,6 +2730,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = x265_interp_4tap_vert_pp_16x24_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = x265_interp_4tap_vert_pp_2x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vpp = x265_interp_4tap_vert_pp_4x32_avx2;
//i444 for chroma_vpp
p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
diff -r 479087422e29 -r 68ebb4bdda79 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/ipfilter8.asm Thu May 14 10:34:17 2015 +0530
@@ -5698,10 +5698,10 @@
FILTER_VER_CHROMA_AVX2_4x8 pp
FILTER_VER_CHROMA_AVX2_4x8 ps
-%macro FILTER_VER_CHROMA_AVX2_4x16 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_vert_%1_4x16, 4, 6, 9
+%macro FILTER_VER_CHROMA_AVX2_4xN 2
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12
mov r4d, r4m
shl r4d, 6
sub r0, r1
@@ -5714,7 +5714,16 @@
%endif
lea r4, [r1 * 3]
-
+ mova m10, [r5]
+ mova m11, [r5 + mmsize]
+%ifidn %1,pp
+ mova m9, [pw_512]
+%else
+ add r3d, r3d
+ mova m9, [pw_2000]
+%endif
+ lea r5, [r3 * 3]
+%rep %2 / 16
movd xm1, [r0]
pinsrd xm1, [r0 + r1], 1
pinsrd xm1, [r0 + r1 * 2], 2
@@ -5762,29 +5771,27 @@
pshufb m6, m6, m5
pshufb m7, m7, m5
pshufb m8, m8, m5
- pmaddubsw m0, [r5]
- pmaddubsw m6, [r5]
- pmaddubsw m7, [r5]
- pmaddubsw m8, [r5]
- pmaddubsw m1, [r5 + mmsize]
- pmaddubsw m2, [r5 + mmsize]
- pmaddubsw m3, [r5 + mmsize]
- pmaddubsw m4, [r5 + mmsize]
+ pmaddubsw m0, m10
+ pmaddubsw m6, m10
+ pmaddubsw m7, m10
+ pmaddubsw m8, m10
+ pmaddubsw m1, m11
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ pmaddubsw m4, m11
paddw m0, m1 ; m0 = WORD ROW[3 2 1 0]
paddw m6, m2 ; m6 = WORD ROW[7 6 5 4]
paddw m7, m3 ; m7 = WORD ROW[11 10 9 8]
paddw m8, m4 ; m8 = WORD ROW[15 14 13 12]
%ifidn %1,pp
- mova m5, [pw_512]
- pmulhrsw m0, m5
- pmulhrsw m6, m5
- pmulhrsw m7, m5
- pmulhrsw m8, m5
+ pmulhrsw m0, m9
+ pmulhrsw m6, m9
+ pmulhrsw m7, m9
+ pmulhrsw m8, m9
packuswb m0, m6
packuswb m7, m8
vextracti128 xm1, m0, 1
vextracti128 xm2, m7, 1
- lea r5, [r3 * 3]
movd [r2], xm0
pextrd [r2 + r3], xm0, 1
movd [r2 + r3 * 2], xm1
@@ -5805,17 +5812,14 @@
pextrd [r2 + r3 * 2], xm2, 2
pextrd [r2 + r5], xm2, 3
%else
- add r3d, r3d
- mova m5, [pw_2000]
- psubw m0, m5
- psubw m6, m5
- psubw m7, m5
- psubw m8, m5
+ psubw m0, m9
+ psubw m6, m9
+ psubw m7, m9
+ psubw m8, m9
vextracti128 xm1, m0, 1
vextracti128 xm2, m6, 1
vextracti128 xm3, m7, 1
vextracti128 xm4, m8, 1
- lea r5, [r3 * 3]
movq [r2], xm0
movhps [r2 + r3], xm0
movq [r2 + r3 * 2], xm1
@@ -5836,12 +5840,16 @@
movq [r2 + r3 * 2], xm4
movhps [r2 + r5], xm4
%endif
- RET
-%endif
-%endmacro
-
- FILTER_VER_CHROMA_AVX2_4x16 pp
- FILTER_VER_CHROMA_AVX2_4x16 ps
+ lea r2, [r2 + r3 * 4]
+%endrep
+ RET
+%endif
+%endmacro
+
+ FILTER_VER_CHROMA_AVX2_4xN pp, 16
+ FILTER_VER_CHROMA_AVX2_4xN ps, 16
+ FILTER_VER_CHROMA_AVX2_4xN pp, 32
+ FILTER_VER_CHROMA_AVX2_4xN ps, 32
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
More information about the x265-devel mailing list