[x265] [PATCH] asm: filter_vpp, filter_vps for 2x16 in avx2
Divya Manivannan
divya at multicorewareinc.com
Tue May 12 06:42:58 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1431405304 -19800
# Tue May 12 10:05:04 2015 +0530
# Node ID d4ff48ee1f104aef8a3dd7b18f98a83bc8a52f0e
# Parent f2081ef64fd27dfd3a5bec92ee1a835a74061761
asm: filter_vpp, filter_vps for 2x16 in avx2
filter_vpp[2x16]: 859c->528c
filter_vps[2x16]: 724c->452c
diff -r f2081ef64fd2 -r d4ff48ee1f10 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 11 18:50:03 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue May 12 10:05:04 2015 +0530
@@ -2622,6 +2622,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vps = x265_interp_4tap_vert_ps_8x12_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = x265_interp_4tap_vert_ps_16x24_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vps = x265_interp_4tap_vert_ps_2x16_avx2;
//i444 for chroma_vps
p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
@@ -2667,6 +2668,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vpp = x265_interp_4tap_vert_pp_8x12_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = x265_interp_4tap_vert_pp_16x24_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = x265_interp_4tap_vert_pp_2x16_avx2;
//i444 for chroma_vpp
p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
diff -r f2081ef64fd2 -r d4ff48ee1f10 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon May 11 18:50:03 2015 -0500
+++ b/source/common/x86/ipfilter8.asm Tue May 12 10:05:04 2015 +0530
@@ -4412,6 +4412,123 @@
FILTER_VER_CHROMA_AVX2_2x8 pp
FILTER_VER_CHROMA_AVX2_2x8 ps
+%macro FILTER_VER_CHROMA_AVX2_2x16 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_2x16, 4, 6, 3 ; 4-tap vertical chroma filter for a 2-wide, 16-row block; %1 = pp or ps selects the output stage below
+ mov r4d, r4m ; 5th argument; presumably coeffIdx, with r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride as in the 2x8 prototype - TODO confirm
+ shl r4d, 6 ; index * 64 bytes: each table entry is read as two mmsize (32-byte) rows below
+ sub r0, r1 ; back up one source row so the first load is the row above the block
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffVer_32] ; PIC build: take the table base first, then add the offset separately
+ add r5, r4 ; r5 = address of the selected coefficient entry
+%else
+ lea r5, [tab_ChromaCoeffVer_32 + r4] ; r5 = address of the selected coefficient entry
+%endif
+
+ lea r4, [r1 * 3] ; r4 = 3 * r1, used to address every fourth row
+
+ movd xm1, [r0] ; input row 0 (rows counted from the backed-up r0); word 1 is overwritten by the next insert
+ pinsrw xm1, [r0 + r1], 1 ; each word holds one 2-byte row: row 1
+ pinsrw xm1, [r0 + r1 * 2], 2 ; row 2
+ pinsrw xm1, [r0 + r4], 3 ; row 3
+ lea r0, [r0 + r1 * 4] ; advance to row 4
+ pinsrw xm1, [r0], 4 ; row 4
+ pinsrw xm1, [r0 + r1], 5 ; row 5
+ pinsrw xm1, [r0 + r1 * 2], 6 ; row 6
+ pinsrw xm1, [r0 + r4], 7 ; row 7 -> xm1 = rows 0..7
+ lea r0, [r0 + r1 * 4] ; advance to row 8
+ pinsrw xm0, [r0], 4 ; rows 8..11 go into the high qword of xm0; its low qword is not initialised and is discarded below
+ pinsrw xm0, [r0 + r1], 5 ; row 9
+ pinsrw xm0, [r0 + r1 * 2], 6 ; row 10
+ pinsrw xm0, [r0 + r4], 7 ; row 11
+ punpckhqdq xm0, xm1, xm0 ; xm0 = high qwords of both: rows 4..7 | rows 8..11
+ vinserti128 m1, m1, xm0, 1 ; m1: low lane = rows 0..7, high lane = rows 4..11
+
+ pshufb m2, m1, [interp_vert_shuf] ; byte shuffle from interp_vert_shuf (table not shown here); presumably pairs bytes for the first two taps - TODO confirm
+ pshufb m1, [interp_vert_shuf + 32] ; second 32-byte shuffle mask; presumably pairs bytes for the last two taps - TODO confirm
+ pmaddubsw m2, [r5] ; multiply-add byte pairs with the first 32-byte coefficient row
+ pmaddubsw m1, [r5 + 1 * mmsize] ; multiply-add byte pairs with the second 32-byte coefficient row
+ paddw m2, m1 ; m2 = 16-bit sums that are stored as output rows 0..7 below
+
+ lea r0, [r0 + r1 * 4] ; advance to row 12
+ pinsrw xm1, [r0], 4 ; rows 12..15 go into the high qword of xm1
+ pinsrw xm1, [r0 + r1], 5 ; row 13
+ pinsrw xm1, [r0 + r1 * 2], 6 ; row 14
+ pinsrw xm1, [r0 + r4], 7 ; row 15
+ punpckhqdq xm1, xm0, xm1 ; xm1 = rows 8..11 | rows 12..15
+ lea r0, [r0 + r1 * 4] ; advance to row 16
+ pinsrw xm0, [r0], 4 ; row 16
+ pinsrw xm0, [r0 + r1], 5 ; row 17
+ pinsrw xm0, [r0 + r1 * 2], 6 ; row 18 is the last row loaded; word 7 keeps its earlier contents
+ punpckhqdq xm0, xm1, xm0 ; xm0 = rows 12..15 | rows 16..18 plus the stale word
+ vinserti128 m1, m1, xm0, 1 ; m1: low lane = rows 8..15, high lane = rows 12..18
+
+ pshufb m0, m1, [interp_vert_shuf] ; same shuffle / multiply-add sequence as for the first half
+ pshufb m1, [interp_vert_shuf + 32] ; second shuffle mask
+ pmaddubsw m0, [r5] ; first coefficient row
+ pmaddubsw m1, [r5 + 1 * mmsize] ; second coefficient row
+ paddw m0, m1 ; m0 = 16-bit sums that are stored as output rows 8..15 below
+%ifidn %1,pp
+ mova m1, [pw_512] ; pp variant; pw_512 is defined elsewhere, presumably words of 512 so pmulhrsw acts as a rounded >> 6 - TODO confirm
+ pmulhrsw m2, m1 ; scale and round the sums for rows 0..7
+ pmulhrsw m0, m1 ; scale and round the sums for rows 8..15
+ packuswb m2, m0 ; saturate to unsigned bytes per 128-bit lane: low lane = rows 0..3 | 8..11, high lane = rows 4..7 | 12..15
+ lea r4, [r3 * 3] ; r4 = 3 * r3 for the destination stores
+ pextrw [r2], xm2, 0 ; one 2-byte word per output row: row 0
+ pextrw [r2 + r3], xm2, 1 ; row 1
+ pextrw [r2 + r3 * 2], xm2, 2 ; row 2
+ pextrw [r2 + r4], xm2, 3 ; row 3
+ vextracti128 xm0, m2, 1 ; xm0 = high lane: rows 4..7 | rows 12..15
+ lea r2, [r2 + r3 * 4] ; advance destination to row 4
+ pextrw [r2], xm0, 0 ; row 4
+ pextrw [r2 + r3], xm0, 1 ; row 5
+ pextrw [r2 + r3 * 2], xm0, 2 ; row 6
+ pextrw [r2 + r4], xm0, 3 ; row 7
+ lea r2, [r2 + r3 * 4] ; advance destination to row 8
+ pextrw [r2], xm2, 4 ; row 8
+ pextrw [r2 + r3], xm2, 5 ; row 9
+ pextrw [r2 + r3 * 2], xm2, 6 ; row 10
+ pextrw [r2 + r4], xm2, 7 ; row 11
+ lea r2, [r2 + r3 * 4] ; advance destination to row 12
+ pextrw [r2], xm0, 4 ; row 12
+ pextrw [r2 + r3], xm0, 5 ; row 13
+ pextrw [r2 + r3 * 2], xm0, 6 ; row 14
+ pextrw [r2 + r4], xm0, 7 ; row 15
+%else
+ add r3d, r3d ; ps variant: double r3; rows are stored as two 16-bit values, so presumably converts an element stride to bytes - TODO confirm
+ lea r4, [r3 * 3] ; r4 = 3 * (doubled) r3
+ vbroadcasti128 m1, [pw_2000] ; load 16 bytes of pw_2000 (defined elsewhere) into both lanes
+ psubw m2, m1 ; subtract the pw_2000 offset from the sums for rows 0..7
+ psubw m0, m1 ; subtract the pw_2000 offset from the sums for rows 8..15
+ vextracti128 xm1, m2, 1 ; xm1 = high lane of m2: rows 4..7
+ movd [r2], xm2 ; one dword (two 16-bit results) per output row: row 0
+ pextrd [r2 + r3], xm2, 1 ; row 1
+ pextrd [r2 + r3 * 2], xm2, 2 ; row 2
+ pextrd [r2 + r4], xm2, 3 ; row 3
+ lea r2, [r2 + r3 * 4] ; advance destination to row 4
+ movd [r2], xm1 ; row 4
+ pextrd [r2 + r3], xm1, 1 ; row 5
+ pextrd [r2 + r3 * 2], xm1, 2 ; row 6
+ pextrd [r2 + r4], xm1, 3 ; row 7
+ vextracti128 xm1, m0, 1 ; xm1 = high lane of m0: rows 12..15
+ lea r2, [r2 + r3 * 4] ; advance destination to row 8
+ movd [r2], xm0 ; row 8
+ pextrd [r2 + r3], xm0, 1 ; row 9
+ pextrd [r2 + r3 * 2], xm0, 2 ; row 10
+ pextrd [r2 + r4], xm0, 3 ; row 11
+ lea r2, [r2 + r3 * 4] ; advance destination to row 12
+ movd [r2], xm1 ; row 12
+ pextrd [r2 + r3], xm1, 1 ; row 13
+ pextrd [r2 + r3 * 2], xm1, 2 ; row 14
+ pextrd [r2 + r4], xm1, 3 ; row 15
+%endif
+ RET
+%endmacro
+
+ FILTER_VER_CHROMA_AVX2_2x16 pp ; emits interp_4tap_vert_pp_2x16 (avx2)
+ FILTER_VER_CHROMA_AVX2_2x16 ps ; emits interp_4tap_vert_ps_2x16 (avx2)
+
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list