[x265] [PATCH] asm: filter_hpp[2x8] in avx2: 301c->249c
Divya Manivannan
divya at multicorewareinc.com
Mon Mar 23 05:41:32 CET 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1427085460 -19800
# Mon Mar 23 10:07:40 2015 +0530
# Node ID 1ac750487fb5ad497f95510327ebac46ff5e3467
# Parent cc496665280f9e9e4776327e9a1cc1b2eeffecbc
asm: filter_hpp[2x8] in avx2: 301c->249c
diff -r cc496665280f -r 1ac750487fb5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sun Mar 22 22:16:45 2015 -0400
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 23 10:07:40 2015 +0530
@@ -1650,6 +1650,7 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_avx2;
diff -r cc496665280f -r 1ac750487fb5 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Sun Mar 22 22:16:45 2015 -0400
+++ b/source/common/x86/ipfilter8.asm Mon Mar 23 10:07:40 2015 +0530
@@ -1695,6 +1695,56 @@
pextrw [r2 + r4], xm1, 3
RET
+INIT_YMM avx2    ; x86inc: assemble the following function with ymm registers, symbol suffix _avx2
+cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6    ; 4-tap horizontal filter, 2 pixels x 8 rows. As used below: r0 = source pointer (read), r1 = source row stride, r2 = destination pointer (written), r3 = destination row stride, r4m = coefficient table index. 4 args in regs, 6 GPRs, 6 vector regs.
+ mov r4d, r4m    ; r4 = 5th argument, used only to index tab_ChromaCoeff
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]    ; PIC build: take the table address in a register before indexing it
+ vpbroadcastd m0, [r5 + r4 * 4]    ; m0 = the 4 bytes of table entry r4, replicated into every dword
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]    ; m0 = the 4 bytes of table entry r4, replicated into every dword
+%endif
+
+ mova m4, [interp4_hpp_shuf]    ; byte-shuffle mask, defined outside this hunk; presumably builds the overlapping 4-byte windows for the two output pixels of each row - TODO confirm against the table
+ mova m5, [pw_1]    ; multiplier for the pmaddwd below; presumably all words = 1 so pmaddwd only sums word pairs - defined elsewhere
+ dec r0    ; start reading one byte before the source pointer
+ lea r4, [r1 * 3]    ; r4 = 3 * source stride (the coefficient index is no longer needed)
+ movq xm1, [r0]    ; source row 0 -> low 8 bytes of xm1
+ movhps xm1, [r0 + r1]    ; source row 1 -> high 8 bytes of xm1
+ movq xm2, [r0 + r1 * 2]    ; source row 2 -> low 8 bytes of xm2
+ movhps xm2, [r0 + r4]    ; source row 3 -> high 8 bytes of xm2
+ vinserti128 m1, m1, xm2, 1    ; m1 = rows 0,1 in the low lane, rows 2,3 in the high lane
+ lea r0, [r0 + r1 * 4]    ; advance the source pointer by 4 rows
+ movq xm3, [r0]    ; source row 4 -> low 8 bytes of xm3
+ movhps xm3, [r0 + r1]    ; source row 5 -> high 8 bytes of xm3
+ movq xm2, [r0 + r1 * 2]    ; source row 6 -> low 8 bytes of xm2
+ movhps xm2, [r0 + r4]    ; source row 7 -> high 8 bytes of xm2
+ vinserti128 m3, m3, xm2, 1    ; m3 = rows 4,5 in the low lane, rows 6,7 in the high lane
+
+ pshufb m1, m4    ; rearrange the source bytes of rows 0-3 according to the shuffle mask
+ pshufb m3, m4    ; same for rows 4-7
+ pmaddubsw m1, m0    ; unsigned source bytes * signed coefficient bytes, adjacent products summed into words
+ pmaddubsw m3, m0
+ pmaddwd m1, m5    ; multiply by pw_1 and add adjacent word pairs -> one dword per 4 input bytes
+ pmaddwd m3, m5
+ packssdw m1, m3    ; narrow to words per lane: each lane holds its 4 dwords from m1 followed by its 4 dwords from m3
+ pmulhrsw m1, [pw_512]    ; rounding high multiply; if pw_512 is 512 in every word (as named) this computes (x + 32) >> 6
+ vextracti128 xm2, m1, 1    ; xm2 = high lane of m1
+ packuswb xm1, xm2    ; saturate to unsigned bytes: low lane results in bytes 0-7, high lane results in bytes 8-15
+
+ lea r4, [r3 * 3]    ; r4 = 3 * destination stride
+ pextrw [r2], xm1, 0    ; each pextrw stores one 16-bit word = the 2 output bytes of one destination row
+ pextrw [r2 + r3], xm1, 1    ; destination row 1
+ pextrw [r2 + r3 * 2], xm1, 4    ; destination row 2: words 4,5 came from the high lane, which was loaded from source rows 2,3
+ pextrw [r2 + r4], xm1, 5    ; destination row 3
+ lea r2, [r2 + r3 * 4]    ; advance the destination pointer by 4 rows
+ pextrw [r2], xm1, 2    ; destination row 4: words 2,3 are the m3 half of the low lane (source rows 4,5)
+ pextrw [r2 + r3], xm1, 3    ; destination row 5
+ pextrw [r2 + r3 * 2], xm1, 6    ; destination row 6: words 6,7 are the m3 half of the high lane (source rows 6,7)
+ pextrw [r2 + r4], xm1, 7    ; destination row 7
+ RET    ; x86inc return macro
+
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_32x32, 4,6,7
mov r4d, r4m
More information about the x265-devel
mailing list