[x265] [PATCH 5 of 5] asm: filter_hpp[2x4] in avx2: 185c->161c
Divya Manivannan
divya at multicorewareinc.com
Thu Mar 19 06:13:57 CET 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1426740596 -19800
# Thu Mar 19 10:19:56 2015 +0530
# Node ID 46bcd60c8a1527f09c69c5a97664d46f7517e9e7
# Parent ad53f152fce599c1801304a0fd1ed0c5992f834f
asm: filter_hpp[2x4] in avx2: 185c->161c
diff -r ad53f152fce5 -r 46bcd60c8a15 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 19 10:14:57 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 19 10:19:56 2015 +0530
@@ -1604,6 +1604,8 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = x265_interp_4tap_horiz_pp_32x32_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_avx2;
+
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_avx2;
diff -r ad53f152fce5 -r 46bcd60c8a15 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Mar 19 10:14:57 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Mar 19 10:19:56 2015 +0530
@@ -285,6 +285,8 @@
interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ALIGN 32
+interp4_hpp_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 ; pshufb mask, same 16-byte pattern for both 128-bit lanes: byte windows 0-3 and 1-4 of the low qword, then 8-11 and 9-12 of the high qword
ALIGN 32
interp8_hps_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
@@ -1561,6 +1563,39 @@
pextrd [r2+r0], xm3, 3
RET
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_2x4, 4, 6, 3 ; 4-tap horizontal filter producing a 2-pixel x 4-row byte block; as used below: r0 = read pointer, r1 = read row pitch, r2 = write pointer, r3 = write row pitch
+ mov r4d, r4m ; r4 = fifth argument; used below as the index into tab_ChromaCoeff
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff] ; PIC build: take the table address in a register first, then index it
+ vpbroadcastd m0, [r5 + r4 * 4] ; m0 = the 4-byte table entry r4, replicated into every dword
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] ; m0 = the 4-byte table entry r4, replicated into every dword
+%endif
+
+ dec r0 ; move the read pointer back one byte (presumably the 4-tap window starts one sample left of the output pixel)
+ lea r4, [r1 * 3] ; r4 = 3 * read pitch (the table index in r4 is no longer needed)
+ movq xm1, [r0] ; row 0: 8 bytes -> low qword of xm1
+ movhps xm1, [r0 + r1] ; row 1: 8 bytes -> high qword of xm1
+ movq xm2, [r0 + r1 * 2] ; row 2: 8 bytes -> low qword of xm2
+ movhps xm2, [r0 + r4] ; row 3: 8 bytes -> high qword of xm2
+ vinserti128 m1, m1, xm2, 1 ; m1 = rows 0-1 in the low lane, rows 2-3 in the high lane
+ pshufb m1, [interp4_hpp_shuf] ; per row, gather byte windows 0-3 and 1-4 (one window per output pixel)
+ pmaddubsw m1, m0 ; unsigned bytes of m1 times signed bytes of m0, adjacent pairs summed -> two words per window
+ pmaddwd m1, [pw_1] ; multiply words by pw_1 and sum adjacent pairs -> one dword per output pixel (pw_1 is defined elsewhere; presumably all words = 1)
+ vextracti128 xm2, m1, 1 ; xm2 = high lane (rows 2-3)
+ packssdw xm1, xm2 ; 8 dwords -> 8 signed-saturated words, rows 0..3 in order, 2 pixels each
+ pmulhrsw xm1, [pw_512] ; rounded high multiply by pw_512 (defined elsewhere; if all words = 512 this is (x + 32) >> 6)
+ packuswb xm1, xm1 ; saturate words to unsigned bytes; low 8 bytes now hold the 2x4 result
+
+ lea r4, [r3 * 3] ; r4 = 3 * write pitch
+ pextrw [r2], xm1, 0 ; store row 0 (2 bytes)
+ pextrw [r2 + r3], xm1, 1 ; store row 1 (2 bytes)
+ pextrw [r2 + r3 * 2], xm1, 2 ; store row 2 (2 bytes)
+ pextrw [r2 + r4], xm1, 3 ; store row 3 (2 bytes)
+ RET
+
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_32x32, 4,6,7
mov r4d, r4m
More information about the x265-devel
mailing list