[x265] [PATCH 3 of 3] asm: chroma_hpp[2x16] for i422 - improved 595c->500c
aasaipriya at multicorewareinc.com
aasaipriya at multicorewareinc.com
Wed Apr 15 11:11:29 CEST 2015
# HG changeset patch
# User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
# Date 1429089028 -19800
# Wed Apr 15 14:40:28 2015 +0530
# Node ID 32f1d60a0e163c371234a89fc88efaca2b1a20cb
# Parent ad71befbc5eb57cecd53b1a1d16421a9d9863a6f
asm: chroma_hpp[2x16] for i422 - improved 595c->500c
diff -r ad71befbc5eb -r 32f1d60a0e16 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 15 14:39:25 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 15 14:40:28 2015 +0530
@@ -2129,6 +2129,7 @@
//i422 for chroma_hpp
p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = x265_interp_4tap_horiz_pp_12x32_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = x265_interp_4tap_horiz_pp_24x64_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = x265_interp_4tap_horiz_pp_2x16_avx2;
if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
p.findPosLast = x265_findPosLast_x64;
diff -r ad71befbc5eb -r 32f1d60a0e16 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Apr 15 14:39:25 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Apr 15 14:40:28 2015 +0530
@@ -21903,3 +21903,90 @@
jnz .loop
RET
+
+INIT_YMM avx2 ; x86inc: build the following function for AVX2 (m# = ymm registers, xm# = their low 128 bits)
+cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6 ; horizontal filter, 16 rows, one 2-byte result per row; r0 = src, r1 = src stride, r2 = dst, r3 = dst stride
+ mov r4d, r4m ; r4 = fifth argument, used below as the index into tab_ChromaCoeff
+; Broadcast the selected 4-byte table entry to every dword of m0.
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff] ; PIC: take the table address first, then index off it
+ vpbroadcastd m0, [r5 + r4 * 4] ; m0 = entry r4 (4 bytes per entry) replicated
+%else
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] ; m0 = entry r4 (4 bytes per entry) replicated
+%endif
+; Constants and source pointer setup; fully unrolled below as two passes of 8 rows.
+ mova m4, [interp4_hpp_shuf] ; m4 = byte-shuffle mask (defined elsewhere) applied to every loaded row pair
+ mova m5, [pw_1] ; m5 = pmaddwd multiplier; presumably words of 1 so pmaddwd acts as a pairwise add - confirm
+ dec r0 ; start reading one byte before src
+ lea r4, [r1 * 3] ; r4 is reused: now 3 * src stride
+ movq xm1, [r0] ; row 0 -> low 8 bytes of xm1 (8 bytes are read from each row)
+ movhps xm1, [r0 + r1] ; row 1 -> high 8 bytes of xm1
+ movq xm2, [r0 + r1 * 2] ; row 2
+ movhps xm2, [r0 + r4] ; row 3
+ vinserti128 m1, m1, xm2, 1 ; m1 = rows 0,1 (low lane) | rows 2,3 (high lane)
+ lea r0, [r0 + r1 * 4] ; advance src by 4 rows
+ movq xm3, [r0] ; row 4
+ movhps xm3, [r0 + r1] ; row 5
+ movq xm2, [r0 + r1 * 2] ; row 6
+ movhps xm2, [r0 + r4] ; row 7
+ vinserti128 m3, m3, xm2, 1 ; m3 = rows 4,5 (low lane) | rows 6,7 (high lane)
+; Filter rows 0-7: shuffle, multiply-accumulate with m0, then narrow dword -> word -> byte.
+ pshufb m1, m4 ; in-lane byte shuffle according to m4
+ pshufb m3, m4
+ pmaddubsw m1, m0 ; unsigned source bytes * signed bytes of m0, adjacent products summed into words
+ pmaddubsw m3, m0
+ pmaddwd m1, m5 ; combine adjacent words into one dword per result
+ pmaddwd m3, m5
+ packssdw m1, m3 ; per lane: 4 words from m1 followed by 4 words from m3
+ pmulhrsw m1, [pw_512] ; rounding scale; assuming pw_512 holds words of 512 this is (x + 32) >> 6 - confirm
+ vextracti128 xm2, m1, 1 ; xm2 = high lane (results of rows 2,3 and 6,7)
+ packuswb xm1, xm2 ; saturate to bytes; words 0-1 = rows 0,1; 2-3 = rows 4,5; 4-5 = rows 2,3; 6-7 = rows 6,7
+; Store rows 0-7, two bytes per row; word indices follow the lane interleave noted above.
+ lea r4, [r3 * 3] ; r4 is reused again: now 3 * dst stride
+ pextrw [r2], xm1, 0 ; dst row 0
+ pextrw [r2 + r3], xm1, 1 ; dst row 1
+ pextrw [r2 + r3 * 2], xm1, 4 ; dst row 2
+ pextrw [r2 + r4], xm1, 5 ; dst row 3
+ lea r2, [r2 + r3 * 4] ; advance dst by 4 rows
+ pextrw [r2], xm1, 2 ; dst row 4
+ pextrw [r2 + r3], xm1, 3 ; dst row 5
+ pextrw [r2 + r3 * 2], xm1, 6 ; dst row 6
+ pextrw [r2 + r4], xm1, 7 ; dst row 7
+ lea r2, [r2 + r3 * 4] ; dst now points at row 8
+ lea r0, [r0 + r1 * 4] ; src now points at row 8
+; Second pass: same sequence for rows 8-15.
+ lea r4, [r1 * 3] ; r4 back to 3 * src stride
+ movq xm1, [r0] ; row 8
+ movhps xm1, [r0 + r1] ; row 9
+ movq xm2, [r0 + r1 * 2] ; row 10
+ movhps xm2, [r0 + r4] ; row 11
+ vinserti128 m1, m1, xm2, 1 ; m1 = rows 8,9 (low lane) | rows 10,11 (high lane)
+ lea r0, [r0 + r1 * 4] ; advance src by 4 rows
+ movq xm3, [r0] ; row 12
+ movhps xm3, [r0 + r1] ; row 13
+ movq xm2, [r0 + r1 * 2] ; row 14
+ movhps xm2, [r0 + r4] ; row 15
+ vinserti128 m3, m3, xm2, 1 ; m3 = rows 12,13 (low lane) | rows 14,15 (high lane)
+; Filter rows 8-15 (identical arithmetic to the first pass).
+ pshufb m1, m4 ; in-lane byte shuffle according to m4
+ pshufb m3, m4
+ pmaddubsw m1, m0 ; unsigned source bytes * signed bytes of m0, adjacent products summed into words
+ pmaddubsw m3, m0
+ pmaddwd m1, m5 ; combine adjacent words into one dword per result
+ pmaddwd m3, m5
+ packssdw m1, m3 ; per lane: 4 words from m1 followed by 4 words from m3
+ pmulhrsw m1, [pw_512] ; same rounding scale as in the first pass
+ vextracti128 xm2, m1, 1 ; xm2 = high lane (results of rows 10,11 and 14,15)
+ packuswb xm1, xm2 ; saturate to bytes; words 0-1 = rows 8,9; 2-3 = rows 12,13; 4-5 = rows 10,11; 6-7 = rows 14,15
+; Store rows 8-15, two bytes per row.
+ lea r4, [r3 * 3] ; r4 = 3 * dst stride
+ pextrw [r2], xm1, 0 ; dst row 8
+ pextrw [r2 + r3], xm1, 1 ; dst row 9
+ pextrw [r2 + r3 * 2], xm1, 4 ; dst row 10
+ pextrw [r2 + r4], xm1, 5 ; dst row 11
+ lea r2, [r2 + r3 * 4] ; advance dst by 4 rows
+ pextrw [r2], xm1, 2 ; dst row 12
+ pextrw [r2 + r3], xm1, 3 ; dst row 13
+ pextrw [r2 + r3 * 2], xm1, 6 ; dst row 14
+ pextrw [r2 + r4], xm1, 7 ; dst row 15
+ RET
More information about the x265-devel
mailing list