[x265] [PATCH] asm: avx2 code for addavg[24x64] for 10 bpp (5584 -> 4052)
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Tue May 26 06:57:58 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1432616269 -19800
# Tue May 26 10:27:49 2015 +0530
# Node ID 9162e9b4edc45c9333889dbd0b8d90374dc32921
# Parent 1404801f5874bb421b57b1e46c84ad888f5250a2
asm: avx2 code for addavg[24x64] for 10 bpp (5584 -> 4052)
sse4:
[i422] addAvg[24x64] 10.09x 5584.70 56358.54
avx2:
[i422] addAvg[24x64] 13.89x 4052.17 56287.46
diff -r 1404801f5874 -r 9162e9b4edc4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 25 15:50:39 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue May 26 10:27:49 2015 +0530
@@ -1249,6 +1249,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = x265_addAvg_16x24_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = x265_addAvg_16x8_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = x265_addAvg_8x64_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = x265_addAvg_24x64_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 1404801f5874 -r 9162e9b4edc4 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon May 25 15:50:39 2015 +0530
+++ b/source/common/x86/mc-a.asm Tue May 26 10:27:49 2015 +0530
@@ -1304,6 +1304,62 @@
jnz .loop
RET
+cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride ; r0..r5 map to the named args in order (x86inc cglobal convention); r6 is the loop counter
+ mova m4, [pw_512] ; m4 = pw_512 constant (defined outside this hunk; presumably packed words of 512) - added after the rounding multiply
+ mova m5, [pw_1023] ; m5 = pw_1023 constant (defined outside this hunk) - upper clamp bound for pminsw
+ paddw m3, m4, m4 ; m3 = m4 + m4 per word - multiplier for pmulhrsw (1024 if pw_512 is 512, i.e. a rounded >> 5)
+ pxor m1, m1 ; m1 = 0 - lower clamp bound for pmaxsw
+ add r3, r3 ; double iStride0 - presumably element stride -> byte stride for 16-bit samples (TODO confirm)
+ add r4, r4 ; double iStride1 (same assumption)
+ add r5, r5 ; double iDstStride (same assumption)
+
+ mov r6d, 32 ; 32 iterations, two rows each = 64 rows
+
+.loop:
+ movu m0, [r0] ; row 0, first vector of pSrc0 (32 bytes assuming m# = ymm; INIT_YMM is not visible in this hunk)
+ movu m2, [r1] ; row 0, first vector of pSrc1
+ paddw m0, m2 ; per-word sum of the two sources
+ pmulhrsw m0, m3 ; rounded high multiply by m3: (x * m3 + 0x4000) >> 15
+ paddw m0, m4 ; add the pw_512 offset
+ pmaxsw m0, m1 ; clamp below at 0
+ pminsw m0, m5 ; clamp above at pw_1023
+ movu [r2], m0 ; store row 0, first vector
+
+ movu xm0, [r0 + 32] ; row 0, remaining 16 bytes at byte offset 32 (xmm-width access)
+ movu xm2, [r1 + 32]
+ paddw xm0, xm2 ; same add / scale / offset / clamp sequence on the 128-bit tail
+ pmulhrsw xm0, xm3
+ paddw xm0, xm4
+ pmaxsw xm0, xm1
+ pminsw xm0, xm5
+ movu [r2 + 32], xm0 ; store row 0 tail
+
+ movu m0, [r0 + r3] ; row 1 (one stride further), first vector of pSrc0
+ movu m2, [r1 + r4] ; row 1, first vector of pSrc1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ movu [r2 + r5], m0 ; store row 1, first vector
+
+ movu xm2, [r0 + r3 + 32] ; row 1 tail; register roles are swapped here (xm2 = pSrc0, xm0 = pSrc1), result is the same
+ movu xm0, [r1 + r4 + 32]
+ paddw xm2, xm0
+ pmulhrsw xm2, xm3
+ paddw xm2, xm4
+ pmaxsw xm2, xm1
+ pminsw xm2, xm5
+ movu [r2 + r5 + 32], xm2 ; store row 1 tail
+
+ lea r2, [r2 + 2 * r5] ; advance destination by two rows
+ lea r0, [r0 + 2 * r3] ; advance pSrc0 by two rows
+ lea r1, [r1 + 2 * r4] ; advance pSrc1 by two rows
+
+ dec r6d ; one row pair done
+ jnz .loop ; repeat until all 32 pairs are processed
+ RET
+
%macro ADDAVG_W32_H2_AVX2 1
cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_512]
More information about the x265-devel
mailing list