[x265] [PATCH] weight_pp avx2 asm code, improved from 8608.65 cycles to 5138.09 cycles over sse version of asm code
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Oct 16 11:20:13 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1413451199 -19800
# Node ID 858be8d7d7176ab6c6d01cf92d00c8478fe99b34
# Parent 79702581ec824a2a375aebe228d69c3930aeea96
weight_pp avx2 asm code, improved from 8608.65 cycles to 5138.09 cycles over sse version of asm code
diff -r 79702581ec82 -r 858be8d7d717 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 15 17:49:35 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 16 14:49:59 2014 +0530
@@ -1784,6 +1784,8 @@
p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx;
p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx;
+ p.weight_pp = x265_weight_pp_avx2;
+
#if X86_64
p.dct[DCT_8x8] = x265_dct8_avx2;
p.dct[DCT_16x16] = x265_dct16_avx2;
diff -r 79702581ec82 -r 858be8d7d717 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Oct 15 17:49:35 2014 -0500
+++ b/source/common/x86/pixel-util.h Thu Oct 16 14:49:59 2014 +0530
@@ -58,6 +58,7 @@
int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
diff -r 79702581ec82 -r 858be8d7d717 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Oct 15 17:49:35 2014 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Oct 16 14:49:59 2014 +0530
@@ -1375,6 +1375,60 @@
RET
+;-------------------------------------------------------------------------------------------------------------------------------------------------
+;void weight_pp(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+;-------------------------------------------------------------------------------------------------------------------------------------------------
+; dst = sat_u8(((src * (w0<<6) + round) >> shift) + offset); rows are consumed in whole 16-pixel blocks, so width must be a non-zero multiple of 16
+INIT_YMM avx2
+cglobal weight_pp, 6, 7, 6
+    mov          r6d, r6m
+    shl          r6d, 6
+    movd         xm0, r6d           ; xm0 = [w0<<6]
+    movd         xm1, r7m           ; xm1 = [round]
+    punpcklwd    xm0, xm1           ; assuming both (w0<<6) and round are using maximum of 16 bits each
+    pshufd       xm0, xm0, 0
+    vinserti128  m0, m0, xm0, 1     ; m0 = [w0<<6 round] replicated in every dword of both lanes
+
+    movd         xm1, r8m           ; xm1 = shift count for psrad
+    vpbroadcastd m2, r9m            ; m2 = [offset] in every dword
+    mova         m5, [pw_1]         ; pw_1 is defined elsewhere (presumably packed word 1s): pmaddwd then yields src*(w0<<6) + 1*round
+    movsxd       r4, r4d            ; width is an int: sign-extend it before mixing with the intptr_t strides
+    sub          r2, r4             ; r2 = srcStride - width, kept at full pointer width
+    sub          r3, r4             ; r3 = dstStride - width, kept at full pointer width
+
+.loopH:
+    mov          r6d, r4d
+    shr          r6d, 4             ; r6d = number of 16-pixel blocks per row
+.loopW:
+    pmovzxbw     m4, [r0]           ; load 16 source bytes and zero-extend them to 16 words
+
+    punpcklwd    m3, m4, m5         ; pixels 0-3 | 8-11, each paired with 1
+    pmaddwd      m3, m0
+    psrad        m3, xm1
+    paddd        m3, m2
+
+    punpckhwd    m4, m5             ; pixels 4-7 | 12-15, each paired with 1
+    pmaddwd      m4, m0
+    psrad        m4, xm1
+    paddd        m4, m2
+
+    packssdw     m3, m4             ; per-lane pack puts the 16 words back in pixel order 0..15
+    vextracti128 xm4, m3, 1
+    packuswb     xm3, xm4           ; only 128 bits are stored, so a 128-bit pack is enough
+    movu         [r1], xm3
+
+    add          r0, 16
+    add          r1, 16
+    dec          r6d
+    jnz          .loopW
+
+    lea          r0, [r0 + r2]      ; advance to the start of the next source row
+    lea          r1, [r1 + r3]      ; advance to the start of the next destination row
+    dec          r5d
+    jnz          .loopH
+
+    RET
+
;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel mailing list