[x265] [PATCH] asm: avx2 code for weight_pp() for 10 bpp
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Wed Jun 17 11:28:06 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy<sumalatha at multicorewareinc.com>
# Date 1434533281 -19800
# Wed Jun 17 14:58:01 2015 +0530
# Node ID 855feadb8cb588ec8333c2984cb6e0dbb4d09260
# Parent be0ed447922cc81e809d296e75424bb71822aea7
asm: avx2 code for weight_pp() for 10 bpp
sse4
weight_pp 9.37x 6768.87 63435.43
avx2
weight_pp 16.45x 4187.86 68871.50
diff -r be0ed447922c -r 855feadb8cb5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 16 11:15:03 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jun 17 14:58:01 2015 +0530
@@ -1467,7 +1467,7 @@
p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
- // p.weight_pp = PFX(weight_pp_avx2); fails tests
+ p.weight_pp = PFX(weight_pp_avx2);
p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r be0ed447922c -r 855feadb8cb5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jun 16 11:15:03 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Jun 17 14:58:01 2015 +0530
@@ -1280,7 +1280,57 @@
%endif ; end of (HIGH_BIT_DEPTH == 0)
-
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2                      ; HIGH_BIT_DEPTH weight_pp: weights 16-bit pixels, 16 per iteration, clamps to pw_1023
+cglobal weight_pp, 6, 7, 7         ; by usage: r0=src, r1=dst, r2=stride, r3=width, r4=height, r5=w0, r6m=round, r7m=shift, r8m=offset
+    shl          r5d, 4            ; r5d = w0 << 4
+    mov          r6d, r6m          ; r6d = round
+    shl          r6d, 16
+    or           r6d, r5d          ; r6d = (round << 16) | (w0 << 4); assumes (w0<<4) and round each fit in 16 bits
+; AVX2 vpbroadcastd needs an xmm or m32 source (the GPR-source form is AVX-512 only), so stage through xm0
+    movd         xm0, r6d
+    vpbroadcastd m0, xm0           ; m0 = word pair [w0<<4, round] replicated in every dword
+    movd         xm1, r7m          ; xm1 = shift count consumed by psrad
+    vpbroadcastd m2, r8m           ; m2 = offset replicated in every dword
+    mova         m5, [pw_1]        ; m5 = constant words paired with each pixel so pmaddwd adds round
+    mova         m6, [pw_1023]     ; m6 = upper clamp applied to every output word
+    add          r2d, r2d          ; stride: pixels -> bytes (2 bytes per pixel)
+    add          r3d, r3d          ; width: pixels -> bytes
+    sub          r2d, r3d          ; r2 = bytes from the end of one row to the start of the next
+    shr          r3d, 5            ; r3d = 32-byte (16-pixel) chunks per row
+; NOTE: width must be a non-zero multiple of 16 - there is no tail handling and a zero chunk count would wrap
+.loopH:
+    mov          r5d, r3d          ; r5d = chunks left in the current row
+; per chunk: dst[x] = min(((src[x] * (w0<<4) + round) >> shift) + offset, clamp)
+.loopW:
+    movu         m4, [r0]          ; 16 source pixels
+    punpcklwd    m3, m4, m5        ; low 4 pixels of each 128-bit lane -> [pixel, 1] word pairs
+    pmaddwd      m3, m0            ; pixel * (w0<<4) + 1 * round, as signed dwords
+    psrad        m3, xm1           ; arithmetic >> shift
+    paddd        m3, m2            ; + offset
+; same computation for the high 4 pixels of each 128-bit lane
+    punpckhwd    m4, m5
+    pmaddwd      m4, m0
+    psrad        m4, xm1
+    paddd        m4, m2
+; pack is per-lane like the unpacks above, so the original pixel order is restored
+    packusdw     m3, m4            ; dwords -> words with unsigned saturation
+    pminuw       m3, m6            ; clamp to the maximum pixel value
+    movu         [r1], m3
+; advance both pointers by one 32-byte chunk
+    add          r0, 32
+    add          r1, 32
+; next chunk of this row
+    dec          r5d
+    jnz          .loopW
+; row finished: skip the remainder of the stride on both buffers
+    lea          r0, [r0 + r2]
+    lea          r1, [r1 + r2]
+; next row
+    dec          r4d
+    jnz          .loopH
+    RET
+%else
INIT_YMM avx2
cglobal weight_pp, 6, 7, 6
@@ -1329,7 +1379,7 @@
dec r4d
jnz .loopH
RET
-
+%endif
;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel mailing list