[x265] [PATCH Review Only] weight_sp: AVX2 asm code, improved from 7849.40 cycles (SSE version) to 4922.20 cycles
praveen at multicorewareinc.com
Thu Oct 23 14:19:10 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1414061274 -19800
# Node ID a19f9a6e25da7f4b8b8d2f5105eaaa19df4a6529
# Parent ce304756a6e469b94cceef930e62972bd2168e4f
weight_sp: AVX2 asm code, improved from 7849.40 cycles (SSE version) to 4922.20 cycles
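For reviewers, this is the per-pixel operation the kernel vectorizes, written as a minimal C sketch. It is derived from the weight_sp declaration below and the constants visible in the asm (the 0x2000 bias matches pw_2000, and the [0, 255] clip assumes an 8-bit pixel build); weight_sp_ref is an illustrative name, not the x265 C primitive itself.

    #include <stdint.h>

    typedef uint8_t pixel;                /* assumes an 8-bit build */

    static void weight_sp_ref(int16_t *src, pixel *dst, intptr_t srcStride,
                              intptr_t dstStride, int width, int height,
                              int w0, int round, int shift, int offset)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                /* add the 0x2000 (8192) interim bias back, then weight */
                int v = ((w0 * (src[x] + 8192) + round) >> shift) + offset;
                dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            src += srcStride;             /* srcStride counts int16_t elements */
            dst += dstStride;
        }
    }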
diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 22 23:16:13 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 23 16:17:54 2014 +0530
@@ -1792,6 +1792,7 @@
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
p.weight_pp = x265_weight_pp_avx2;
+ p.weight_sp = x265_weight_sp_avx2;
#if X86_64
p.dct[DCT_8x8] = x265_dct8_avx2;
diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Oct 22 23:16:13 2014 -0500
+++ b/source/common/x86/pixel-util.h Thu Oct 23 16:17:54 2014 +0530
@@ -60,6 +60,7 @@
void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_sp_avx2(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Oct 22 23:16:13 2014 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Oct 23 16:17:54 2014 +0530
@@ -1498,6 +1498,90 @@
RET
+INIT_YMM avx2
+%if ARCH_X86_64
+cglobal weight_sp, 6, 7+2, 7
+ %define tmp_r0 r7
+ %define tmp_r1 r8
+%else ; ARCH_X86_64 = 0
+cglobal weight_sp, 6, 7, 7, 0-(2*4)
+ %define tmp_r0 [(rsp + 0 * 4)]
+ %define tmp_r1 [(rsp + 1 * 4)]
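+ ; x86_32 has too few free GPRs to keep the row pointers resident,
+ ; so they are spilled to the 8 bytes of stack reserved above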
+%endif ; ARCH_X86_64
+
+ movd xm0, r6m ; m0 = [w0]
+ movd xm1, r7m ; m1 = [round]
+ punpcklwd xm0, xm1
+ pshufd xm0, xm0, 0 ; xm0 = [w0, round] word pairs
+ vinserti128 m0, m0, xm0, 1 ; the docs say (pshufd + vinserti128) can collapse to vpbroadcastd m0, xm0, but that form currently fails to build; needs investigation
+ movd xm1, r8m ; m1 = [shift]
+ vpbroadcastd m2, r9m ; m2 = [offset]
+ vpbroadcastw m3, [pw_1]
+ vpbroadcastw m4, [pw_2000]
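+
+ ; register roles below: m0 = [w0, round] word pairs, xm1 = shift count,
+ ; m2 = offset dwords, m3 = words of 1 (pmaddwd partner),
+ ; m4 = 0x2000 words (interim-pixel bias)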
+
+ add r2d, r2d ; srcStride counts int16_t elements; convert to bytes
+
+.loopH:
+ mov r6d, r4d ; r6d = pixels remaining in this row
+
+ ; save old src and dst
+ mov tmp_r0, r0
+ mov tmp_r1, r1
+.loopW:
+ movu m5, [r0] ; load 16 int16_t coefficients
+ paddw m5, m4 ; add the 0x2000 interim bias back
+
+ punpcklwd m6, m5, m3 ; interleave with words of 1 ...
+ pmaddwd m6, m0 ; ... so pmaddwd yields w0*x + round per dword
+ psrad m6, xm1 ; >> shift
+ paddd m6, m2 ; + offset
+
+ punpckhwd m5, m3 ; same for the high words of each lane
+ pmaddwd m5, m0
+ psrad m5, xm1
+ paddd m5, m2
+
+ packssdw m6, m5 ; repack to words within each 128-bit lane
+ vextracti128 xm5, m6, 1
+ packuswb xm6, xm5 ; clip to [0, 255]; xm6 = 16 output pixels in order
+
+ sub r6d, 16
+ jl .width8
+ movu [r1], xm6
+ je .nextH
+ add r0, 32
+ add r1, 16
+
+ jmp .loopW
+
+.width8:
+ add r6d, 16 ; r6d = pixels remaining in this row (1..15)
+ cmp r6d, 8
+ jl .width4
+ movq [r1], xm6
+ je .nextH
+ psrldq xm6, 8 ; bring the upper 8 result bytes down
+ sub r6d, 8
+ add r1, 8
+
+.width4:
+ cmp r6d, 4
+ jl .width2
+ movd [r1], xm6
+ je .nextH
+ add r1, 4
+ pshufd xm6, xm6, 1 ; bring the next result dword down
+
+.width2:
+ pextrw [r1], xm6, 0
+
+.nextH:
+ mov r0, tmp_r0 ; restore the row start pointers,
+ mov r1, tmp_r1
+ lea r0, [r0 + r2] ; then step both to the next row
+ lea r1, [r1 + r3]
+
+ dec r5d
+ jnz .loopH
+ RET
+
;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
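A note on the punpcklwd/pmaddwd idiom in the inner loop above: interleaving each coefficient word with the constant 1 and multiplying by [w0, round] word pairs makes a single pmaddwd produce w0*x + round per 32-bit lane. Below is a minimal, self-contained sketch of the idiom using 128-bit SSE2 intrinsics for brevity (the AVX2 code applies the same trick per 128-bit lane); the sample values and weight parameters are made up for illustration:

    #include <stdio.h>
    #include <stdint.h>
    #include <emmintrin.h>

    int main(void)
    {
        int16_t x[8] = { -8192, -1, 0, 1, 100, 1000, 8191, -5000 };
        const int16_t w0 = 64, round = 32;

        __m128i vx   = _mm_loadu_si128((const __m128i *)x);
        __m128i ones = _mm_set1_epi16(1);
        /* [w0, round] word pairs, like m0 in the asm above */
        __m128i wr   = _mm_set1_epi32(((int32_t)(uint16_t)round << 16) | (uint16_t)w0);

        __m128i lo = _mm_unpacklo_epi16(vx, ones); /* [x0, 1, x1, 1, ...] */
        __m128i hi = _mm_unpackhi_epi16(vx, ones);
        lo = _mm_madd_epi16(lo, wr);               /* x*w0 + 1*round per dword */
        hi = _mm_madd_epi16(hi, wr);

        int32_t out[8];
        _mm_storeu_si128((__m128i *)out, lo);
        _mm_storeu_si128((__m128i *)(out + 4), hi);

        for (int i = 0; i < 8; i++)
            printf("%6d -> %8d (scalar: %8d)\n", x[i], (int)out[i], w0 * x[i] + round);
        return 0;
    }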