[x265] [PATCH Review Only] weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version
chen
chenm003 at 163.com
Thu Oct 23 22:31:05 CEST 2014
The code is correct; my comments only suggest code-style improvements.
A small tip: if you precompute r2 as (r2 - width/mmsize*mmsize), you no longer need to keep the original r0.
At 2014-10-23 20:19:10,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1414061274 -19800
># Node ID a19f9a6e25da7f4b8b8d2f5105eaaa19df4a6529
># Parent ce304756a6e469b94cceef930e62972bd2168e4f
>weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version
>
>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Wed Oct 22 23:16:13 2014 -0500
>+++ b/source/common/x86/asm-primitives.cpp Thu Oct 23 16:17:54 2014 +0530
>@@ -1792,6 +1792,7 @@
> p.scale1D_128to64 = x265_scale1D_128to64_avx2;
>
> p.weight_pp = x265_weight_pp_avx2;
>+ p.weight_sp = x265_weight_sp_avx2;
>
> #if X86_64
> p.dct[DCT_8x8] = x265_dct8_avx2;
>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util.h
>--- a/source/common/x86/pixel-util.h Wed Oct 22 23:16:13 2014 -0500
>+++ b/source/common/x86/pixel-util.h Thu Oct 23 16:17:54 2014 +0530
>@@ -60,6 +60,7 @@
> void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>+void x265_weight_sp_avx2(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>
> void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
> const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm Wed Oct 22 23:16:13 2014 -0500
>+++ b/source/common/x86/pixel-util8.asm Thu Oct 23 16:17:54 2014 +0530
>@@ -1498,6 +1498,90 @@
>
> RET
>
>+INIT_YMM avx2
>+%if ARCH_X86_64
>+cglobal weight_sp, 6, 7+2, 7
>+ %define tmp_r0 r7
>+ %define tmp_r1 r8
>+%else ; ARCH_X86_64 = 0
>+cglobal weight_sp, 6, 7, 7, 0-(2*4)
>+ %define tmp_r0 [(rsp + 0 * 4)]
>+ %define tmp_r1 [(rsp + 1 * 4)]
[Review note: replace the hard-coded 4 with gprsize.]
>+%endif ; ARCH_X86_64
>+
>+ movd xm0, r6m ; m0 = [w0]
>+ movd xm1, r7m ; m1 = [round]
>+ punpcklwd xm0, xm1
>+ pshufd xm0, xm0, 0 ; m0 = [w0 round]
>+ vinserti128 m0, m0, xm0, 1 ; document says (pshufd + vinserti128) can be replaced with vpbroadcastd m0, xm0, but having build problem, need to investigate
>+ movd xm1, r8m ; m1 = [shift]
>+ vpbroadcastd m2, r9m ; m2 = [offset]
>+ vpbroadcastw m3, [pw_1]
>+ vpbroadcastw m4, [pw_2000]
>+
>+ add r2d, r2d
>+
>+.loopH:
>+ mov r6d, r4d
>+
>+ ; save old src and dst
>+ mov tmp_r0, r0
>+ mov tmp_r1, r1
>+.loopW:
>+ movu m5, [r0]
>+ paddw m5, m4
>+
>+ punpcklwd m6, m5, m3
>+ pmaddwd m6, m0
>+ psrad m6, xm1
>+ paddd m6, m2
>+
>+ punpckhwd m5, m3
>+ pmaddwd m5, m0
>+ psrad m5, xm1
>+ paddd m5, m2
>+
>+ packssdw m6, m5
>+ vextracti128 xm5, m6, 1
>+ packuswb xm6, xm5
>+
>+ sub r6d, 16
>+ jl .width8
>+ movu [r1], xm6
>+ je .nextH
>+ add r0, 32
>+ add r1, 16
[Review note: replace the hard-coded 32 with mmsize.]
>+ jmp .loopW
>+
>+.width8:
>+ cmp r6d, -8
>+ jl .width4
>+ movq [r1], xm6
>+ je .nextH
>+ add r1, 8
>+
>+.width4:
>+ cmp r6d, -4
>+ jl .width2
>+ movd [r1], xm6
>+ je .nextH
>+ add r1, 4
>+ pshufd m6, m6, 1
>+
>+.width2:
>+ pextrw [r1], xm6, 0
>+
>+.nextH:
>+ mov r0, tmp_r0
>+ mov r1, tmp_r1
>+ lea r0, [r0 + r2]
>+ lea r1, [r1 + r3]
>+
>+ dec r5d
>+ jnz .loopH
>+ RET
>+
> ;-----------------------------------------------------------------
> ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141024/bee280fb/attachment-0001.html>
More information about the x265-devel
mailing list