[x265] [PATCH Review Only] weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version

chen chenm003 at 163.com
Thu Oct 23 22:31:05 CEST 2014


The code is correct; this review only suggests code-style improvements.
 A little tip: if you pre-adjust r2 to (r2 - width/mmsize*mmsize) before the loop, you no longer need to save and restore the original r0 each row.

At 2014-10-23 20:19:10,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1414061274 -19800
># Node ID a19f9a6e25da7f4b8b8d2f5105eaaa19df4a6529
># Parent  ce304756a6e469b94cceef930e62972bd2168e4f
>weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version
>
>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Wed Oct 22 23:16:13 2014 -0500
>+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 23 16:17:54 2014 +0530
>@@ -1792,6 +1792,7 @@
>         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
> 
>         p.weight_pp = x265_weight_pp_avx2;
>+        p.weight_sp = x265_weight_sp_avx2;
> 
> #if X86_64
>         p.dct[DCT_8x8] = x265_dct8_avx2;
>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util.h
>--- a/source/common/x86/pixel-util.h	Wed Oct 22 23:16:13 2014 -0500
>+++ b/source/common/x86/pixel-util.h	Thu Oct 23 16:17:54 2014 +0530
>@@ -60,6 +60,7 @@
> void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>+void x265_weight_sp_avx2(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
> 
> void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
>                                      const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm	Wed Oct 22 23:16:13 2014 -0500
>+++ b/source/common/x86/pixel-util8.asm	Thu Oct 23 16:17:54 2014 +0530
>@@ -1498,6 +1498,90 @@
> 
>     RET
> 
>+INIT_YMM avx2
>+%if ARCH_X86_64
>+cglobal weight_sp, 6, 7+2, 7
>+    %define tmp_r0      r7
>+    %define tmp_r1      r8
>+%else ; ARCH_X86_64 = 0
>+cglobal weight_sp, 6, 7, 7, 0-(2*4)
>+    %define tmp_r0      [(rsp + 0 * 4)]
>+    %define tmp_r1      [(rsp + 1 * 4)]

4 ==> gprsize (use the gprsize macro rather than hard-coding 4 for the stack-slot size in these offsets)
>+%endif ; ARCH_X86_64
>+
>+    movd         xm0, r6m         ; m0 = [w0]
>+    movd         xm1, r7m         ; m1 = [round]
>+    punpcklwd    xm0, xm1
>+    pshufd       xm0, xm0, 0      ; m0 = [w0 round]
>+    vinserti128  m0,  m0, xm0, 1  ; document says (pshufd + vinserti128) can be replaced with vpbroadcastd m0, xm0, but having build problem, need to investigate
>+    movd         xm1, r8m         ; m1 = [shift]
>+    vpbroadcastd m2, r9m          ; m2 = [offset]
>+    vpbroadcastw m3, [pw_1]
>+    vpbroadcastw m4, [pw_2000]
>+
>+    add         r2d, r2d
>+
>+.loopH:
>+    mov         r6d, r4d
>+
>+    ; save old src and dst
>+    mov         tmp_r0, r0
>+    mov         tmp_r1, r1
>+.loopW:
>+    movu        m5, [r0]
>+    paddw       m5, m4
>+
>+    punpcklwd   m6, m5, m3
>+    pmaddwd     m6, m0
>+    psrad       m6, xm1
>+    paddd       m6, m2
>+
>+    punpckhwd   m5, m3
>+    pmaddwd     m5, m0
>+    psrad       m5, xm1
>+    paddd       m5, m2
>+
>+    packssdw     m6, m5
>+    vextracti128 xm5, m6, 1
>+    packuswb     xm6, xm5
>+
>+    sub         r6d, 16
>+    jl          .width8
>+    movu        [r1], xm6
>+    je          .nextH
>+    add         r0, 32
>+    add         r1, 16

32 ==> mmsize (use the mmsize macro rather than hard-coding 32 for the per-iteration source advance)
 
>+    jmp         .loopW
>+
>+.width8:
>+    cmp         r6d, -8
>+    jl          .width4
>+    movq        [r1], xm6
>+    je          .nextH
>+    add         r1, 8
>+
>+.width4:
>+    cmp         r6d, -4
>+    jl          .width2
>+    movd        [r1], xm6
>+    je          .nextH
>+    add         r1, 4
>+    pshufd      m6, m6, 1
>+
>+.width2:
>+    pextrw      [r1], xm6, 0
>+
>+.nextH:
>+    mov         r0, tmp_r0
>+    mov         r1, tmp_r1
>+    lea         r0, [r0 + r2]
>+    lea         r1, [r1 + r3]
>+
>+    dec         r5d
>+    jnz         .loopH
>+    RET
>+
> ;-----------------------------------------------------------------
> ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------

 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141024/bee280fb/attachment-0001.html>


More information about the x265-devel mailing list