<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div style="color: rgb(0, 0, 0); line-height: 1.7; font-family: arial; font-size: 14px;">The code is correct; it just needs some code-style improvements.<div>A little tip: if you precompute r2 as (r2 - width/mmsize*mmsize), you don't need to keep the original r0 anymore.</div><pre><br>At 2014-10-23 20:19:10, <a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a> wrote:

># HG changeset patch

># User Praveen Tiwari

># Date 1414061274 -19800

># Node ID a19f9a6e25da7f4b8b8d2f5105eaaa19df4a6529

># Parent  ce304756a6e469b94cceef930e62972bd2168e4f

>weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version

>

>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/asm-primitives.cpp

>--- a/source/common/x86/asm-primitives.cpp     Wed Oct 22 23:16:13 2014 -0500

>+++ b/source/common/x86/asm-primitives.cpp     Thu Oct 23 16:17:54 2014 +0530

>@@ -1792,6 +1792,7 @@

>         p.scale1D_128to64 = x265_scale1D_128to64_avx2;

> 

>         p.weight_pp = x265_weight_pp_avx2;

>+        p.weight_sp = x265_weight_sp_avx2;

> 

> #if X86_64

>         p.dct[DCT_8x8] = x265_dct8_avx2;

>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util.h

>--- a/source/common/x86/pixel-util.h   Wed Oct 22 23:16:13 2014 -0500

>+++ b/source/common/x86/pixel-util.h   Thu Oct 23 16:17:54 2014 +0530

>@@ -60,6 +60,7 @@

> void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);

> void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);

> void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);

>+void x265_weight_sp_avx2(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);

> 

> void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,

>                                      const uint8_t * pix2, intptr_t stride2, int sums[2][4]);

>diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util8.asm

>--- a/source/common/x86/pixel-util8.asm        Wed Oct 22 23:16:13 2014 -0500

>+++ b/source/common/x86/pixel-util8.asm        Thu Oct 23 16:17:54 2014 +0530

>@@ -1498,6 +1498,90 @@

> 

>     RET

> 

>+INIT_YMM avx2

>+%if ARCH_X86_64

>+cglobal weight_sp, 6, 7+2, 7

>+    %define tmp_r0      r7

>+    %define tmp_r1      r8

>+%else ; ARCH_X86_64 = 0

>+cglobal weight_sp, 6, 7, 7, 0-(2*4)

>+    %define tmp_r0      [(rsp + 0 * 4)]

>+    %define tmp_r1      [(rsp + 1 * 4)]

</pre><pre>4 ==> gprsize (use gprsize instead of the hard-coded 4 in the two stack offsets above)</pre><pre>>+%endif ; ARCH_X86_64

>+

>+    movd         xm0, r6m         ; m0 = [w0]

>+    movd         xm1, r7m         ; m1 = [round]

>+    punpcklwd    xm0, xm1

>+    pshufd       xm0, xm0, 0      ; m0 = [w0 round]

>+    vinserti128  m0,  m0, xm0, 1  ; document says (pshufd + vinserti128) can be replaced with vpbroadcastd m0, xm0, but having build problem, need to investigate

>+    movd         xm1, r8m         ; m1 = [shift]

>+    vpbroadcastd m2, r9m          ; m2 = [offset]

>+    vpbroadcastw m3, [pw_1]

>+    vpbroadcastw m4, [pw_2000]

>+

>+    add         r2d, r2d

>+

>+.loopH:

>+    mov         r6d, r4d

>+

>+    ; save old src and dst

>+    mov         tmp_r0, r0

>+    mov         tmp_r1, r1

>+.loopW:

>+    movu        m5, [r0]

>+    paddw       m5, m4

>+

>+    punpcklwd   m6, m5, m3

>+    pmaddwd     m6, m0

>+    psrad       m6, xm1

>+    paddd       m6, m2

>+

>+    punpckhwd   m5, m3

>+    pmaddwd     m5, m0

>+    psrad       m5, xm1

>+    paddd       m5, m2

>+

>+    packssdw     m6, m5

>+    vextracti128 xm5, m6, 1

>+    packuswb     xm6, xm5

>+

>+    sub         r6d, 16

>+    jl          .width8

>+    movu        [r1], xm6

>+    je          .nextH

>+    add         r0, 32

>+    add         r1, 16

</pre><pre>32 ==> mmsize (use mmsize instead of the hard-coded 32 in "add r0, 32" above)</pre><pre> </pre><pre>>+    jmp         .loopW

>+

>+.width8:

>+    cmp         r6d, -8

>+    jl          .width4

>+    movq        [r1], xm6

>+    je          .nextH

>+    add         r1, 8

>+

>+.width4:

>+    cmp         r6d, -4

>+    jl          .width2

>+    movd        [r1], xm6

>+    je          .nextH

>+    add         r1, 4

>+    pshufd      m6, m6, 1

>+

>+.width2:

>+    pextrw      [r1], xm6, 0

>+

>+.nextH:

>+    mov         r0, tmp_r0

>+    mov         r1, tmp_r1

>+    lea         r0, [r0 + r2]

>+    lea         r1, [r1 + r3]

>+

>+    dec         r5d

>+    jnz         .loopH

>+    RET

>+

> ;-----------------------------------------------------------------

> ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)

> ;-----------------------------------------------------------------

</pre><pre> </pre></div></div>