<div dir="ltr">Kindly dont push this patch. Need to add ARCH_X86_64 guard.<div><br></div><div>Thanks,</div><div>Aasaipriya</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Jun 30, 2015 at 5:11 PM,  <span dir="ltr"><<a href="mailto:aasaipriya@multicorewareinc.com" target="_blank">aasaipriya@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class=""># HG changeset patch<br>
# User Aasaipriya Chandran <<a href="mailto:aasaipriya@multicorewareinc.com">aasaipriya@multicorewareinc.com</a>><br>
</span># Date 1435664485 -19800<br>
#      Tue Jun 30 17:11:25 2015 +0530<br>
# Node ID 0cc8a97207523ab1d1c14ee5bcd8c808be66f446<br>
# Parent  b1301944894051b9641006797e4d6253b277f3e4<br>
<span class="">asm: avx2 code for weight_sp() 16bpp<br>
<br>
</span> avx2: weight_sp  12.10x   4537.14         54879.57<br>
<span class=""> sse4: weight_sp  6.48x    8163.87         52870.36<br>
<br>
</span>diff -r b13019448940 -r 0cc8a9720752 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp      Mon Jun 29 17:19:07 2015 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp      Tue Jun 30 17:11:25 2015 +0530<br>
@@ -1522,6 +1522,7 @@<br>
<span class="">         p.scale1D_128to64 = PFX(scale1D_128to64_avx2);<br>
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);<br>
         p.weight_pp = PFX(weight_pp_avx2);<br>
+        p.weight_sp = PFX(weight_sp_avx2);<br>
         p.sign = PFX(calSign_avx2);<br>
</span>         p.planecopy_cp = PFX(upShift_8_avx2);<br>
<br>
diff -r b13019448940 -r 0cc8a9720752 source/common/x86/pixel-util8.asm<br>
--- a/source/common/x86/pixel-util8.asm Mon Jun 29 17:19:07 2015 +0530<br>
+++ b/source/common/x86/pixel-util8.asm Tue Jun 30 17:11:25 2015 +0530<br>
@@ -1669,8 +1669,128 @@<br>
<span class="">     dec         r5d<br>
     jnz         .loopH<br>
     RET<br>
-<br>
-%if ARCH_X86_64<br>
+%endif<br>
+<br>
+<br>
+%if HIGH_BIT_DEPTH && ARCH_X86_64<br>
+INIT_YMM avx2<br>
</span>+cglobal weight_sp, 6,10,9<br>
<span class="">+    mova                      m1, [pw_1023]<br>
+    mova                      m2, [pw_1]<br>
+    mov                       r6d, r7m<br>
</span><div><div class="h5">+    shl                       r6d, 16<br>
+    or                        r6d, r6m<br>
+    vpbroadcastd              m3, r6d      ; m3 = [round w0]<br>
+    movd                      xm4, r8m     ; m4 = [shift]<br>
+    vpbroadcastd              m5, r9m      ; m5 = [offset]<br>
+<br>
+    ; correct row stride<br>
+    add                       r3d, r3d<br>
+    add                       r2d, r2d<br>
+    mov                       r6d, r4d<br>
+    and                       r6d, ~(mmsize / SIZEOF_PIXEL - 1)<br>
+    sub                       r3d, r6d<br>
+    sub                       r3d, r6d<br>
+    sub                       r2d, r6d<br>
+    sub                       r2d, r6d<br>
+<br>
+    ; generate partial width mask (MUST BE IN YMM0) -- NOTE(review): m0 is never read below; the tail is handled by the explicit partial stores. Confirm and drop this block.<br>
+    mov                       r6d, r4d<br>
+    and                       r6d, (mmsize / SIZEOF_PIXEL - 1)<br>
+    movd                      xm0, r6d<br>
+    pshuflw                   m0, m0, 0<br>
+    punpcklqdq                m0, m0<br>
+    vinserti128               m0, m0, xm0, 1<br>
+    pcmpgtw                   m0, [pw_0_15]<br>
+<br>
+.loopH:<br>
+    mov                       r6d, r4d<br>
+<br>
+.loopW:<br>
+    movu                      m6, [r0]<br>
+    paddw                     m6, [pw_2000]<br>
+<br>
+    punpcklwd                 m7, m6, m2<br>
+    pmaddwd                   m7, m3       ;(round w0)<br>
+    psrad                     m7, xm4      ;(shift)<br>
+    paddd                     m7, m5       ;(offset)<br>
+<br>
+    punpckhwd                 m6, m2<br>
+    pmaddwd                   m6, m3<br>
+    psrad                     m6, xm4<br>
+    paddd                     m6, m5<br>
+<br>
+    packusdw                  m7, m6<br>
+    pminuw                    m7, m1<br>
+<br>
+    sub                       r6d, (mmsize / SIZEOF_PIXEL)<br>
+    jl                        .width14<br>
+    movu                      [r1], m7<br>
+    lea                       r0, [r0 + mmsize]<br>
+    lea                       r1, [r1 + mmsize]<br>
+    je                        .nextH<br>
+    jmp                       .loopW<br>
+<br>
+.width14:<br>
+    add                       r6d, 16<br>
+    cmp                       r6d, 14<br>
+    jl                        .width12<br>
+    movu                      [r1], xm7<br>
+    vextracti128              xm8, m7, 1<br>
+    movq                      [r1 + 16], xm8<br>
+    pextrd                    [r1 + 24], xm8, 2<br>
+    je                        .nextH<br>
+<br>
+.width12:<br>
+    cmp                       r6d, 12<br>
+    jl                        .width10<br>
+    movu                      [r1], xm7<br>
+    vextracti128              xm8, m7, 1<br>
+    movq                      [r1 + 16], xm8<br>
+    je                        .nextH<br>
+<br>
+.width10:<br>
+    cmp                       r6d, 10<br>
+    jl                        .width8<br>
+    movu                      [r1], xm7<br>
+    vextracti128              xm8, m7, 1<br>
+    movd                      [r1 + 16], xm8<br>
+    je                        .nextH<br>
+<br>
+.width8:<br>
+    cmp                       r6d, 8<br>
+    jl                        .width6<br>
+    movu                      [r1], xm7<br>
+    je                        .nextH<br>
+<br>
+.width6:<br>
+    cmp                       r6d, 6<br>
+    jl                        .width4<br>
+    movq                      [r1], xm7<br>
+    pextrd                    [r1 + 8], xm7, 2<br>
+    je                        .nextH<br>
+<br>
+.width4:<br>
+    cmp                       r6d, 4<br>
+    jl                        .width2<br>
+    movq                      [r1], xm7<br>
+    je                        .nextH<br>
+    add                       r1, 8<br>
+    pshufd                    m7, m7, 2<br>
+    jmp                       .width2<br>
+<br>
+.width2:<br>
+    movd                      [r1], xm7<br>
+<br>
+.nextH:<br>
+    add                       r0, r2<br>
+    add                       r1, r3<br>
+<br>
+    dec                       r5d<br>
+    jnz                       .loopH<br>
+    RET<br>
+<br>
+%elif ARCH_X86_64<br>
 INIT_YMM avx2<br>
 cglobal weight_sp, 6, 9, 7<br>
     mov             r7d, r7m<br>
</div></div>@@ -1747,8 +1867,6 @@<br>
<div class="HOEnZb"><div class="h5">     jnz             .loopH<br>
     RET<br>
 %endif<br>
-%endif  ; end of (HIGH_BIT_DEPTH == 0)<br>
-<br>
<br>
 ;-----------------------------------------------------------------<br>
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)<br>
</div></div></blockquote></div><br></div>