<div dir="ltr">Yes, I will flag it in our AVX2 development progress list.<div><br></div><div>Regards</div><div>Sumalatha</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Apr 2, 2015 at 3:32 PM, chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>We can use this version and improve it in the future.</div>
<div>Please add a flag to the list so that we remember to improve it.</div>
<div><div><div class="h5"><br>At 2015-04-02 17:51:08,<a href="mailto:sumalatha@multicorewareinc.com" target="_blank">sumalatha@multicorewareinc.com</a> wrote:<br>># HG changeset patch<br>># User Sumalatha Polureddy<br>># Date 1427968258 -19800<br>>#      Thu Apr 02 15:20:58 2015 +0530<br>># Node ID 7f976e1e89c5940a8bb2f5b965ebd9ed6e6948a6<br>># Parent  ac85c775620f1dcb0df056874633cbf916098bd2<br>>asm: avx2 code for weight_sp() for 8bpp<br>><br>>sse4<br>>weight_sp  16.40x   7768.71         127369.20<br>><br>>avx2<br>>weight_sp  25.83x   4918.74         127040.17<br>><br>>diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/asm-primitives.cpp<br>>--- a/source/common/x86/asm-primitives.cpp        Tue Mar 31 20:04:28 2015 -0500<br>>+++ b/source/common/x86/asm-primitives.cpp    Thu Apr 02 15:20:58 2015 +0530<br>>@@ -1604,6 +1604,7 @@<br>> <br>>         p.scale1D_128to64 = x265_scale1D_128to64_avx2;<br>>         p.weight_pp = x265_weight_pp_avx2;<br>>+        p.weight_sp = x265_weight_sp_avx2;<br>> <br>>         // intra_pred functions<br>>         <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2;<br>>diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel-util8.asm<br>>--- a/source/common/x86/pixel-util8.asm        Tue Mar 31 20:04:28 2015 -0500<br>>+++ b/source/common/x86/pixel-util8.asm       Thu Apr 02 15:20:58 2015 +0530<br>>@@ -1492,6 +1492,84 @@<br>>     dec         r5d<br>>     jnz         .loopH<br>>     RET<br>>+<br>>+%if ARCH_X86_64<br>>+INIT_YMM avx2<br>>+cglobal weight_sp, 6, 9, 7<br>>+    mov             r7d, r7m<br>>+    shl             r7d, 16<br>>+    or              r7d, r6m<br>>+    vpbroadcastd    m0, r7d            ; m0 = times 8 dw w0, round<br>>+    movd            xm1, r8m            ; m1 = [shift]<br>>+    vpbroadcastd    m2, r9m            ; m2 = times 16 dw offset<br>>+    vpbroadcastw    m3, [pw_1]<br>>+    vpbroadcastw    m4, [pw_2000]<br>>+<br>>+    add             r2d, r2d          
  ; 2 * srcstride<br>>+<br>>+    mov             r7, r0<br>>+    mov             r8, r1<br>>+.loopH:<br>>+    mov             r6d, r4d            ; width<br>>+<br>>+    ; save old src and dst<br>>+    mov             r0, r7              ; src<br>>+    mov             r1, r8              ; dst<br>>+.loopW:<br>>+    movu            m5, [r0]<br>>+    paddw           m5, m4<br>>+<br>>+    punpcklwd       m6,m5, m3<br>>+    pmaddwd         m6, m0<br>>+    psrad           m6, xm1<br>>+    paddd           m6, m2<br>>+<br>>+    punpckhwd       m5, m3<br>>+    pmaddwd         m5, m0<br>>+    psrad           m5, xm1<br>>+    paddd           m5, m2<br>>+<br>>+    packssdw        m6, m5<br>>+    packuswb        m6, m6<br>>+    vpermq          m6, m6, 10001000b<br>>+<br>>+    sub             r6d, 16<br>>+    jl              .width8<br>>+    movu            [r1], xm6<br>>+    je              .nextH<br>>+    add             r0, 32<br>>+    add             r1, 16<br>>+    jmp             .loopW<br>>+<br>>+.width8:<br>>+    add             r6d, 16<br>>+    cmp             r6d, 8<br>>+    jl              .width4<br>>+    movq            [r1], xm6<br>>+    je              .nextH<br>>+    psrldq          m6, 8<br>>+    sub             r6d, 8<br>>+    add             r1, 8<br>>+<br>>+.width4:<br>>+    cmp             r6d, 4<br>>+    jl              .width2<br>>+    movd            [r1], xm6<br>>+    je              .nextH<br>>+    add             r1, 4<br>>+    pshufd          m6, m6, 1<br>>+<br>>+.width2:<br>>+    pextrw          [r1], xm6, 0<br>>+<br>>+.nextH:<br>>+    lea             r7, [r7 + r2]<br>>+    lea             r8, [r8 + r3]<br>>+<br>>+    dec             r5d<br>>+    jnz             .loopH<br>>+    RET<br>>+%endif<br>> %endif  ; end of (HIGH_BIT_DEPTH == 0)<br>>     <br>> <br>>diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel.h<br>>--- a/source/common/x86/pixel.h        Tue Mar 31 20:04:28 2015 -0500<br>>+++ b/source/common/x86/pixel.h       Thu Apr 02 
15:20:58 2015 +0530<br>>@@ -272,6 +272,7 @@<br>> int x265_psyCost_ss_16x16_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);<br>> int x265_psyCost_ss_32x32_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);<br>> int x265_psyCost_ss_64x64_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);<br>>+void x265_weight_sp_avx2(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);<br>> <br>> #undef DECL_PIXELS<br>> #undef DECL_HEVC_SSD<br></div></div>>_______________________________________________<br>>x265-devel mailing list<br>><a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>><a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br></div></div><br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>