<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Fri, Mar 21, 2014 at 8:01 PM, chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br>


<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">At 2014-03-21 13:35:31,<a href="mailto:murugan@multicorewareinc.com">murugan@multicorewareinc.com</a> wrote:<br>


<div class="">># HG changeset patch<br>


># User Murugan Vairavel &lt;<a href="mailto:murugan@multicorewareinc.com">murugan@multicorewareinc.com</a>&gt;<br>


># Date 1395379456 -19800<br>


>#      Fri Mar 21 10:54:16 2014 +0530<br>


># Node ID 29728f7728591116192575d411ef2db2dff49c18<br>


># Parent  435e50b2b92c83e10fdb2bd86bc8e8df91b7338b<br>


>asm: code for input pixel upShift/downShift<br>


><br>


</div><div class="">>+; Input 10bpp, Output 8bpp, width is multiple of 16<br>


>+;------------------------------------------------------------------------------------------------------------------------<br>


>+;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)<br>


>+;------------------------------------------------------------------------------------------------------------------------<br>


>+INIT_XMM sse2<br>


>+cglobal downShift_10, 7,7,3<br>


>+    movd        m0, r6d        ; m0 = shift<br>


>+    add         r1, r1<br>


>+    dec         r5d<br>


>+<br>


>+.loopH:<br>


>+    xor         r6, r6<br>


</div>Tip: r6 is an offset; if you first prepare 'r1=r1-r4', you can operate directly on r0.<br></blockquote><div>But the number of pixels processed in each row is not equal to the width (r4) when the width is not a multiple of 16. If I do it as above, an output mismatch will occur. </div>


<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<div class=""><br>


>+.loopW:<br>


>+    movu        m1, [r0 + r6 * 2]<br>


>+    movu        m2, [r0 + r6 * 2 + 16]<br>


>+    psrlw       m1, m0<br>


>+    psrlw       m2, m0<br>


>+    packuswb    m1, m2<br>


>+    movu        [r2 + r6], m1<br>


>+<br>


>+    add         r6, 16<br>


>+    cmp         r6d, r4d<br>


>+    jl          .loopW<br>


>+<br>


>+    ; move to next row<br>


>+    lea         r0, [r0 + r1]<br>


>+    lea         r2, [r2 + r3]<br>


</div>add r0,r1<br>


add r2,r3<br></blockquote><div>I will modify that. </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<div class=""><br>


>+    dec         r5d<br>


>+    jnz         .loopH<br>


>+<br>


>+;processing last row of every frame [To handle width which not a multiple of 16]<br>


>+<br>


>+.loop16:<br>


>+    movu        m1, [r0]<br>


>+    movu        m2, [r0 + 16]<br>


>+    psrlw       m1, m0<br>


>+    psrlw       m2, m0<br>


>+    packuswb    m1, m2<br>


>+    movu        [r2], m1<br>


>+<br>


>+    add         r0, 2 * mmsize<br>


>+    add         r2, mmsize<br>


>+    sub         r4d, 16<br>


>+    jz          .end<br>


>+    cmp         r4d, 15<br>


>+    jg          .loop16<br>


<br>


</div>--> (X > 16) && (X >15) ??<br></blockquote><div>What do you mean by this? </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<div><div class="h5"><br>


>+    cmp         r4d, 8<br>


>+    jl          .process4<br>


>+    movu        m1, [r0]<br>


>+    psrlw       m1, m0<br>


>+    packuswb    m1, m1<br>


>+    movh        [r2], m1<br>


>+<br>


>+    add         r0, mmsize<br>


>+    add         r2, 8<br>


>+    sub         r4d, 8<br>


>+    jz          .end<br>


>+<br>


>+.process4:<br>


>+    cmp         r4d, 4<br>


>+    jl          .process2<br>


>+    movh        m1,[r0]<br>


>+    psrlw       m1, m0<br>


>+    packuswb    m1, m1<br>


>+    movd        [r2], m1<br>


>+<br>


>+    add         r0, 8<br>


>+    add         r2, 4<br>


>+    sub         r4d, 4<br>


>+    jz          .end<br>


>+<br>


>+.process2:<br>


>+    cmp         r4d, 2<br>


>+    jl          .process1<br>


>+    movd        m1, [r0]<br>


>+    psrlw       m1, m0<br>


>+    packuswb    m1, m1<br>


>+    movd        r6, m1<br>


>+    mov         [r2], r6w<br>


>+<br>


>+    add         r0, 4<br>


>+    add         r2, 2<br>


>+    sub         r4d, 2<br>


>+    jz          .end<br>


>+<br>


>+.process1:<br>


>+    movd        m1, [r0]<br>


>+    psrlw       m1, m0<br>


>+    packuswb    m1, m1<br>


>+    movd        r6, m1<br>


>+    mov         [r2], r6b<br>


>+.end:<br>


>+    RET<br>


<br>


</div></div>The (4, 2, 1)-pixel paths could share the calculation code.<br></blockquote><div>Do you mean defining a macro for that? </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<div class=""><br>


<br>


>+; Input 8bpp, Output 16bpp, width is multiple of 16<br>


>+;-----------------------------------------------------------------------------------------------------<br>


>+;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height)<br>


>+;-----------------------------------------------------------------------------------------------------<br>


>+INIT_XMM sse4<br>


</div>>+cglobal upShift_8, 6,7,2<br>


upShift_8to10<br>


<div><div class="h5">>+<br>


>+    add         r3, r3<br>


>+    dec         r5d<br>


>+<br>


>+.loopH:<br>


>+    xor         r6, r6<br>


>+.loopW:<br>


>+    pmovzxbw    m0,[r0 + r6]<br>


>+    pmovzxbw    m1,[r0 + r6 + 8]<br>


>+    psllw       m0, 2<br>


>+    psllw       m1, 2<br>


>+    movu        [r2 + r6 * 2], m0<br>


>+    movu        [r2 + r6 * 2 + 16], m1<br>


>+<br>


>+    add         r6, 16<br>


>+    cmp         r6d, r4d<br>


>+    jl          .loopW<br>


>+<br>


>+    ; move to next row<br>


>+    lea         r0, [r0 + r1]<br>


>+    lea         r2, [r2 + r3]<br>


>+    dec         r5d<br>


>+    jnz         .loopH<br>


>+<br>


>+;processing last row of every frame [To handle width which not a multiple of 16]<br>


<br>


</div></div>same comment as previous module<br>


<div><div class="h5"><br>


>+.loop16:<br>


>+    pmovzxbw    m0,[r0]<br>


>+    pmovzxbw    m1,[r0 + 8]<br>


>+    psllw       m0, 2<br>


>+    psllw       m1, 2<br>


>+    movu        [r2], m0<br>


>+    movu        [r2 + 16], m1<br>


>+<br>


>+    add         r0, mmsize<br>


>+    add         r2, 2 * mmsize<br>


>+    sub         r4d, 16<br>


>+    jz          .end<br>


>+    cmp         r4d, 15<br>


>+    jg          .loop16<br>


>+<br>


>+    cmp         r4d, 8<br>


>+    jl          .process4<br>


>+    pmovzxbw    m0,[r0]<br>


>+    psllw       m0, 2<br>


>+    movu        [r2], m0<br>


>+<br>


>+    add         r0, 8<br>


>+    add         r2, mmsize<br>


>+    sub         r4d, 8<br>


>+    jz          .end<br>


>+<br>


>+.process4:<br>


>+    cmp         r4d, 4<br>


>+    jl          .process2<br>


>+    movd        m0,[r0]<br>


>+    pmovzxbw    m0,m0<br>


>+    psllw       m0, 2<br>


>+    movh        [r2], m0<br>


>+<br>


>+    add         r0, 4<br>


>+    add         r2, 8<br>


>+    sub         r4d, 4<br>


>+    jz          .end<br>


>+<br>


>+.process2:<br>


>+    cmp         r4d, 2<br>


>+    jl          .process1<br>


>+    movzx       r6d, byte [r0]<br>


>+    shl         r6d, 2<br>


>+    mov         [r2], r6w<br>


>+    movzx       r6d, byte [r0 + 1]<br>


>+    shl         r6d, 2<br>


>+    mov         [r2 + 2], r6w<br>


>+<br>


>+    add         r0, 2<br>


>+    add         r2, 4<br>


>+    sub         r4d, 2<br>


>+    jz          .end<br>


>+<br>


>+.process1:<br>


>+    movzx       r6d, byte [r0]<br>


>+    shl         r6d, 2<br>


>+    mov         [r2], r6w<br>


>+.end:<br>


>+    RET<br>


</div></div>_______________________________________________<br>


x265-devel mailing list<br>


<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>


<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>


</blockquote></div><br><br clear="all"><div><br></div>-- <br><div dir="ltr">With Regards,<div><br></div><div>Murugan. V</div><div>+919659287478</div></div>


</div></div>