[x265] [PATCH 3 of 3] asm: code for input pixel upShift/downShift

Murugan Vairavel murugan at multicorewareinc.com
Mon Mar 24 11:13:30 CET 2014


On Fri, Mar 21, 2014 at 8:01 PM, chen <chenm003 at 163.com> wrote:

> At 2014-03-21 13:35:31,murugan at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Murugan Vairavel <murugan at multicorewareinc.com>
> ># Date 1395379456 -19800
> >#      Fri Mar 21 10:54:16 2014 +0530
> ># Node ID 29728f7728591116192575d411ef2db2dff49c18
> ># Parent  435e50b2b92c83e10fdb2bd86bc8e8df91b7338b
> >asm: code for input pixel upShift/downShift
> >
> >+; Input 10bpp, Output 8bpp, width is multiple of 16
>
> >+;------------------------------------------------------------------------------------------------------------------------
>
> >+;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
>
> >+;------------------------------------------------------------------------------------------------------------------------
> >+INIT_XMM sse2
> >+cglobal downShift_10, 7,7,3
> >+    movd        m0, r6d        ; m0 = shift
> >+    add         r1, r1
> >+    dec         r5d
> >+.loopH:
> >+    xor         r6, r6
> tip: r6 is only an offset; if you prepare 'r1 = r1 - r4' beforehand, you
> can operate directly on r0
>
But the number of pixels processed in each row is not equal to the width (r4)
when the width is not a multiple of 16. If I do it as suggested above, an
output mismatch will occur.

>
> >+.loopW:
> >+    movu        m1, [r0 + r6 * 2]
> >+    movu        m2, [r0 + r6 * 2 + 16]
> >+    psrlw       m1, m0
> >+    psrlw       m2, m0
> >+    packuswb    m1, m2
> >+    movu        [r2 + r6], m1
> >+
> >+    add         r6, 16
> >+    cmp         r6d, r4d
> >+    jl          .loopW
> >+
> >+    ; move to next row
> >+    lea         r0, [r0 + r1]
> >+    lea         r2, [r2 + r3]
> add r0,r1
> add r2,r3
>
I will modify that.

>
> >+    dec         r5d
> >+    jnz         .loopH
> >+
>
> >+;processing last row of every frame [To handle width which not a multiple of 16]
> >+
> >+.loop16:
> >+    movu        m1, [r0]
> >+    movu        m2, [r0 + 16]
> >+    psrlw       m1, m0
> >+    psrlw       m2, m0
> >+    packuswb    m1, m2
> >+    movu        [r2], m1
> >+
> >+    add         r0, 2 * mmsize
> >+    add         r2, mmsize
> >+    sub         r4d, 16
> >+    jz          .end
> >+    cmp         r4d, 15
> >+    jg          .loop16
>
> --> (X > 16) && (X >15) ??
>
What do you mean by this?

>
> >+    cmp         r4d, 8
> >+    jl          .process4
> >+    movu        m1, [r0]
> >+    psrlw       m1, m0
> >+    packuswb    m1, m1
> >+    movh        [r2], m1
> >+
> >+    add         r0, mmsize
> >+    add         r2, 8
> >+    sub         r4d, 8
> >+    jz          .end
> >+
> >+.process4:
> >+    cmp         r4d, 4
> >+    jl          .process2
> >+    movh        m1,[r0]
> >+    psrlw       m1, m0
> >+    packuswb    m1, m1
> >+    movd        [r2], m1
> >+
> >+    add         r0, 8
> >+    add         r2, 4
> >+    sub         r4d, 4
> >+    jz          .end
> >+
> >+.process2:
> >+    cmp         r4d, 2
> >+    jl          .process1
> >+    movd        m1, [r0]
> >+    psrlw       m1, m0
> >+    packuswb    m1, m1
> >+    movd        r6, m1
> >+    mov         [r2], r6w
> >+
> >+    add         r0, 4
> >+    add         r2, 2
> >+    sub         r4d, 2
> >+    jz          .end
> >+
> >+.process1:
> >+    movd        m1, [r0]
> >+    psrlw       m1, m0
> >+    packuswb    m1, m1
> >+    movd        r6, m1
> >+    mov         [r2], r6b
> >+.end:
> >+    RET
>
> The (4, 2, 1)-pixel paths could share the calculation code
>
Do you mean defining a macro for that?

>
>
> >+; Input 8bpp, Output 16bpp, width is multiple of 16
>
> >+;-----------------------------------------------------------------------------------------------------
>
> >+;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height)
>
> >+;-----------------------------------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal upShift_8, 6,7,2upShift_8to10
> >+
> >+    add         r3, r3
> >+    dec         r5d
> >+
> >+.loopH:
> >+    xor         r6, r6
> >+.loopW:
> >+    pmovzxbw    m0,[r0 + r6]
> >+    pmovzxbw    m1,[r0 + r6 + 8]
> >+    psllw       m0, 2
> >+    psllw       m1, 2
> >+    movu        [r2 + r6 * 2], m0
> >+    movu        [r2 + r6 * 2 + 16], m1
> >+
> >+    add         r6, 16
> >+    cmp         r6d, r4d
> >+    jl          .loopW
> >+
> >+    ; move to next row
> >+    lea         r0, [r0 + r1]
> >+    lea         r2, [r2 + r3]
> >+    dec         r5d
> >+    jnz         .loopH
> >+
>
> >+;processing last row of every frame [To handle width which not a multiple of 16]
>
> same comment as previous module
>
> >+.loop16:
> >+    pmovzxbw    m0,[r0]
> >+    pmovzxbw    m1,[r0 + 8]
> >+    psllw       m0, 2
> >+    psllw       m1, 2
> >+    movu        [r2], m0
> >+    movu        [r2 + 16], m1
> >+
> >+    add         r0, mmsize
> >+    add         r2, 2 * mmsize
> >+    sub         r4d, 16
> >+    jz          .end
> >+    cmp         r4d, 15
> >+    jg          .loop16
> >+
> >+    cmp         r4d, 8
> >+    jl          .process4
> >+    pmovzxbw    m0,[r0]
> >+    psllw       m0, 2
> >+    movu        [r2], m0
> >+
> >+    add         r0, 8
> >+    add         r2, mmsize
> >+    sub         r4d, 8
> >+    jz          .end
> >+
> >+.process4:
> >+    cmp         r4d, 4
> >+    jl          .process2
> >+    movd        m0,[r0]
> >+    pmovzxbw    m0,m0
> >+    psllw       m0, 2
> >+    movh        [r2], m0
> >+
> >+    add         r0, 4
> >+    add         r2, 8
> >+    sub         r4d, 4
> >+    jz          .end
> >+
> >+.process2:
> >+    cmp         r4d, 2
> >+    jl          .process1
> >+    movzx       r6d, byte [r0]
> >+    shl         r6d, 2
> >+    mov         [r2], r6w
> >+    movzx       r6d, byte [r0 + 1]
> >+    shl         r6d, 2
> >+    mov         [r2 + 2], r6w
> >+
> >+    add         r0, 2
> >+    add         r2, 4
> >+    sub         r4d, 2
> >+    jz          .end
> >+
> >+.process1:
> >+    movzx       r6d, byte [r0]
> >+    shl         r6d, 2
> >+    mov         [r2], r6w
> >+.end:
> >+    RET
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
With Regards,

Murugan. V
+919659287478
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140324/4c92e1bf/attachment-0001.html>


More information about the x265-devel mailing list