[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

dave dtyx265 at gmail.com
Wed Apr 29 01:49:46 CEST 2015


On 04/28/2015 03:32 PM, chen wrote:
> Most parts are fine now; just one change regarding r5, see the comment below
>
> At 2015-04-29 06:27:27,dtyx265 at gmail.com wrote:
> ># HG changeset patch
> ># User David T Yuen <dtyx265 at gmail.com>
> ># Date 1430259967 25200
> ># Node ID 6108fbda1be654a481a78f7ef593518033919674
> ># Parent  e9df93f380664932e7d6c7e85b2cae16cd5e1dcd
> >asm: interp_8tap_horiz pp and ps sse2
> >
> >This replaces c code and covers
> >
> >+;----------------------------------------------------------------------------------------------------------------------------
> >+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> >+;----------------------------------------------------------------------------------------------------------------------------
> >+%macro IPFILTER_LUMA_sse2 3
> >+INIT_XMM sse2
> >+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
> >+
> >+    mov       r4d, r4m
> >+    add       r4d, r4d
> >+    pxor      m6, m6
> >+%ifdef PIC
> >+    lea       r6, [tabw_LumaCoeff]
> >+    movu      m3, [r6 + r4 * 8]
> >+%else
> >+    movu      m3, [tabw_LumaCoeff + r4 * 8]
> >+%endif
> >+
> >+    mov       r4d, %2
> >+%ifidn %3, pp
> >+    mova      m2, [pw_32]
> >+%else
> >+    mova      m2, [pw_2000]
> >+    add       r3d, r3d
> >+    cmp       r5m, byte 0
> If we move the above 2 lines up, we can drop r6 and reuse r5.
I am not sure this can be done.  r4 is used to set m3, then it is
reused and modified depending on r5, and r5 can't be used for something
else before it's cmp'ed.
> >+    je        .loopH
> >+    lea       r6, [r1 + 2 * r1]
> >+    sub       r0d, r6d
> >+    add       r4d, 7
> >+%endif
> >+
> >+.loopH:
> >+%assign x 0
> >+%rep %1 / 8
> >+    FILTER_H8_W8_sse2
> >+  %ifidn %3, pp
> >+    paddw     m1, m2
> >+    psraw     m1, 6
> >+    packuswb  m1, m1
> >+    movh      [r2 + x], m1
> >+  %else
> >+    psubw     m1, m2
> >+    movu      [r2 + 2 * x], m1
> >+  %endif
> >+%assign x x+8
> >+%endrep
> >+
> >+%rep (%1 % 8) / 4
> >+    FILTER_H8_W4_sse2
> >+  %ifidn %3, pp
> >+    paddw     m1, m2
> >+    psraw     m1, 6
> >+    packuswb  m1, m1
> >+    movd      [r2 + x], m1
> >+  %else
> >+    psubw     m1, m2
> >+    movh      [r2 + 2 * x], m1
> >+  %endif
> >+%endrep
> >+
> >+    add       r0d, r1d
> >+    add       r2d, r3d
> >+
> >+    dec       r4d
> >+    jnz       .loopH
> >+    RET
> >+
> >+%endmacro
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150428/865f5cba/attachment.html>


More information about the x265-devel mailing list