[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

chen chenm003 at 163.com
Wed Apr 29 00:32:24 CEST 2015


Most parts are fine now; only the handling of r5 needs to be modified. See the comment below.



At 2015-04-29 06:27:27,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen dtyx265 at gmail.com>
># Date 1430259967 25200
># Node ID 6108fbda1be654a481a78f7ef593518033919674
># Parent  e9df93f380664932e7d6c7e85b2cae16cd5e1dcd
>asm: interp_8tap_horiz pp and ps sse2
>
>This replaces c code and covers
>
>+;----------------------------------------------------------------------------------------------------------------------------
>+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>+;----------------------------------------------------------------------------------------------------------------------------
>+%macro IPFILTER_LUMA_sse2 3
>+INIT_XMM sse2
>+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
>+
>+    mov       r4d, r4m
>+    add       r4d, r4d
>+    pxor      m6, m6
>+%ifdef PIC
>+    lea       r6, [tabw_LumaCoeff]
>+    movu      m3, [r6 + r4 * 8]
>+%else
>+    movu      m3, [tabw_LumaCoeff + r4 * 8]
>+%endif
>+
>+    mov       r4d, %2
>+%ifidn %3, pp
>+    mova      m2, [pw_32]
>+%else
>+    mova      m2, [pw_2000]
>+    add       r3d, r3d
>+    cmp       r5m, byte 0
If we move the two lines above further up, we can eliminate r6 and reuse r5 instead.

>+    je        .loopH
>+    lea       r6, [r1 + 2 * r1]
>+    sub       r0d, r6d
>+    add       r4d, 7
>+%endif
>+
>+.loopH:
>+%assign x 0
>+%rep %1 / 8
>+    FILTER_H8_W8_sse2
>+  %ifidn %3, pp
>+    paddw     m1, m2
>+    psraw     m1, 6
>+    packuswb  m1, m1
>+    movh      [r2 + x], m1
>+  %else
>+    psubw     m1, m2
>+    movu      [r2 + 2 * x], m1
>+  %endif
>+%assign x x+8
>+%endrep
>+
>+%rep (%1 % 8) / 4
>+    FILTER_H8_W4_sse2
>+  %ifidn %3, pp
>+    paddw     m1, m2
>+    psraw     m1, 6
>+    packuswb  m1, m1
>+    movd      [r2 + x], m1
>+  %else
>+    psubw     m1, m2
>+    movh      [r2 + 2 * x], m1
>+  %endif
>+%endrep
>+
>+    add       r0d, r1d
>+    add       r2d, r3d
>+
>+    dec       r4d
>+    jnz       .loopH
>+    RET
>+
>+%endmacro
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150429/c559a735/attachment-0001.html>


More information about the x265-devel mailing list