[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

dave dtyx265 at gmail.com
Wed Apr 29 03:51:31 CEST 2015


On 04/28/2015 06:38 PM, chen wrote:
>
> 在 2015-04-29 09:30:36,dave <dtyx265 at gmail.com> 写道:
>
>     On 04/28/2015 06:13 PM, chen wrote:
>>
>>     在 2015-04-29 07:49:46,dave <dtyx265 at gmail.com> 写道:
>>
>>         On 04/28/2015 03:32 PM, chen wrote:
>>>         Most parts are fine now; just one change regarding r5, see the comment below.
>>>
>>>         At 2015-04-29 06:27:27,dtyx265 at gmail.com  wrote:
>>>         ># HG changeset patch
>>>         ># User David T Yuen <dtyx265 at gmail.com>
>>>         ># Date 1430259967 25200
>>>         ># Node ID 6108fbda1be654a481a78f7ef593518033919674
>>>         ># Parent  e9df93f380664932e7d6c7e85b2cae16cd5e1dcd
>>>         >asm: interp_8tap_horiz pp and ps sse2
>>>         >
>>>         >This replaces c code and covers
>>>         >
>>>         >+;----------------------------------------------------------------------------------------------------------------------------
>>>         >+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>>>         >+;----------------------------------------------------------------------------------------------------------------------------
>>>         >+%macro IPFILTER_LUMA_sse2 3
>>>         >+INIT_XMM sse2
>>>         >+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
>>>         >+
>>>         >+    mov       r4d, r4m
>>>         >+    add       r4d, r4d
>>>         >+    pxor      m6, m6
>>>         >+%ifdef PIC
>>>         >+    lea       r6, [tabw_LumaCoeff]
>>>         >+    movu      m3, [r6 + r4 * 8]
>>>         >+%else
>>>         >+    movu      m3, [tabw_LumaCoeff + r4 * 8]
>>>         >+%endif
>>>         >+
>>>         >+    mov       r4d, %2
>>>         >+%ifidn %3, pp
>>>         >+    mova      m2, [pw_32]
>>>         >+%else
>>>         >+    mova      m2, [pw_2000]
>>>         >+    add       r3d, r3d
>>>         >+    cmp       r5m, byte 0
>>>         If we move the above 2 lines up, we can drop r6 and reuse r5.
>>         I am not sure if this can be done.  r4 is used to set m3
>>         then it is reused and modified depending on r5 and r5 can't
>>         be used for something else before it's cmp'ed.
>>>         'mov' and 'lea' do not affect the EFLAGS register
>>
>>     but r4 is needed for the lea instruction and r4 is later reused
>>     and modified depending on r5.  Only one of them can be reused,
>>     not both, so r6 is needed.
>>
>>
>>     How about below code:
>>          pxor      m6, m6
>>          add       r3d, r3d
>>
>>          cmp       r5m, byte 0
>>     %ifdef PIC
>>          lea       r6, [tabw_LumaCoeff]
>>          movu      m3, [r6 + r4 * 8]
>>     %else
>>          movu      m3, [tabw_LumaCoeff + r4 * 8]
>>     %endif
>>          mov       r4d, %2
>>     %ifidn %3, pp
>>          mova      m2, [pw_32]
>>     %else
>>          mova      m2, [pw_2000]
>>         jnz ...
>
>
>
this works
cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
     mov       r4d, r4m
     add       r4d, r4d
     pxor      m6, m6
%ifidn %3, ps
     add       r3d, r3d
     cmp       r5m, byte 0
%endif
%ifdef PIC
     lea       r5, [tabw_LumaCoeff]
     movu      m3, [r5 + r4 * 8]
%else
     movu      m3, [tabw_LumaCoeff + r4 * 8]
%endif

     mov       r4d, %2
%ifidn %3, pp
     mova      m2, [pw_32]
%else
     mova      m2, [pw_2000]
     je        .loopH
     lea       r5, [r1 + 2 * r1]
     sub       r0d, r5d
     add       r4d, 7
%endif

> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150428/84040baa/attachment.html>


More information about the x265-devel mailing list