[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2
dave
dtyx265 at gmail.com
Wed Apr 29 03:51:31 CEST 2015
On 04/28/2015 06:38 PM, chen wrote:
>
> 在 2015-04-29 09:30:36,dave <dtyx265 at gmail.com> 写道:
>
> On 04/28/2015 06:13 PM, chen wrote:
>>
>> 在 2015-04-29 07:49:46,dave <dtyx265 at gmail.com> 写道:
>>
>> On 04/28/2015 03:32 PM, chen wrote:
>>> Most parts are fine now; just change the handling of r5, see the comment below.
>>>
>>> At 2015-04-29 06:27:27,dtyx265 at gmail.com wrote:
>>> ># HG changeset patch
>>> ># User David T Yuen <dtyx265 at gmail.com>
>>> ># Date 1430259967 25200
>>> ># Node ID 6108fbda1be654a481a78f7ef593518033919674
>>> ># Parent e9df93f380664932e7d6c7e85b2cae16cd5e1dcd
>>> >asm: interp_8tap_horiz pp and ps sse2
>>> >
>>> >This replaces c code and covers
>>> >
>>> >+;----------------------------------------------------------------------------------------------------------------------------
>>> >+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>>> >+;----------------------------------------------------------------------------------------------------------------------------
>>> >+%macro IPFILTER_LUMA_sse2 3
>>> >+INIT_XMM sse2
>>> >+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
>>> >+
>>> >+ mov r4d, r4m
>>> >+ add r4d, r4d
>>> >+ pxor m6, m6
>>> >+%ifdef PIC
>>> >+ lea r6, [tabw_LumaCoeff]
>>> >+ movu m3, [r6 + r4 * 8]
>>> >+%else
>>> >+ movu m3, [tabw_LumaCoeff + r4 * 8]
>>> >+%endif
>>> >+
>>> >+ mov r4d, %2
>>> >+%ifidn %3, pp
>>> >+ mova m2, [pw_32]
>>> >+%else
>>> >+ mova m2, [pw_2000]
>>> >+ add r3d, r3d
>>> >+ cmp r5m, byte 0
>>> If we move the above 2 lines up, we can drop r6 and reuse r5.
>> I am not sure if this can be done. r4 is used to set m3
>> then it is reused and modified depending on r5 and r5 can't
>> be used for something else before it's cmp'ed.
>>> 'mov' and 'lea' do not affect the eflags register
>>
>> but r4 is needed for the lea instruction and r4 is later reused
>> and modified depending on r5. Only one of them can be reused,
>> not both, so r6 is needed.
>>
>>
>> How about below code:
>> pxor m6, m6
>> add r3d, r3d
>>
>> cmp r5m, byte 0
>> %ifdef PIC
>> lea r6, [tabw_LumaCoeff]
>> movu m3, [r6 + r4 * 8]
>> %else
>> movu m3, [tabw_LumaCoeff + r4 * 8]
>> %endif
>> mov r4d, %2
>> %ifidn %3, pp
>> mova m2, [pw_32]
>> %else
>> mova m2, [pw_2000]
>> jnz ...
>
>
>
this works
cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
mov r4d, r4m
add r4d, r4d
pxor m6, m6
%ifidn %3, ps
add r3d, r3d
cmp r5m, byte 0
%endif
%ifdef PIC
lea r5, [tabw_LumaCoeff]
movu m3, [r5 + r4 * 8]
%else
movu m3, [tabw_LumaCoeff + r4 * 8]
%endif
mov r4d, %2
%ifidn %3, pp
mova m2, [pw_32]
%else
mova m2, [pw_2000]
je .loopH
lea r5, [r1 + 2 * r1]
sub r0d, r5d
add r4d, 7
%endif
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150428/84040baa/attachment.html>
More information about the x265-devel
mailing list