[x265] [PATCH] asm: improve algorithm logic on saoCuOrgE3

Deepthi Nandakumar deepthi at multicorewareinc.com
Wed Apr 15 12:31:31 CEST 2015


Sorry, I just realised that Steve had already pushed this.

On Wed, Apr 15, 2015 at 3:58 PM, Deepthi Nandakumar <
deepthi at multicorewareinc.com> wrote:

> Min, please resend. This conflicts with Divya's patch.
>
> On Wed, Apr 15, 2015 at 11:38 AM, Min Chen <chenm003 at 163.com> wrote:
>
>> # HG changeset patch
>> # User Min Chen <chenm003 at 163.com>
>> # Date 1429078116 -28800
>> # Node ID 677ecdf2ba50e52604e73a1e92ea88ab26e950c1
>> # Parent  dd456de98c239b86e29bf349881854a699056240
>> asm: improve algorithm logic on saoCuOrgE3
>> ---
>>  source/common/x86/loopfilter.asm |   40 ++++++++++++-------------------------
>>  1 files changed, 13 insertions(+), 27 deletions(-)
>>
>> diff -r dd456de98c23 -r 677ecdf2ba50 source/common/x86/loopfilter.asm
>> --- a/source/common/x86/loopfilter.asm  Tue Apr 14 13:41:40 2015 +0800
>> +++ b/source/common/x86/loopfilter.asm  Wed Apr 15 14:08:36 2015 +0800
>> @@ -456,19 +456,20 @@
>>  ;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
>>
>>  ;=======================================================================================================
>>  INIT_XMM sse4
>> -cglobal saoCuOrgE3, 3, 7, 8
>> +cglobal saoCuOrgE3, 3,6,8
>>      mov             r3d, r3m
>>      mov             r4d, r4m
>>      mov             r5d, r5m
>>
>> -    mov             r6d, r5d
>> -    sub             r6d, r4d
>> +    ; save latest 2 pixels for case startX=1 or left_endX=15
>> +    movh            m7, [r0 + r5]
>> +    movhps          m7, [r1 + r5 - 1]
>>
>> +    ; move to startX+1
>>      inc             r4d
>>      add             r0, r4
>>      add             r1, r4
>> -    movh            m7, [r0 + r6 - 1]
>> -    mov             r6, [r1 + r6 - 2]
>> +    sub             r5d, r4d
>>      pxor            m0, m0                      ; m0 = 0
>>      movu            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
>>
>> @@ -508,30 +509,15 @@
>>      packuswb        m2, m1
>>      movu            [r0], m2
>>
>> -    sub             r5d, 16
>> -    jle             .end
>> +    add             r0, 16
>> +    add             r1, 16
>>
>> -    lea             r0, [r0 + 16]
>> -    lea             r1, [r1 + 16]
>> +    sub             r5, 16
>> +    jg             .loop
>>
>> -    jnz             .loop
>> -
>> -.end:
>> -    js              .skip
>> -    sub             r0, r4
>> -    sub             r1, r4
>> -    movh            [r0 + 16], m7
>> -    mov             [r1 + 15], r6
>> -    jmp             .quit
>> -
>> -.skip:
>> -    sub             r0, r4
>> -    sub             r1, r4
>> -    movh            [r0 + 15], m7
>> -    mov             [r1 + 14], r6
>> -
>> -.quit:
>> -
>> +    ; restore last pixels (up to 2)
>> +    movh            [r0 + r5], m7
>> +    movhps          [r1 + r5 - 1], m7
>>      RET
>>
>>
>>  ;=====================================================================================
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150415/f352041c/attachment.html>


More information about the x265-devel mailing list