[x265] [PATCH] asm: improve algorithm logic on saoCuOrgE3

Deepthi Nandakumar deepthi at multicorewareinc.com
Wed Apr 15 12:28:13 CEST 2015


Min, please resend. This conflicts with Divya's patch.

On Wed, Apr 15, 2015 at 11:38 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1429078116 -28800
> # Node ID 677ecdf2ba50e52604e73a1e92ea88ab26e950c1
> # Parent  dd456de98c239b86e29bf349881854a699056240
> asm: improve algorithm logic on saoCuOrgE3
> ---
>  source/common/x86/loopfilter.asm |   40 ++++++++++++-------------------------
>  1 files changed, 13 insertions(+), 27 deletions(-)
>
> diff -r dd456de98c23 -r 677ecdf2ba50 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm  Tue Apr 14 13:41:40 2015 +0800
> +++ b/source/common/x86/loopfilter.asm  Wed Apr 15 14:08:36 2015 +0800
> @@ -456,19 +456,20 @@
>  ;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
>
>  ;=======================================================================================================
>  INIT_XMM sse4
> -cglobal saoCuOrgE3, 3, 7, 8
> +cglobal saoCuOrgE3, 3,6,8
>      mov             r3d, r3m
>      mov             r4d, r4m
>      mov             r5d, r5m
>
> -    mov             r6d, r5d
> -    sub             r6d, r4d
> +    ; save latest 2 pixels for case startX=1 or left_endX=15
> +    movh            m7, [r0 + r5]
> +    movhps          m7, [r1 + r5 - 1]
>
> +    ; move to startX+1
>      inc             r4d
>      add             r0, r4
>      add             r1, r4
> -    movh            m7, [r0 + r6 - 1]
> -    mov             r6, [r1 + r6 - 2]
> +    sub             r5d, r4d
>      pxor            m0, m0                      ; m0 = 0
>      movu            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
>
> @@ -508,30 +509,15 @@
>      packuswb        m2, m1
>      movu            [r0], m2
>
> -    sub             r5d, 16
> -    jle             .end
> +    add             r0, 16
> +    add             r1, 16
>
> -    lea             r0, [r0 + 16]
> -    lea             r1, [r1 + 16]
> +    sub             r5, 16
> +    jg             .loop
>
> -    jnz             .loop
> -
> -.end:
> -    js              .skip
> -    sub             r0, r4
> -    sub             r1, r4
> -    movh            [r0 + 16], m7
> -    mov             [r1 + 15], r6
> -    jmp             .quit
> -
> -.skip:
> -    sub             r0, r4
> -    sub             r1, r4
> -    movh            [r0 + 15], m7
> -    mov             [r1 + 14], r6
> -
> -.quit:
> -
> +    ; restore last pixels (up to 2)
> +    movh            [r0 + r5], m7
> +    movhps          [r1 + r5 - 1], m7
>      RET
>
>
>  ;=====================================================================================
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150415/6e189fd0/attachment.html>


More information about the x265-devel mailing list