[x265] [PATCH] asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines

Steve Borho steve at borho.org
Thu Oct 31 16:07:53 CET 2013


On Thu, Oct 31, 2013 at 6:18 AM, <dnyaneshwar at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> # Date 1383218218 -19800
> #      Thu Oct 31 16:46:58 2013 +0530
> # Node ID 515b0af5eb805407d40ead87fd29a8c32118d3a2
> # Parent  86ff1a3ec89720a73325148e8ac01ec1dbdab3c2
> asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines
>

This patch doesn't apply either.


> diff -r 86ff1a3ec897 -r 515b0af5eb80 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm       Thu Oct 31 16:21:35 2013 +0530
> +++ b/source/common/x86/sad-a.asm       Thu Oct 31 16:46:58 2013 +0530
> @@ -175,39 +175,37 @@
>  %macro PROCESS_SAD_24x4 0
>      movu        m1,  [r2]
>      movq        m2,  [r2 + 16]
> -    lea         r2,  [r2 + r3]
> -    movu        m3,  [r2]
> -    movq        m4,  [r2 + 16]
> +    movu        m3,  [r2 + r3]
> +    movq        m4,  [r2 + r3 + 16]
>      psadbw      m1,  [r0]
>      psadbw      m3,  [r0 + r1]
>      paddd       m0,  m1
>      paddd       m0,  m3
>      movq        m1,  [r0 + 16]
> -    lea         r0,  [r0 + r1]
> -    movq        m3,  [r0 + 16]
> +    movq        m3,  [r0 + r1 + 16]
>      punpcklqdq  m2,  m4
>      punpcklqdq  m1,  m3
>      psadbw      m2, m1
>      paddd       m0, m2
> -    lea         r2,  [r2 + r3]
> -    lea         r0,  [r0 + r1]
> +    lea         r2,  [r2 + 2 * r3]
> +    lea         r0,  [r0 + 2 * r1]
>
>      movu        m1,  [r2]
>      movq        m2,  [r2 + 16]
> -    lea         r2,  [r2 + r3]
> -    movu        m3,  [r2]
> -    movq        m4,  [r2 + 16]
> +    movu        m3,  [r2 + r3]
> +    movq        m4,  [r2 + r3 + 16]
>      psadbw      m1,  [r0]
>      psadbw      m3,  [r0 + r1]
>      paddd       m0,  m1
>      paddd       m0,  m3
>      movq        m1,  [r0 + 16]
> -    lea         r0,  [r0 + r1]
> -    movq        m3,  [r0 + 16]
> +    movq        m3,  [r0 + r1 + 16]
>      punpcklqdq  m2,  m4
>      punpcklqdq  m1,  m3
>      psadbw      m2, m1
>      paddd       m0, m2
> +    lea         r2,  [r2 + 2 * r3]
> +    lea         r0,  [r0 + 2 * r1]
>  %endmacro
>
>  %macro PROCESS_SAD_32x4 0
> @@ -255,8 +253,18 @@
>      paddd   m1,  m2
>      paddd   m0,  m1
>      paddd   m0,  m3
> -    lea     r2,  [r2 + r3]
> -    lea     r0,  [r0 + r1]
> +
> +    movu    m1,  [r2 + r3]
> +    movu    m2,  [r2 + r3 + 16]
> +    movu    m3,  [r2 + r3 + 32]
> +    psadbw  m1,  [r0 + r1]
> +    psadbw  m2,  [r0 + r1 + 16]
> +    psadbw  m3,  [r0 + r1 + 32]
> +    paddd   m1,  m2
> +    paddd   m0,  m1
> +    paddd   m0,  m3
> +    lea     r2,  [r2 + 2 * r3]
> +    lea     r0,  [r0 + 2 * r1]
>
>      movu    m1,  [r2]
>      movu    m2,  [r2 + 16]
> @@ -267,30 +275,18 @@
>      paddd   m1,  m2
>      paddd   m0,  m1
>      paddd   m0,  m3
> -    lea     r2,  [r2 + r3]
> -    lea     r0,  [r0 + r1]
>
> -    movu    m1,  [r2]
> -    movu    m2,  [r2 + 16]
> -    movu    m3,  [r2 + 32]
> -    psadbw  m1,  [r0]
> -    psadbw  m2,  [r0 + 16]
> -    psadbw  m3,  [r0 + 32]
> +    movu    m1,  [r2 + r3]
> +    movu    m2,  [r2 + r3 + 16]
> +    movu    m3,  [r2 + r3 + 32]
> +    psadbw  m1,  [r0 + r1]
> +    psadbw  m2,  [r0 + r1 + 16]
> +    psadbw  m3,  [r0 + r1 + 32]
>      paddd   m1,  m2
>      paddd   m0,  m1
>      paddd   m0,  m3
> -    lea     r2,  [r2 + r3]
> -    lea     r0,  [r0 + r1]
> -
> -    movu    m1,  [r2]
> -    movu    m2,  [r2 + 16]
> -    movu    m3,  [r2 + 32]
> -    psadbw  m1,  [r0]
> -    psadbw  m2,  [r0 + 16]
> -    psadbw  m3,  [r0 + 32]
> -    paddd   m1,  m2
> -    paddd   m0,  m1
> -    paddd   m0,  m3
> +    lea     r2,  [r2 + 2 * r3]
> +    lea     r0,  [r0 + 2 * r1]
>  %endmacro
>
>  %macro PROCESS_SAD_8x4 0
> @@ -725,27 +721,17 @@
>
>  ;-----------------------------------------------------------------------------
>  ; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
>  ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_48x64, 4,4,5
> +cglobal pixel_sad_48x64, 4,5,5
>      pxor  m0,  m0
> -    mov   r4,  64
> +    mov   r4d, 4
>
>  .loop
>      PROCESS_SAD_48x4
> -    lea     r2,  [r2 + r3]
> -    lea     r0,  [r0 + r1]
> -
>      PROCESS_SAD_48x4
> -    lea     r2,  [r2 + r3]
> -    lea     r0,  [r0 + r1]
> -
> -    sub   r4,  8
> -    cmp   r4,  8
> -
> -jnz .loop
>      PROCESS_SAD_48x4
> -    lea   r2,  [r2 + r3]
> -    lea   r0,  [r0 + r1]
>      PROCESS_SAD_48x4
> +    dec  r4d
> +    jnz .loop
>
>      movhlps m1,  m0
>      paddd   m0,  m1
> @@ -755,24 +741,17 @@
>
>  ;-----------------------------------------------------------------------------
>  ; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
>  ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_24x32, 4,4,4
> +cglobal pixel_sad_24x32, 4,5,4
>      pxor  m0,  m0
> -    mov   r4,  32
> +    mov   r4d, 2
>
>  .loop
>      PROCESS_SAD_24x4
> -    lea         r2,  [r2 + r3]
> -    lea         r0,  [r0 + r1]
>      PROCESS_SAD_24x4
> -    lea         r2,  [r2 + r3]
> -    lea         r0,  [r0 + r1]
> -    sub   r4,  8
> -    cmp   r4,  8
> +    PROCESS_SAD_24x4
> +    PROCESS_SAD_24x4
> +    dec  r4d
>  jnz .loop
> -    PROCESS_SAD_24x4
> -    lea         r2,  [r2 + r3]
> -    lea         r0,  [r0 + r1]
> -    PROCESS_SAD_24x4
>
>      movhlps m1,  m0
>      paddd   m0,  m1
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131031/4e444517/attachment-0001.html>


More information about the x265-devel mailing list