[x265] [PATCH] asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines
Steve Borho
steve at borho.org
Thu Oct 31 16:07:53 CET 2013
On Thu, Oct 31, 2013 at 6:18 AM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> # Date 1383218218 -19800
> # Thu Oct 31 16:46:58 2013 +0530
> # Node ID 515b0af5eb805407d40ead87fd29a8c32118d3a2
> # Parent 86ff1a3ec89720a73325148e8ac01ec1dbdab3c2
> asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines
>
This patch doesn't apply either.
> diff -r 86ff1a3ec897 -r 515b0af5eb80 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm Thu Oct 31 16:21:35 2013 +0530
> +++ b/source/common/x86/sad-a.asm Thu Oct 31 16:46:58 2013 +0530
> @@ -175,39 +175,37 @@
> %macro PROCESS_SAD_24x4 0
> movu m1, [r2]
> movq m2, [r2 + 16]
> - lea r2, [r2 + r3]
> - movu m3, [r2]
> - movq m4, [r2 + 16]
> + movu m3, [r2 + r3]
> + movq m4, [r2 + r3 + 16]
> psadbw m1, [r0]
> psadbw m3, [r0 + r1]
> paddd m0, m1
> paddd m0, m3
> movq m1, [r0 + 16]
> - lea r0, [r0 + r1]
> - movq m3, [r0 + 16]
> + movq m3, [r0 + r1 + 16]
> punpcklqdq m2, m4
> punpcklqdq m1, m3
> psadbw m2, m1
> paddd m0, m2
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> + lea r2, [r2 + 2 * r3]
> + lea r0, [r0 + 2 * r1]
>
> movu m1, [r2]
> movq m2, [r2 + 16]
> - lea r2, [r2 + r3]
> - movu m3, [r2]
> - movq m4, [r2 + 16]
> + movu m3, [r2 + r3]
> + movq m4, [r2 + r3 + 16]
> psadbw m1, [r0]
> psadbw m3, [r0 + r1]
> paddd m0, m1
> paddd m0, m3
> movq m1, [r0 + 16]
> - lea r0, [r0 + r1]
> - movq m3, [r0 + 16]
> + movq m3, [r0 + r1 + 16]
> punpcklqdq m2, m4
> punpcklqdq m1, m3
> psadbw m2, m1
> paddd m0, m2
> + lea r2, [r2 + 2 * r3]
> + lea r0, [r0 + 2 * r1]
> %endmacro
>
> %macro PROCESS_SAD_32x4 0
> @@ -255,8 +253,18 @@
> paddd m1, m2
> paddd m0, m1
> paddd m0, m3
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> +
> + movu m1, [r2 + r3]
> + movu m2, [r2 + r3 + 16]
> + movu m3, [r2 + r3 + 32]
> + psadbw m1, [r0 + r1]
> + psadbw m2, [r0 + r1 + 16]
> + psadbw m3, [r0 + r1 + 32]
> + paddd m1, m2
> + paddd m0, m1
> + paddd m0, m3
> + lea r2, [r2 + 2 * r3]
> + lea r0, [r0 + 2 * r1]
>
> movu m1, [r2]
> movu m2, [r2 + 16]
> @@ -267,30 +275,18 @@
> paddd m1, m2
> paddd m0, m1
> paddd m0, m3
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
>
> - movu m1, [r2]
> - movu m2, [r2 + 16]
> - movu m3, [r2 + 32]
> - psadbw m1, [r0]
> - psadbw m2, [r0 + 16]
> - psadbw m3, [r0 + 32]
> + movu m1, [r2 + r3]
> + movu m2, [r2 + r3 + 16]
> + movu m3, [r2 + r3 + 32]
> + psadbw m1, [r0 + r1]
> + psadbw m2, [r0 + r1 + 16]
> + psadbw m3, [r0 + r1 + 32]
> paddd m1, m2
> paddd m0, m1
> paddd m0, m3
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> - movu m1, [r2]
> - movu m2, [r2 + 16]
> - movu m3, [r2 + 32]
> - psadbw m1, [r0]
> - psadbw m2, [r0 + 16]
> - psadbw m3, [r0 + 32]
> - paddd m1, m2
> - paddd m0, m1
> - paddd m0, m3
> + lea r2, [r2 + 2 * r3]
> + lea r0, [r0 + 2 * r1]
> %endmacro
>
> %macro PROCESS_SAD_8x4 0
> @@ -725,27 +721,17 @@
>
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
>
> ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_48x64, 4,4,5
> +cglobal pixel_sad_48x64, 4,5,5
> pxor m0, m0
> - mov r4, 64
> + mov r4d, 4
>
> .loop
> PROCESS_SAD_48x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_48x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> - sub r4, 8
> - cmp r4, 8
> -
> -jnz .loop
> PROCESS_SAD_48x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> PROCESS_SAD_48x4
> + dec r4d
> + jnz .loop
>
> movhlps m1, m0
> paddd m0, m1
> @@ -755,24 +741,17 @@
>
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
>
> ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_24x32, 4,4,4
> +cglobal pixel_sad_24x32, 4,5,4
> pxor m0, m0
> - mov r4, 32
> + mov r4d, 2
>
> .loop
> PROCESS_SAD_24x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> PROCESS_SAD_24x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> - sub r4, 8
> - cmp r4, 8
> + PROCESS_SAD_24x4
> + PROCESS_SAD_24x4
> + dec r4d
> jnz .loop
> - PROCESS_SAD_24x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> - PROCESS_SAD_24x4
>
> movhlps m1, m0
> paddd m0, m1
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131031/4e444517/attachment-0001.html>
More information about the x265-devel mailing list