[x265] [PATCH] asm: Optimized sad_64xN for better cache performance. Reduced lea instructions by half. Average performance gain is +5x w.r.t. previous asm code
Steve Borho
steve at borho.org
Thu Oct 31 16:06:50 CET 2013
On Thu, Oct 31, 2013 at 5:53 AM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> # Date 1383216695 -19800
> # Thu Oct 31 16:21:35 2013 +0530
> # Node ID 86ff1a3ec89720a73325148e8ac01ec1dbdab3c2
> # Parent 5d6ed411995acd674b838f989385c61039760780
> asm: Optimized sad_64xN for better cache performance. Reduced lea
> instructions by half. Average performance gain is +5x w.r.t. previous
> asm code.
>
This one does not apply, but the function already appears to have been refactored.
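For readers following along: the core of the change is folding the row
stride into the load/psadbw addressing so that two rows are read per
pointer update. A minimal sketch of the idea (illustrative only, not the
actual macro; r0/r2 are the pixel pointers and r1/r3 the strides as in
the patch, and for brevity each row loads one 16-byte column where the
real macro loads four):

    ; before: one lea pair per row
    movu   m1, [r2]
    psadbw m1, [r0]
    lea    r2, [r2 + r3]
    lea    r0, [r0 + r1]
    movu   m2, [r2]
    psadbw m2, [r0]
    lea    r2, [r2 + r3]
    lea    r0, [r0 + r1]

    ; after: address the second row directly, advance by two strides
    movu   m1, [r2]
    psadbw m1, [r0]
    movu   m2, [r2 + r3]
    psadbw m2, [r0 + r1]
    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]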
> diff -r 5d6ed411995a -r 86ff1a3ec897 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm Thu Oct 31 15:10:34 2013 +0530
> +++ b/source/common/x86/sad-a.asm Thu Oct 31 16:21:35 2013 +0530
> @@ -329,38 +329,21 @@
> paddd m3, m4
> paddd m0, m1
> paddd m0, m3
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
>
> - movu m1, [r2]
> - movu m2, [r2 + 16]
> - movu m3, [r2 + 32]
> - movu m4, [r2 + 48]
> - psadbw m1, [r0]
> - psadbw m2, [r0 + 16]
> - psadbw m3, [r0 + 32]
> - psadbw m4, [r0 + 48]
> + movu m1, [r2 + r3]
> + movu m2, [r2 + r3 + 16]
> + movu m3, [r2 + r3 + 32]
> + movu m4, [r2 + r3 + 48]
> + psadbw m1, [r0 + r1]
> + psadbw m2, [r0 + r1 + 16]
> + psadbw m3, [r0 + r1 + 32]
> + psadbw m4, [r0 + r1 + 48]
> paddd m1, m2
> paddd m3, m4
> paddd m0, m1
> paddd m0, m3
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> - movu m1, [r2]
> - movu m2, [r2 + 16]
> - movu m3, [r2 + 32]
> - movu m4, [r2 + 48]
> - psadbw m1, [r0]
> - psadbw m2, [r0 + 16]
> - psadbw m3, [r0 + 32]
> - psadbw m4, [r0 + 48]
> - paddd m1, m2
> - paddd m3, m4
> - paddd m0, m1
> - paddd m0, m3
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> + lea r2, [r2 + 2 * r3]
> + lea r0, [r0 + 2 * r1]
>
> movu m1, [r2]
> movu m2, [r2 + 16]
> @@ -375,6 +358,20 @@
> paddd m0, m1
> paddd m0, m3
>
> + movu m1, [r2 + r3]
> + movu m2, [r2 + r3 + 16]
> + movu m3, [r2 + r3 + 32]
> + movu m4, [r2 + r3 + 48]
> + psadbw m1, [r0 + r1]
> + psadbw m2, [r0 + r1 + 16]
> + psadbw m3, [r0 + r1 + 32]
> + psadbw m4, [r0 + r1 + 48]
> + paddd m1, m2
> + paddd m3, m4
> + paddd m0, m1
> + paddd m0, m3
> + lea r2, [r2 + 2 * r3]
> + lea r0, [r0 + 2 * r1]
> %endmacro
>
> %macro SAD_W16 0
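Worth noting: with these two hunks PROCESS_SAD_64x4 becomes
self-contained. Each expansion consumes exactly 4 rows and leaves r0/r2
pointing at the next 4-row group, using two lea pairs per 4 rows instead
of the previous four (three inside the old macro plus one in every
caller), which matches the "reduced lea instructions by half" claim and
is why the inter-macro lea pairs below can simply be dropped.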
> @@ -660,20 +657,8 @@
> pxor m0, m0
>
> PROCESS_SAD_64x4
> -
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_64x4
> -
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_64x4
> -
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_64x4
>
> movhlps m1, m0
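The 64x16 case now unrolls cleanly with no loop at all:
4 expansions of PROCESS_SAD_64x4 * 4 rows each = 16 rows.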
> @@ -684,27 +669,16 @@
>
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_64x32, 4,4,5
> +cglobal pixel_sad_64x32, 4,5,5
> pxor m0, m0
> - mov r4, 32
> -
> + mov r4, 2
> .loop
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> - sub r4, 8
> - cmp r4, 8
> -
> - jnz .loop
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> PROCESS_SAD_64x4
> + dec r4d
> + jnz .loop
>
> movhlps m1, m0
> paddd m0, m1
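The loop math checks out: 2 iterations * 4 macro expansions * 4 rows =
32 rows. The 64x48 and 64x64 hunks below follow the same pattern
(4 * 3 * 4 = 48 and 4 * 4 * 4 = 64 rows respectively), and in each case
the old sub/cmp pair plus the trailing unrolled copy collapses into a
single dec r4d / jnz.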
> @@ -714,27 +688,15 @@
>
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_64x48, 4,4,5
> +cglobal pixel_sad_64x48, 4,5,5
> pxor m0, m0
> - mov r4, 48
> -
> + mov r4, 4
> .loop
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> - sub r4, 8
> - cmp r4, 8
> -
> - jnz .loop
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> - PROCESS_SAD_64x4
> + dec r4d
> + jnz .loop
>
> movhlps m1, m0
> paddd m0, m1
> @@ -744,27 +706,16 @@
>
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
> -cglobal pixel_sad_64x64, 4,4,5
> +cglobal pixel_sad_64x64, 4,5,5
> pxor m0, m0
> - mov r4, 64
> -
> + mov r4, 4
> .loop
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> -
> - sub r4, 8
> - cmp r4, 8
> -
> - jnz .loop
> PROCESS_SAD_64x4
> - lea r2, [r2 + r3]
> - lea r0, [r0 + r1]
> PROCESS_SAD_64x4
> + dec r4d
> + jnz .loop
>
> movhlps m1, m0
> paddd m0, m1
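One more detail worth calling out: the cglobal declarations grow from
4,4,5 to 4,5,5. The rewritten functions keep their row counter in r4,
and if I read x86inc right the GPR count has to be bumped to 5 so the
prologue actually reserves (and where the ABI requires, saves) that
register; the old code used r4 while declaring only 4 GPRs.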
--
Steve Borho