[x265] [PATCH] asm: improve sad[32x32] 10% by unroll loop
Steve Borho
steve at borho.org
Mon Mar 16 23:37:51 CET 2015
On 03/16, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1426539636 25200
> # Node ID 117fb09221983c5f50988741168a216d35e3581a
> # Parent d33fc159951225e42889071ef3d877d23f693197
> asm: improve sad[32x32] 10% by unroll loop
Queued for testing.
> ---
> source/common/x86/sad-a.asm | 25 ++++++++++++++++++-------
> 1 files changed, 18 insertions(+), 7 deletions(-)
>
> diff -r d33fc1599512 -r 117fb0922198 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm Mon Mar 16 12:00:42 2015 -0700
> +++ b/source/common/x86/sad-a.asm Mon Mar 16 14:00:36 2015 -0700
> @@ -3898,9 +3898,11 @@
> RET
>
> INIT_YMM avx2
> -cglobal pixel_sad_32x32, 4,5,5
> +cglobal pixel_sad_32x32, 4,7,5
> xorps m0, m0
> - mov r4d, 16
> + mov r4d, 32/4
> + lea r5, [r1 * 3]
> + lea r6, [r3 * 3]
>
> .loop
> movu m1, [r0] ; row 0 of pix0
> @@ -3913,11 +3915,21 @@
> paddd m0, m1
> paddd m0, m3
>
> - lea r2, [r2 + 2 * r3]
> - lea r0, [r0 + 2 * r1]
> -
> - dec r4d
> - jnz .loop
> + movu m1, [r0 + 2 * r1] ; row 2 of pix0
> + movu m2, [r2 + 2 * r3] ; row 2 of pix1
> + movu m3, [r0 + r5] ; row 3 of pix0
> + movu m4, [r2 + r6] ; row 3 of pix1
> +
> + psadbw m1, m2
> + psadbw m3, m4
> + paddd m0, m1
> + paddd m0, m3
> +
> + lea r2, [r2 + 4 * r3]
> + lea r0, [r0 + 4 * r1]
> +
> + dec r4d
> + jnz .loop
>
> vextracti128 xm1, m0, 1
> paddd xm0, xm1
> @@ -3926,5 +3938,4 @@
> movd eax, xm0
> RET
>
> -
> %endif
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list