[x265] [PATCH] asm: improve avx2 code sub_ps[32x32] 1402 -> 1360
Steve Borho
steve at borho.org
Mon Apr 13 22:50:40 CEST 2015
On 04/08, sumalatha at multicorewareinc.com wrote:
> # HG changeset patch
> # User Sumalatha Polureddy
> # Date 1428486008 -19800
> # Wed Apr 08 15:10:08 2015 +0530
> # Node ID 4819d554dbbc63e6881bd8eee9d61a93320197f2
> # Parent 3e416dec8024b8339b18568cf65e48eb3448bed1
> asm: improve avx2 code sub_ps[32x32] 1402 -> 1360
queued
> diff -r 3e416dec8024 -r 4819d554dbbc source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Apr 07 16:00:39 2015 -0500
> +++ b/source/common/x86/pixel-util8.asm Wed Apr 08 15:10:08 2015 +0530
> @@ -4686,10 +4686,14 @@
> ;-----------------------------------------------------------------------------
> ; void pixel_sub_ps_32x32(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
> ;-----------------------------------------------------------------------------
> +%if ARCH_X86_64
> INIT_YMM avx2
> -cglobal pixel_sub_ps_32x32, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1
> - mov r6d, 4
> - add r1, r1
> +cglobal pixel_sub_ps_32x32, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
> + mov r6d, 4
> + add r1, r1
> + lea r7, [r4 * 3]
> + lea r8, [r5 * 3]
> + lea r9, [r1 * 3]
>
> .loop:
> pmovzxbw m0, [r2]
> @@ -4714,8 +4718,43 @@
> movu [r0 + r1], m0
> movu [r0 + r1 + 32], m1
>
> - add r2, r4
> - add r3, r5
> + pmovzxbw m0, [r2 + 2 * r4]
> + pmovzxbw m1, [r2 + 2 * r4 + 16]
> + pmovzxbw m2, [r3 + 2 * r5]
> + pmovzxbw m3, [r3 + 2 * r5 + 16]
> +
> + psubw m0, m2
> + psubw m1, m3
> +
> +    movu           [r0 + r1 * 2], m0
> + movu [r0 + r1 * 2 + 32], m1
> +
> + pmovzxbw m0, [r2 + r7]
> + pmovzxbw m1, [r2 + r7 + 16]
> + pmovzxbw m2, [r3 + r8]
> + pmovzxbw m3, [r3 + r8 + 16]
> +
> +
> + psubw m0, m2
> + psubw m1, m3
> +
> + movu [r0 + r9], m0
> +    movu           [r0 + r9 + 32], m1
> +
> + lea r2, [r2 + r4 * 4]
> + lea r3, [r3 + r5 * 4]
> + lea r0, [r0 + r1 * 4]
> +
> + pmovzxbw m0, [r2]
> + pmovzxbw m1, [r2 + 16]
> + pmovzxbw m2, [r3]
> + pmovzxbw m3, [r3 + 16]
> +
> + psubw m0, m2
> + psubw m1, m3
> +
> + movu [r0 ], m0
> + movu [r0 + 32], m1
>
> pmovzxbw m0, [r2 + r4]
> pmovzxbw m1, [r2 + r4 + 16]
> @@ -4724,94 +4763,40 @@
>
> psubw m0, m2
> psubw m1, m3
> - lea r0, [r0 + r1 * 2]
> -
> - movu [r0 ], m0
> - movu [r0 + 32], m1
> -
> - add r2, r4
> - add r3, r5
> -
> - pmovzxbw m0, [r2 + r4]
> - pmovzxbw m1, [r2 + r4 + 16]
> - pmovzxbw m2, [r3 + r5]
> - pmovzxbw m3, [r3 + r5 + 16]
> -
> +
> + movu [r0 + r1], m0
> + movu [r0 + r1 + 32], m1
> +
> + pmovzxbw m0, [r2 + 2 * r4]
> + pmovzxbw m1, [r2 + 2 * r4 + 16]
> + pmovzxbw m2, [r3 + 2 * r5]
> + pmovzxbw m3, [r3 + 2 * r5 + 16]
>
> psubw m0, m2
> psubw m1, m3
> - add r0, r1
> -
> - movu [r0 ], m0
> - movu [r0 + 32], m1
> -
> - add r2, r4
> - add r3, r5
> -
> - pmovzxbw m0, [r2 + r4]
> - pmovzxbw m1, [r2 + r4 + 16]
> - pmovzxbw m2, [r3 + r5]
> - pmovzxbw m3, [r3 + r5 + 16]
> +
> + movu [r0 + r1 * 2], m0
> + movu [r0 + r1 * 2 + 32], m1
> +
> + pmovzxbw m0, [r2 + r7]
> + pmovzxbw m1, [r2 + r7 + 16]
> + pmovzxbw m2, [r3 + r8]
> + pmovzxbw m3, [r3 + r8 + 16]
>
> psubw m0, m2
> psubw m1, m3
> - add r0, r1
> -
> - movu [r0 ], m0
> - movu [r0 + 32], m1
> -
> - add r2, r4
> - add r3, r5
> -
> - pmovzxbw m0, [r2 + r4]
> - pmovzxbw m1, [r2 + r4 + 16]
> - pmovzxbw m2, [r3 + r5]
> - pmovzxbw m3, [r3 + r5 + 16]
> -
> - psubw m0, m2
> - psubw m1, m3
> - add r0, r1
> -
> - movu [r0 ], m0
> - movu [r0 + 32], m1
> -
> - add r2, r4
> - add r3, r5
> -
> - pmovzxbw m0, [r2 + r4]
> - pmovzxbw m1, [r2 + r4 + 16]
> - pmovzxbw m2, [r3 + r5]
> - pmovzxbw m3, [r3 + r5 + 16]
> -
> - psubw m0, m2
> - psubw m1, m3
> - add r0, r1
> -
> - movu [r0 ], m0
> - movu [r0 + 32], m1
> -
> - add r2, r4
> - add r3, r5
> -
> - pmovzxbw m0, [r2 + r4]
> - pmovzxbw m1, [r2 + r4 + 16]
> - pmovzxbw m2, [r3 + r5]
> - pmovzxbw m3, [r3 + r5 + 16]
> -
> - psubw m0, m2
> - psubw m1, m3
> - add r0, r1
> -
> - movu [r0 ], m0
> - movu [r0 + 32], m1
> -
> - lea r0, [r0 + r1]
> - lea r2, [r2 + r4 * 2]
> - lea r3, [r3 + r5 * 2]
> +
> + movu [r0 + r9], m0
> + movu [r0 + r9 + 32], m1
> +
> + lea r0, [r0 + r1 * 4]
> + lea r2, [r2 + r4 * 4]
> + lea r3, [r3 + r5 * 4]
>
> dec r6d
> jnz .loop
> RET
> +%endif
>
> ;-----------------------------------------------------------------------------
> ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list