[x265] [PATCH] asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN
Deepthi Nandakumar
deepthi at multicorewareinc.com
Fri Mar 13 05:32:07 CET 2015
Thanks, queued
On Fri, Mar 13, 2015 at 9:16 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1426218406 25200
> # Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c
> # Parent 1f125d14f656cfd253bd36c29a111764f007a349
> asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN
> ---
> source/common/x86/ipfilter8.asm | 49
> +++++++++++++++++---------------------
> 2 files changed, 38 insertions(+), 27 deletions(-)
>
> diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Thu Mar 12 13:06:38 2015 -0500
> +++ b/source/common/x86/ipfilter8.asm Thu Mar 12 20:46:46 2015 -0700
> @@ -1749,10 +1749,10 @@
> ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t*
> dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>
> ;-----------------------------------------------------------------------------------------------------------------------------
>
> -%macro IPFILTER_LUMA_PS_4x_AVX2 2
> +%macro IPFILTER_LUMA_PS_4xN_AVX2 1
> INIT_YMM avx2
> %if ARCH_X86_64 == 1
> -cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6
> +cglobal interp_8tap_horiz_ps_4x%1, 6,7,6
> mov r5d, r5m
> mov r4d, r4m
> %ifdef PIC
> @@ -1762,7 +1762,6 @@
> vpbroadcastq m0, [tab_LumaCoeff + r4 *
> 8]
> %endif
> mova m1, [tab_Lm]
> - mov r9d, %2
> ;height
> add r3d, r3d
> vbroadcasti128 m2, [pw_2000]
>
> @@ -1771,17 +1770,17 @@
> ; m1 - shuffle order table
> ; m2 - pw_2000
>
> - xor r10, r10
> ; loop count variable
> sub r0, 3
> test r5d, r5d
> - jz .label
> - lea r8, [r1 * 3]
> ; r8 = (N / 2 - 1) * srcStride
> - sub r0, r8
> ; r0(src)-r8
> - add r9, 4
> ; blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in
> loop)
> -
> -.label
> - add r10, 4
> -
> + mov r5d, %1
> ; loop count variable - height
> + jz .preloop
> + lea r6, [r1 * 3]
> ; r6 = (N / 2 - 1) * srcStride
> + sub r0, r6
> ; r0(src) - 3 * srcStride
> + add r5d, 7
> ; need 7 extra rows (blkheight += N - 1): 4 are handled by one more
> pass of the loop, the last 3 by the tail code after the loop
> +
> +.preloop:
> + lea r6, [r3 * 3]
> +.loop
> ; Row 0-1
> vbroadcasti128 m3, [r0]
> ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> pshufb m3, m1
> ; shuffled based on the col order tab_Lm
> @@ -1807,18 +1806,17 @@
> psubw m3, m2
>
> vextracti128 xm4, m3, 1
> - lea r7, [r3 * 3]
> movq [r2], xm3
> ;row 0
> movhps [r2 + r3], xm3
> ;row 1
> movq [r2 + r3 * 2], xm4
> ;row 2
> - movhps [r2 + r7], xm4
> ;row 3
> + movhps [r2 + r6], xm4
> ;row 3
>
> lea r0, [r0 + r1 * 2]
> ; first loop src ->5th row(i.e 4)
> lea r2, [r2 + r3 * 4]
> ; first loop dst ->5th row(i.e 4)
> - cmp r10, r9
> - jnz .label
> - test r5d, r5d
> - jz .end
> + sub r5d, 4
> + jz .end
> + cmp r5d, 4
> + jge .loop
>
> ; Row 8-9
> vbroadcasti128 m3, [r0]
> ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> @@ -1830,15 +1828,13 @@
> phaddw m3, m4
> ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
>
> ; Row 10
> - lea r0, [r0 + r1 * 2]
> - vbroadcasti128 m4, [r0]
> ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> + vbroadcasti128 m4, [r0 + r1 * 2]
> ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> pshufb m4, m1
> pmaddubsw m4, m0
> phaddw m4, m4
> ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
> phaddw m3, m4
>
> - mova m4, [interp8_hps_shuf]
> - vpermd m3, m4, m3
> + vpermd m3, m5, m3
> ; m5 is not clobbered by the code above
> psubw m3, m2
>
> vextracti128 xm4, m3, 1
> @@ -1846,14 +1842,13 @@
> movhps [r2 + r3], xm3
> movq [r2 + r3 * 2], xm4
> .end
> -RET
> -%endif
> -%endmacro
> -
> -
> - IPFILTER_LUMA_PS_4x_AVX2 4 , 4
> - IPFILTER_LUMA_PS_4x_AVX2 4 , 8
> - IPFILTER_LUMA_PS_4x_AVX2 4 , 16
> + RET
> +%endif
> +%endmacro
> +
> + IPFILTER_LUMA_PS_4xN_AVX2 4
> + IPFILTER_LUMA_PS_4xN_AVX2 8
> + IPFILTER_LUMA_PS_4xN_AVX2 16
>
> %macro IPFILTER_LUMA_PS_8xN_AVX2 1
> ; TODO: verify and enable on X86 mode
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150313/055574ea/attachment-0001.html>
More information about the x265-devel
mailing list