[x265] [PATCH] asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN

Deepthi Nandakumar deepthi at multicorewareinc.com
Fri Mar 13 05:32:07 CET 2015


Thanks, queued

On Fri, Mar 13, 2015 at 9:16 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1426218406 25200
> # Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c
> # Parent  1f125d14f656cfd253bd36c29a111764f007a349
> asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN
> ---
>  source/common/x86/ipfilter8.asm |   49
> +++++++++++++++++---------------------
>  2 files changed, 38 insertions(+), 27 deletions(-)
>
> diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Thu Mar 12 13:06:38 2015 -0500
> +++ b/source/common/x86/ipfilter8.asm   Thu Mar 12 20:46:46 2015 -0700
> @@ -1749,10 +1749,10 @@
>  ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t*
> dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>
>  ;-----------------------------------------------------------------------------------------------------------------------------
>
> -%macro IPFILTER_LUMA_PS_4x_AVX2 2
> +%macro IPFILTER_LUMA_PS_4xN_AVX2 1
>  INIT_YMM avx2
>  %if ARCH_X86_64 == 1
> -cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6
> +cglobal interp_8tap_horiz_ps_4x%1, 6,7,6
>      mov                         r5d,               r5m
>      mov                         r4d,               r4m
>  %ifdef PIC
> @@ -1762,7 +1762,6 @@
>      vpbroadcastq                m0,                [tab_LumaCoeff + r4 *
> 8]
>  %endif
>      mova                        m1,                [tab_Lm]
> -    mov                         r9d,               %2
>        ;height
>      add                         r3d,               r3d
>      vbroadcasti128              m2,                [pw_2000]
>
> @@ -1771,17 +1770,17 @@
>      ; m1 - shuffle order table
>      ; m2 - pw_2000
>
> -    xor                         r10,               r10
>       ; loop count variable
>      sub                         r0,                3
>      test                        r5d,               r5d
> -    jz                          .label
> -    lea                         r8,                [r1 * 3]
>        ; r8 = (N / 2 - 1) * srcStride
> -    sub                         r0,                r8
>        ; r0(src)-r8
> -    add                         r9,                4
>       ; blkheight += N - 1  (7 - 3 = 4 ; since the last three rows not in
> loop)
> -
> -.label
> -      add                       r10,               4
> -
> +    mov                         r5d,               %1
>        ; loop count variable - height
> +    jz                         .preloop
> +    lea                         r6,                [r1 * 3]
>        ; r6 = (N / 2 - 1) * srcStride
> +    sub                         r0,                r6
>        ; r0(src) - 3 * srcStride
> +    add                         r5d,               7
>       ; need 7 extra rows (blkheight += N - 1); the 3 rows left over after
> the 4-row loop act as a flag that selects the 3-row tail below
> +
> +.preloop:
> +    lea                         r6,                [r3 * 3]
> +.loop
>      ; Row 0-1
>      vbroadcasti128              m3,                [r0]
>        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
>      pshufb                      m3,                m1
>        ; shuffled based on the col order tab_Lm
> @@ -1807,18 +1806,17 @@
>      psubw                       m3,                m2
>
>      vextracti128                xm4,               m3,               1
> -    lea                         r7,                [r3 * 3]
>      movq                        [r2],              xm3
>       ;row 0
>      movhps                      [r2 + r3],         xm3
>       ;row 1
>      movq                        [r2 + r3 * 2],     xm4
>       ;row 2
> -    movhps                      [r2 + r7],         xm4
>       ;row 3
> +    movhps                      [r2 + r6],         xm4
>       ;row 3
>
>      lea                         r0,                [r0 + r1 * 2]
>       ; first loop src ->5th row(i.e 4)
>      lea                         r2,                [r2 + r3 * 4]
>       ; first loop dst ->5th row(i.e 4)
> -    cmp                         r10,               r9
> -    jnz                         .label
> -    test                        r5d,               r5d
> -    jz                          .end
> +    sub                         r5d,               4
> +    jz                         .end
> +    cmp                         r5d,               4
> +    jge                        .loop
>
>      ; Row 8-9
>      vbroadcasti128              m3,                [r0]
>        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> @@ -1830,15 +1828,13 @@
>      phaddw                      m3,                m4
>        ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
>
>      ; Row 10
> -    lea                         r0,                [r0 + r1 * 2]
> -    vbroadcasti128              m4,                [r0]
>        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> +    vbroadcasti128              m4,                [r0 + r1 * 2]
>       ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
>      pshufb                      m4,                m1
>      pmaddubsw                   m4,                m0
>      phaddw                      m4,                m4
>        ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
>      phaddw                      m3,                m4
>
> -    mova                        m4,                [interp8_hps_shuf]
> -    vpermd                      m3,                m4,            m3
> +    vpermd                      m3,                m5,            m3
>       ; m5 is not clobbered by the code above
>      psubw                       m3,                m2
>
>      vextracti128                xm4,               m3,            1
> @@ -1846,14 +1842,13 @@
>      movhps                      [r2 + r3],         xm3
>      movq                        [r2 + r3 * 2],     xm4
>  .end
> -RET
> -%endif
> -%endmacro
> -
> -
> -    IPFILTER_LUMA_PS_4x_AVX2 4 , 4
> -    IPFILTER_LUMA_PS_4x_AVX2 4 , 8
> -    IPFILTER_LUMA_PS_4x_AVX2 4 , 16
> +    RET
> +%endif
> +%endmacro
> +
> +    IPFILTER_LUMA_PS_4xN_AVX2 4
> +    IPFILTER_LUMA_PS_4xN_AVX2 8
> +    IPFILTER_LUMA_PS_4xN_AVX2 16
>
>  %macro IPFILTER_LUMA_PS_8xN_AVX2 1
>  ; TODO: verify and enable on X86 mode
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150313/055574ea/attachment-0001.html>


More information about the x265-devel mailing list