[x265] [PATCH 13 of 13] x86 : AVX2 Refactor luma_hps 8xN and 24x32

Ashok Kumar Mishra ashok at multicorewareinc.com
Fri Jul 6 14:08:15 CEST 2018


On Fri, Jul 6, 2018 at 2:48 PM, <vignesh at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Jayashree <jayashree.c at multicorewareinc.com>
> # Date 1524473214 -19800
> #      Mon Apr 23 14:16:54 2018 +0530
> # Node ID 0106f9f2f867ee20893a317e98c60e9ca626e7d2
> # Parent  52ec2b63b870a1ec0c4ce45abc89ac522ade3c1f
> x86 : AVX2 Refactor luma_hps 8xN and 24x32
>
> diff -r 52ec2b63b870 -r 0106f9f2f867 source/common/x86/h-ipfilter16.asm
> --- a/source/common/x86/h-ipfilter16.asm        Fri Apr 20 14:16:17 2018 +0530
> +++ b/source/common/x86/h-ipfilter16.asm        Mon Apr 23 14:16:54 2018 +0530
> @@ -2133,12 +2133,43 @@
>      IPFILTER_LUMA_PS_4xN_AVX2 8
>      IPFILTER_LUMA_PS_4xN_AVX2 16
>
> +   %macro PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 1
> +
> +     %assign x 0
> +    %rep %1/8
> +    vbroadcasti128      m4, [r0 + x]
> +    vbroadcasti128      m5, [r0 + 8+ x]
> +    pshufb              m4, m3
> +    pshufb              m7, m5, m3
> +    pmaddwd             m4, m0
> +    pmaddwd             m7, m1
> +    paddd               m4, m7
> +
> +    vbroadcasti128      m6, [r0 + 16 + x]
> +    pshufb              m5, m3
> +    pshufb              m6, m3
> +    pmaddwd             m5, m0
> +    pmaddwd             m6, m1
> +    paddd               m5, m6
> +
> +    phaddd              m4, m5
> +    vpermq              m4, m4, q3120
> +    paddd               m4, m2
> +    vextracti128        xm5,m4, 1
> +    psrad               xm4, INTERP_SHIFT_PS
> +    psrad               xm5, INTERP_SHIFT_PS
> +    packssdw            xm4, xm5
> +    movu                [r2 + x], xm4
> +    %assign x x+16
> +     %endrep
> +    %endmacro
> +
>  %macro IPFILTER_LUMA_PS_8xN_AVX2 1
>  INIT_YMM avx2
>  %if ARCH_X86_64 == 1
>  cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8
> -    add                 r1d, r1d
> -    add                 r3d, r3d
> +    shl                 r1d, 1
> +    shl                 r3d, 1
>      mov                 r4d, r4m
>      mov                 r5d, r5m
>      shl                 r4d, 4
> @@ -2165,30 +2196,7 @@
>      add                 r4d, 7
>
>  .loop0:
> -    vbroadcasti128      m4, [r0]
> -    vbroadcasti128      m5, [r0 + 8]
> -    pshufb              m4, m3
> -    pshufb              m7, m5, m3
> -    pmaddwd             m4, m0
> -    pmaddwd             m7, m1
> -    paddd               m4, m7
> -
> -    vbroadcasti128      m6, [r0 + 16]
> -    pshufb              m5, m3
> -    pshufb              m6, m3
> -    pmaddwd             m5, m0
> -    pmaddwd             m6, m1
> -    paddd               m5, m6
> -
> -    phaddd              m4, m5
> -    vpermq              m4, m4, q3120
> -    paddd               m4, m2
> -    vextracti128        xm5,m4, 1
> -    psrad               xm4, INTERP_SHIFT_PS
> -    psrad               xm5, INTERP_SHIFT_PS
> -    packssdw            xm4, xm5
> -
> -    movu                [r2], xm4
> +    PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 8
>      add                 r2, r3
>      add                 r0, r1
>      dec                 r4d
> @@ -2232,36 +2240,9 @@
>      sub                 r0, r6
>      add                 r4d, 7
>
> +
>  .loop0:
> -%assign x 0
> -%rep 24/8
> -    vbroadcasti128      m4, [r0 + x]
> -    vbroadcasti128      m5, [r0 + 8 + x]
> -    pshufb              m4, m3
> -    pshufb              m7, m5, m3
> -    pmaddwd             m4, m0
> -    pmaddwd             m7, m1
> -    paddd               m4, m7
> -
> -    vbroadcasti128      m6, [r0 + 16 + x]
> -    pshufb              m5, m3
> -    pshufb              m6, m3
> -    pmaddwd             m5, m0
> -    pmaddwd             m6, m1
> -    paddd               m5, m6
> -
> -    phaddd              m4, m5
> -    vpermq              m4, m4, q3120
> -    paddd               m4, m2
> -    vextracti128        xm5,m4, 1
> -    psrad               xm4, INTERP_SHIFT_PS
> -    psrad               xm5, INTERP_SHIFT_PS
> -    packssdw            xm4, xm5
> -
> -    movu                [r2 + x], xm4
> -    %assign x x+16
> -    %endrep
> -
> +    PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 24
>      add                 r2, r3
>      add                 r0, r1
>      dec                 r4d
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
Pushed this patch series.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20180706/60b12ce2/attachment.html>


More information about the x265-devel mailing list