[x265] [PATCH] asm code for luma filter functions

Fri Oct 18 21:51:32 CEST 2013

On Fri, Oct 18, 2013 at 5:48 AM, <praveen at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari
> # Date 1382093292 -19800
> # Node ID 0efb3f85325f03edb436b260ba28189b8eae6b3f
> # Parent  975b3d17baa8339f77a3b99245136e8c06cf3fdb
> asm code for luma filter functions
>

I've folded a few patches together, but these changes are all queued.

Thanks for persevering; the new 4tap and 8tap routines have good
performance and I think we have a solid foundation on which to add the rest
of the interpolation primitives, and perhaps do some XOP/AVX/AVX2
optimizations of the existing macros in the future.

The next step is to wire up these new primitives into the motion
compensation routines.  Then on to the vertical interpolation and the ps/sp
varieties.

>
> diff -r 975b3d17baa8 -r 0efb3f85325f source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Fri Oct 18 15:58:29 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp      Fri Oct 18 16:18:12 2013 +0530
> @@ -279,6 +279,7 @@
>          SA8D_INTER_FROM_BLOCK(sse4);
>
>          CHROMA_FILTERS(_sse4);
> +        LUMA_FILTERS(_sse4);
>      }
>      if (cpuMask & X265_CPU_AVX)
>      {
> diff -r 975b3d17baa8 -r 0efb3f85325f source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Fri Oct 18 15:58:29 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm   Fri Oct 18 16:18:12 2013 +0530
> @@ -30,6 +30,11 @@
>  tab_Tm:     db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
>              db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
>
> +tab_Lm:    db 0, 1, 2, 3, 4,  5,  6,  7,  1, 2, 3, 4,  5,  6,  7,  8    ; pshufb masks: each 16-byte row selects two overlapping 8-byte windows (here offsets 0 and 1)
> +           db 2, 3, 4, 5, 6,  7,  8,  9,  3, 4, 5, 6,  7,  8,  9,  10   ; windows at offsets 2 and 3
> +           db 4, 5, 6, 7, 8,  9,  10, 11, 5, 6, 7, 8,  9,  10, 11, 12   ; windows at offsets 4 and 5
> +           db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14   ; windows at offsets 6 and 7
> +
>  tab_c_512:  times 8 dw 512
>
>  tab_coeff:    db  0, 64,  0,  0
> @@ -41,6 +46,12 @@
>                db -2, 16, 54, -4
>                db -2, 10, 58, -2
>
> +tab_LumaCoeff:   db   0, 0,  0,  64,  0,   0,  0,  0     ; 8 signed byte taps per row, each row sums to 64; IPFILTER_LUMA reads the row at coeffIdx * 8
> +                 db  -1, 4, -10, 58,  17, -5,  1,  0     ; row 1
> +                 db  -1, 4, -11, 40,  40, -11, 4, -1     ; row 2 (symmetric taps)
> +                 db   0, 1, -5,  17,  58, -10, 4, -1     ; row 3 (row 1 reversed)
> +
> +
>  SECTION .text
>
>  %macro FILTER_H4_w2_2 3
> @@ -469,3 +480,116 @@
>  IPFILTER_CHROMA_W 32, 16
>  IPFILTER_CHROMA_W 32, 24
>  IPFILTER_CHROMA_W 32, 32
> +
> +
> +%macro FILTER_H8_W8 3                   ; 8 output bytes per invocation. %1 = load/scratch reg, %2 = result reg, %3 = multiplier for pmulhrsw; reads m3 (taps), clobbers m7
> +    movu        %1, [r0 - 3 + r5]       ; unaligned 16-byte load starting 3 bytes before offset r5 from r0
> +    pshufb      %2, %1, [tab_Lm]        ; gather the 8-byte windows for outputs 0 and 1
> +    pmaddubsw   %2, m3                  ; unsigned source bytes * signed taps in m3, adjacent pairs summed -> 4 partial words per output
> +    pshufb      m7, %1, [tab_Lm + 16]   ; windows for outputs 2 and 3
> +    pmaddubsw   m7, m3
> +    phaddw      %2, m7                  ; %2 = 2 partial sums for each of outputs 0..3
> +    pshufb      m7, %1, [tab_Lm + 32]   ; windows for outputs 4 and 5
> +    pmaddubsw   m7, m3
> +    pshufb      %1, %1, [tab_Lm + 48]   ; windows for outputs 6 and 7 (source register is consumed here)
> +    pmaddubsw   %1, m3
> +    phaddw      m7, %1                  ; m7 = 2 partial sums for each of outputs 4..7
> +    phaddw      %2, m7                  ; %2 = full 8-tap sums for outputs 0..7 as words
> +    pmulhrsw    %2, %3                  ; (x * %3 + 0x4000) >> 15; with %3 = 512 (as IPFILTER_LUMA passes) this is (x + 32) >> 6
> +    packuswb    %2, %2                  ; saturate words to unsigned bytes
> +    movh        [r2 + r5], %2           ; store the low 8 bytes at offset r5 from r2
> +%endmacro
> +
> +%macro FILTER_H8_W4 3                   ; 4 output bytes per invocation. %1 = load reg, %2 = result reg, %3 = multiplier for pmulhrsw; reads m3 (taps), clobbers m7
> +    movu        %1, [r0 - 3 + r5]       ; unaligned 16-byte load starting 3 bytes before offset r5 from r0
> +    pshufb      %2, %1, [tab_Lm]        ; gather the 8-byte windows for outputs 0 and 1
> +    pmaddubsw   %2, m3                  ; unsigned source bytes * signed taps in m3, adjacent pairs summed
> +    pshufb      m7, %1, [tab_Lm + 16]   ; windows for outputs 2 and 3
> +    pmaddubsw   m7, m3
> +    phaddw      %2, m7                  ; 2 partial sums for each of outputs 0..3
> +    phaddw      %2, %2                  ; full sums for outputs 0..3 (duplicated in both halves)
> +    pmulhrsw    %2, %3                  ; (x * %3 + 0x4000) >> 15; with %3 = 512 (as IPFILTER_LUMA passes) this is (x + 32) >> 6
> +    packuswb    %2, %2                  ; saturate words to unsigned bytes
> +    movd       [r2 + r5], %2            ; store the low 4 bytes at offset r5 from r2
> +%endmacro
> +
> +%macro FILTER_H8_W1 3                   ; 1 output byte per invocation. %1 = load reg, %2 = result reg, %3 = multiplier for pmulhrsw; reads m3 (taps)
> +    movu        %1, [r0 - 3 + r5]       ; unaligned 16-byte load starting 3 bytes before offset r5 from r0
> +    pshufb      %2, %1, [tab_Lm]        ; windows for outputs 0 and 1; only output 0 is stored below
> +    pmaddubsw   %2, m3                  ; unsigned source bytes * signed taps in m3, adjacent pairs summed
> +    phaddw      %2, %2                  ; 4 partial sums -> 2 per output
> +    phaddw      %2, %2                  ; 2 partial sums -> full sum; word 0 belongs to output 0
> +    pmulhrsw    %2, %3                  ; (x * %3 + 0x4000) >> 15; with %3 = 512 (as IPFILTER_LUMA passes) this is (x + 32) >> 6
> +    packuswb    %2, %2                  ; saturate words to unsigned bytes
> +    pextrb      [r2 + r5], %2, 0        ; store byte 0 only
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> +;-----------------------------------------------------------------------------
> +%macro IPFILTER_LUMA 2                  ; %1 = block width in pixels, %2 = block height in rows
> +cglobal interp_8tap_horiz_pp_%1x%2, 4, 6, 8   ; 4 args loaded, 6 GPRs, 8 XMM regs: the FILTER_H8_* macros clobber m7, so xmm6/xmm7 must be declared for Win64 save/restore
> +
> +mov         r4d,        r4m             ; coeffIdx (5th argument, not auto-loaded by cglobal)
> +
> +%ifdef PIC
> +lea         r5,       [tab_LumaCoeff]   ; PIC: table base must be in a register before indexing
> +movh        m3,       [r5 + r4 * 8]     ; load the 8 taps of row coeffIdx
> +%else
> +movh        m3,       [tab_LumaCoeff + r4 * 8]   ; load the 8 taps of row coeffIdx
> +%endif
> +
> +punpcklqdq  m3,       m3                ; duplicate the 8 taps into both halves so one pmaddubsw covers two windows
> +mova        m2,       [tab_c_512]       ; pmulhrsw by 512 == rounding shift right by 6
> +mov         r4d,      %2                ; r4 is now the remaining-row counter
> +
> +.loop:
> +    xor r5, r5                          ; r5 = column offset within the current row
> +%rep %1 / 8
> +    FILTER_H8_W8  m0, m1, m2
> +    add           r5, 8
> +%endrep
> +
> +%rep (%1 % 8) / 4
> +    FILTER_H8_W4  m0, m1, m2
> +    add           r5, 4
> +%endrep
> +
> +%rep (%1 % 4)
> +    FILTER_H8_W1  m0, m1, m2
> +    add           r5, 1
> +%endrep
> +
> +    add          r0, r1                 ; src += srcStride
> +    add          r2, r3                 ; dst += dstStride
> +
> +    dec          r4d
> +    jnz         .loop
> +    RET
> +%endmacro
> +
> +    IPFILTER_LUMA 4,   4                ; arguments are width, height; each line emits interp_8tap_horiz_pp_<width>x<height>
> +    IPFILTER_LUMA 8,   8
> +    IPFILTER_LUMA 8,   4
> +    IPFILTER_LUMA 4,   8
> +    IPFILTER_LUMA 16, 16
> +    IPFILTER_LUMA 16,  8
> +    IPFILTER_LUMA 8,  16
> +    IPFILTER_LUMA 16, 12
> +    IPFILTER_LUMA 12, 16                ; width 12 = one 8-wide step plus one 4-wide step
> +    IPFILTER_LUMA 16,  4
> +    IPFILTER_LUMA 4,  16
> +    IPFILTER_LUMA 32, 32
> +    IPFILTER_LUMA 32, 16
> +    IPFILTER_LUMA 16, 32
> +    IPFILTER_LUMA 32, 24
> +    IPFILTER_LUMA 24, 32
> +    IPFILTER_LUMA 32,  8
> +    IPFILTER_LUMA 8,  32
> +    IPFILTER_LUMA 64, 64
> +    IPFILTER_LUMA 64, 32
> +    IPFILTER_LUMA 32, 64
> +    IPFILTER_LUMA 64, 48
> +    IPFILTER_LUMA 48, 64
> +    IPFILTER_LUMA 64, 16
> +    IPFILTER_LUMA 16, 64                ; every width listed is a multiple of 4, so the FILTER_H8_W1 tail is never expanded
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131018/0dcb9bff/attachment-0001.html>