[x265] [PATCH Review Only] filterHorizontal_p_p_4, 48x48 asm code

Tue Oct 8 21:39:43 CEST 2013

On Tue, Oct 8, 2013 at 2:33 AM, <praveen at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari
> # Date 1381217602 -19800
> # Node ID 4f728eeab74a089c86068663baf522c40a136981
> # Parent  2b2fc4a46c7dcf8720b1b9872c0f3b86c048ffcd
> filterHorizontal_p_p_4, 48x48 asm code
>

For luma, the only width-48 block used in the encoder is 48x64.

At width 64 there are only 64x16, 64x32, 64x48, and 64x64 (heights of 1/4,
1/2, 3/4, and 4/4 of the width).

The same applies to width 32 (heights 8, 16, 24, 32) and width 16 (heights
4, 8, 12, 16).  Width 24 only has height 32, and width 12 only has height 16.

Width 8 only has 8x4 and 8x8.

So to minimize your work effort you should be writing 8-tap luma macros
that interpolate:

* 64x16
* 32x8
* 16x4
* 8x4

The 48x64, 24x32, and 12x16 blocks are rarely used (AMP) and could be built
from 16x4 or 4x4.

These 4-tap filters are only used for 4:2:0 chroma, which will have
different block-size requirements, so you need to figure out exactly which
chroma blocks are needed before writing 4-tap block intrinsics.

diff -r 2b2fc4a46c7d -r 4f728eeab74a source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Tue Oct 08 12:53:44 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm   Tue Oct 08 13:03:22 2013 +0530
> @@ -530,3 +530,101 @@
>      FILTER_H4_w32   x0, x1, x2, x3
>      movu        [dstq + 16],    x1
>      RET
> +
> +    SECTION_RODATA 32
> +tab_Tm:     db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6   ; pshufb mask (loaded as Tm0): 4-byte source windows starting at offsets 0..3
> +            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10  ; pshufb mask (loaded as Tm1): 4-byte source windows starting at offsets 4..7
> +
> +tab_c_512:  times 8 dw 512                                      ; eight words of 512; loaded into x2 and used as the pmulhrsw multiplier
> +
> +SECTION .text
> +
> +%macro FILTER_H4_w48 4 ; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier, %4 = scratch reg; uses srcq, dstq, coef2, Tm0, Tm1 from the caller
> +    movu        %1, [srcq - 1]              ; pixels 0-7: the source window starts one byte left of the output position
> +    pshufb      %2, %1, Tm0                 ; gather the 4-byte windows for pixels 0-3
> +    pmaddubsw   %2, coef2                   ; unsigned source byte * signed coefficient byte, adjacent pairs summed into words
> +    pshufb      %1, %1, Tm1                 ; gather the 4-byte windows for pixels 4-7
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1                      ; add the two pair sums of each window -> 8 word results (pixels 0-7)
> +    movu        %1, [srcq - 1 + 8]          ; pixels 8-15, same sequence with %4 as the accumulator
> +    pshufb      %4, %1, Tm0
> +    pmaddubsw   %4, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %4, %1
> +    pmulhrsw    %2, %3                      ; rounded scale by %3 (callers pass 512, i.e. (x + 32) >> 6)
> +    pmulhrsw    %4, %3
> +    packuswb    %2, %4                      ; saturate both halves to unsigned bytes: %2 = output pixels 0-15
> +    movu        [dstq],      %2
> +    movu        %1, [srcq - 1 + 16]         ; pixels 16-23
> +    pshufb      %2, %1, Tm0
> +    pmaddubsw   %2, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1
> +    movu        %1, [srcq - 1 + 24]         ; pixels 24-31
> +    pshufb      %4, %1, Tm0
> +    pmaddubsw   %4, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %4, %1
> +    pmulhrsw    %2, %3
> +    pmulhrsw    %4, %3
> +    packuswb    %2, %4                      ; %2 = output pixels 16-31
> +    movu        [dstq + 16],      %2        ; store through the macro parameter, matching the store to [dstq]
> +    movu        %1, [srcq - 1 + 32]         ; pixels 32-39
> +    pshufb      %2, %1, Tm0
> +    pmaddubsw   %2, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1
> +    movu        %1, [srcq - 1 + 40]         ; pixels 40-47
> +    pshufb      %4, %1, Tm0
> +    pmaddubsw   %4, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %4, %1
> +    pmulhrsw    %2, %3
> +    pmulhrsw    %4, %3
> +    packuswb    %2, %4                      ; %2 = output pixels 32-47; NOT stored here, the caller writes it to [dstq + 32]
> +%endmacro
> +
> +%macro FILTER_H4_w48_CALL 0 ; filter one 48-pixel row, store its last 16 bytes, then step both pointers to the next row
> +    FILTER_H4_w48   x0, x1, x2, x3          ; stores dst bytes 0-31 and leaves bytes 32-47 in x1
> +
> +    movu        [dstq + 32],      x1
> +
> +    add         srcq,        srcstrideq     ; advance the source pointer by one row stride
> +    add         dstq,        dststrideq     ; advance the destination pointer by one row stride
> +%endmacro
> +
>
> +;-----------------------------------------------------------------------------
> +; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse4 ; NOTE: the width and height arguments are never read; 48 columns x 48 rows is hard-coded below
> +cglobal filterHorizontal_p_p_4, 4, 5, 7, src, srcstride, dst, dststride ; 7 XMM registers declared: m0-m6 are all used below
> +%define coef2       m6                      ; the 4 filter coefficients as signed bytes, repeated in every dword
> +%define Tm0         m5                      ; pshufb mask for pixels 0-3 of each 8-pixel group
> +%define Tm1         m4                      ; pshufb mask for pixels 4-7 of each 8-pixel group
> +%define x3          m3                      ; scratch accumulator for the second 8-pixel half
> +%define x2          m2                      ; pmulhrsw multiplier (512 in every word)
> +%define x1          m1                      ; filter result; holds output bytes 32-47 after FILTER_H4_w48
> +%define x0          m0                      ; source load / scratch
> +
> +    mov         r4,         r6m             ; r4 = 7th argument (coeff in the prototype above)
> +    movh        coef2,      [r4]            ; load only the 4 word coefficients (8 bytes) that are actually used
> +    packsswb    coef2,      coef2           ; words -> signed bytes
> +    pshufd      coef2,      coef2,      0   ; broadcast the 4 coefficient bytes to every dword
> +
> +    mova        x2,         [tab_c_512]
> +
> +    mova        Tm0,        [tab_Tm]
> +    mova        Tm1,        [tab_Tm + 16]
> +
> + %rep 47                                    ; rows 0-46: filter, store, advance both pointers
> + FILTER_H4_w48_CALL
> + %endrep
> +
> +    FILTER_H4_w48   x0, x1, x2, x3          ; row 47: last row, so no pointer advance afterwards
> +    movu        [dstq + 32],    x1
> +    RET
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131008/a3970d98/attachment.html>