[x265] [PATCH 1 of 3] asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes

Wed Nov 13 06:31:15 CET 2013

Most of the code here is reused from chroma hpp. Once all the patches are
integrated and running, I will revisit the code and make the necessary
changes.

On Wed, Nov 13, 2013 at 9:12 AM, chen <chenm003 at 163.com> wrote:

> The code is correct.
> If possible, I suggest doing "dec srcq" once before the loop so that [srcq
> - 1] can become [srcq]; that is faster and one byte shorter.
>
> At 2013-11-12 23:06:55,nabajit at multicorewareinc.com wrote:
>
> ># HG changeset patch
> ># User Nabajit Deka
> ># Date 1384268074 -19800
> >#      Tue Nov 12 20:24:34 2013 +0530
> ># Node ID c9851effbce88c9a70f712fbfaf7e83616c5615f
> ># Parent  968f6df6d50f70d2a4cf569a8c0426f65d927b00
>
> >asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes.
> >
> >diff -r 968f6df6d50f -r c9851effbce8 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm Tue Nov 12 17:34:19 2013 +0530
> >+++ b/source/common/x86/ipfilter8.asm Tue Nov 12 20:24:34 2013 +0530
> >@@ -3417,3 +3417,186 @@
> > FILTER_VER_CHROMA_SP_W8_H2 8, 8
> > FILTER_VER_CHROMA_SP_W8_H2 8, 16
> > FILTER_VER_CHROMA_SP_W8_H2 8, 32
> >+
> >+%macro PROCESS_CHROMA_W2 3                     ; two rows of a 2-wide block; %1 = scratch, %2 = work/result, %3 = constant to subtract
> >+    movh        %2, [srcq - 1]                 ; row 0, starting one byte left of srcq
> >+    pshufb      %2, %2, Tm0                    ; reorder the bytes with the Tm0 mask (tab_Tm is defined outside this hunk)
> >+    movh        %1, [srcq + srcstrideq - 1]    ; row 1, same -1 offset
> >+    pshufb      %1, %1, Tm0
> >+    punpcklqdq  %2, %1                         ; %2 = low qword of row 0 : low qword of row 1
> >+    pmaddubsw   %2, coef2                      ; byte * coefficient, adjacent pairs summed into words
> >+    phaddw      %2, %2                         ; add adjacent words: each word is now a sum of four products
> >+    psubw       %2, %3                         ; subtract the constant passed in %3
> >+    movd        [dstq], %2                     ; dword 0 -> output row 0
> >+    pshufd      %2, %2, 1                      ; bring dword 1 down to dword 0
> >+    movd        [dstq + dststrideq], %2        ; -> output row 1
> >+%endmacro
> >+
>
> >+;-------------------------------------------------------------------------------------------------------------
>
> >+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>
> >+;-------------------------------------------------------------------------------------------------------------
> >+%macro FILTER_HORIZ_CHROMA_2xN 2               ; %1 = block width, %2 = block height (instantiated below as 2x4 and 2x8)
> >+INIT_XMM sse4
>
> >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
> >+%define coef2       m5
> >+%define Tm0         m4
> >+%define Tm1         m3
> >+%define t2          m2
> >+%define t1          m1
> >+%define t0          m0
> >+
> >+mov         r4d, r4m                           ; r4d = 5th argument (coeffIdx per the prototype above)
> >+add         dststrided, dststrided             ; dststride *= 2 (dst is int16_t per the prototype). NOTE(review): a 32-bit add zero-extends on x86-64, so this assumes a non-negative stride
> >+
> >+%ifdef PIC
> >+lea         r5, [tab_ChromaCoeff]              ; PIC build: take the table address first, then index it
> >+movd        coef2, [r5 + r4 * 4]               ; dword at tab_ChromaCoeff + coeffIdx * 4
> >+%else
> >+movd        coef2, [tab_ChromaCoeff + r4 * 4]  ; dword at tab_ChromaCoeff + coeffIdx * 4
> >+%endif
> >+
> >+pshufd      coef2, coef2, 0                    ; replicate that dword into all four dword lanes
> >+mova        t2, [tab_c_8192]                   ; t2 = constant that PROCESS_CHROMA_W2 subtracts from every sum
> >+mova        Tm0, [tab_Tm]                      ; only Tm0 is referenced by PROCESS_CHROMA_W2; Tm1 is defined above but never loaded here
> >+
> >+%rep %2/2
> >+PROCESS_CHROMA_W2   t0, t1, t2
> >+lea         srcq, [srcq + srcstrideq * 2]      ; each expansion consumed two rows, so step src by two rows
> >+lea         dstq, [dstq + dststrideq * 2]      ; and dst by two (already doubled) strides
> >+%endrep
> >+
> >+RET
> >+%endmacro
> >+
> >+FILTER_HORIZ_CHROMA_2xN 2, 4
> >+FILTER_HORIZ_CHROMA_2xN 2, 8
> >+
> >+
> >+%macro PROCESS_CHROMA_W4 3                     ; two rows of a 4-wide block; %1 = row 1 work reg, %2 = row 0 work/result reg, %3 = constant to subtract
> >+    movh        %2, [srcq - 1]                 ; row 0, starting one byte left of srcq
> >+    pshufb      %2, %2, Tm0                    ; reorder the bytes with the Tm0 mask (tab_Tm is defined outside this hunk)
> >+    pmaddubsw   %2, coef2                      ; byte * coefficient, adjacent pairs summed into words
> >+    movh        %1, [srcq + srcstrideq - 1]    ; row 1, same -1 offset
> >+    pshufb      %1, %1, Tm0
> >+    pmaddubsw   %1, coef2
> >+    phaddw      %2, %1                         ; low four words = row 0 sums, high four words = row 1 sums
> >+    psubw       %2, %3                         ; subtract the constant passed in %3
> >+    movlps        [dstq], %2                   ; low 8 bytes -> output row 0
> >+    movhps        [dstq + dststrideq], %2      ; high 8 bytes -> output row 1
> >+%endmacro
> >+
>
> >+;-------------------------------------------------------------------------------------------------------------
>
> >+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>
> >+;-------------------------------------------------------------------------------------------------------------
> >+%macro FILTER_HORIZ_CHROMA_4xN 2               ; %1 = block width, %2 = block height (instantiated below as 4x2, 4x4, 4x8, 4x16)
> >+INIT_XMM sse4
>
> >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
> >+%define coef2       m5
> >+%define Tm0         m4
> >+%define Tm1         m3
> >+%define t2          m2
> >+%define t1          m1
> >+%define t0          m0
> >+
> >+mov         r4d, r4m                           ; r4d = 5th argument (coeffIdx per the prototype above)
> >+add         dststrided, dststrided             ; dststride *= 2 (dst is int16_t per the prototype). NOTE(review): a 32-bit add zero-extends on x86-64, so this assumes a non-negative stride
> >+
> >+%ifdef PIC
> >+lea         r5, [tab_ChromaCoeff]              ; PIC build: take the table address first, then index it
> >+movd        coef2, [r5 + r4 * 4]               ; dword at tab_ChromaCoeff + coeffIdx * 4
> >+%else
> >+movd        coef2, [tab_ChromaCoeff + r4 * 4]  ; dword at tab_ChromaCoeff + coeffIdx * 4
> >+%endif
> >+
> >+pshufd      coef2, coef2, 0                    ; replicate that dword into all four dword lanes
> >+mova        t2, [tab_c_8192]                   ; t2 = constant that PROCESS_CHROMA_W4 subtracts from every sum
> >+mova        Tm0, [tab_Tm]                      ; only Tm0 is referenced by PROCESS_CHROMA_W4; Tm1 is defined above but never loaded here
> >+
> >+%rep %2/2
> >+PROCESS_CHROMA_W4   t0, t1, t2
> >+lea         srcq, [srcq + srcstrideq * 2]      ; each expansion consumed two rows, so step src by two rows
> >+lea         dstq, [dstq + dststrideq * 2]      ; and dst by two (already doubled) strides
> >+%endrep
> >+
> >+RET
> >+%endmacro
> >+
> >+FILTER_HORIZ_CHROMA_4xN 4, 2
> >+FILTER_HORIZ_CHROMA_4xN 4, 4
> >+FILTER_HORIZ_CHROMA_4xN 4, 8
> >+FILTER_HORIZ_CHROMA_4xN 4, 16
> >+
> >+
> >+%macro PROCESS_CHROMA_W6 3                     ; one row of a 6-wide block; %1 = load/scratch, %2 = work/result, %3 = constant to subtract
> >+    movu        %1, [srcq - 1]                 ; unaligned load starting one byte left of srcq
> >+    pshufb      %2, %1, Tm0                    ; first shuffle of the loaded bytes (Tm0 mask)
> >+    pmaddubsw   %2, coef2                      ; byte * coefficient, adjacent pairs summed into words
> >+    pshufb      %1, %1, Tm1                    ; second shuffle of the same bytes (Tm1 mask)
> >+    pmaddubsw   %1, coef2
> >+    phaddw      %2, %1                         ; low four words from the Tm0 half, high four from the Tm1 half
> >+    psubw       %2, %3                         ; subtract the constant passed in %3
> >+    movh        [dstq], %2                     ; store the low half of the result at dst
> >+    pshufd      %2, %2, 2                      ; bring dword 2 down to dword 0
> >+    movd        [dstq + 8], %2                 ; store that dword at byte offset 8 (two more 16-bit results)
> >+%endmacro
> >+
> >+%macro PROCESS_CHROMA_W12 3                    ; one row of a 12-wide block; %1 = load/scratch, %2 = work/result, %3 = constant to subtract
> >+    movu        %1, [srcq - 1]                 ; unaligned load starting one byte left of srcq
> >+    pshufb      %2, %1, Tm0                    ; first shuffle of the loaded bytes (Tm0 mask)
> >+    pmaddubsw   %2, coef2                      ; byte * coefficient, adjacent pairs summed into words
> >+    pshufb      %1, %1, Tm1                    ; second shuffle of the same bytes (Tm1 mask)
> >+    pmaddubsw   %1, coef2
> >+    phaddw      %2, %1                         ; low four words from the Tm0 half, high four from the Tm1 half
> >+    psubw       %2, %3                         ; subtract the constant passed in %3
> >+    movu        [dstq], %2                     ; store all eight 16-bit results
> >+    movu        %1, [srcq - 1 + 8]             ; reload eight bytes further along the row
> >+    pshufb      %1, %1, Tm0
> >+    pmaddubsw   %1, coef2
> >+    phaddw      %1, %1                         ; finish the sums; both halves of %1 hold the same four words
> >+    psubw       %1, %3
> >+    movh       [dstq + 16], %1                 ; store the low half at byte offset 16
> >+%endmacro
> >+
>
> >+;-----------------------------------------------------------------------------
>
> >+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>
> >+;-----------------------------------------------------------------------------
> >+%macro FILTER_HORIZ_CHROMA 2                   ; %1 = block width (selects PROCESS_CHROMA_W%1), %2 = block height (row count)
> >+INIT_XMM sse4
>
> >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
> >+%define coef2       m5
> >+%define Tm0         m4
> >+%define Tm1         m3
> >+%define t2          m2
> >+%define t1          m1
> >+%define t0          m0
> >+
> >+mov         r4d, r4m                           ; r4d = 5th argument (coeffIdx per the prototype above)
> >+add         dststrided, dststrided             ; dststride *= 2 (dst holds 16-bit sums). NOTE(review): a 32-bit add zero-extends on x86-64, so this assumes a non-negative stride
> >+
> >+%ifdef PIC
> >+lea         r5, [tab_ChromaCoeff]              ; PIC build: take the table address first, then index it
> >+movd        coef2, [r5 + r4 * 4]               ; dword at tab_ChromaCoeff + coeffIdx * 4
> >+%else
> >+movd        coef2, [tab_ChromaCoeff + r4 * 4]  ; dword at tab_ChromaCoeff + coeffIdx * 4
> >+%endif
> >+
> >+mov         r5d, %2                            ; r5d = rows remaining; r5 is free again once the coefficient load above is done
> >+
> >+pshufd      coef2, coef2, 0                    ; replicate that dword into all four dword lanes
> >+mova        t2, [tab_c_8192]                   ; t2 = constant subtracted from every sum
> >+mova        Tm0, [tab_Tm]                      ; first shuffle mask
> >+mova        Tm1, [tab_Tm + 16]                 ; second shuffle mask, the next 16 bytes of the same table
> >+
> >+.loop:
> >+PROCESS_CHROMA_W%1   t0, t1, t2
> >+add         srcq, srcstrideq                   ; one row per iteration
> >+add         dstq, dststrideq
> >+
> >+dec         r5d
> >+jnz         .loop                              ; repeat until all %2 rows are done
> >+
> >+RET
> >+%endmacro
> >+
> >+FILTER_HORIZ_CHROMA 6, 8
> >+FILTER_HORIZ_CHROMA 12, 16
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131113/79d1c8ec/attachment.html>