[x265] [PATCH 1 of 3] asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes
Nabajit Deka
nabajit at multicorewareinc.com
Wed Nov 13 06:31:15 CET 2013
Most of the code here is reused from chroma hpp. Once all the patches are
integrated and running, I will revisit the code and make the necessary
changes.
On Wed, Nov 13, 2013 at 9:12 AM, chen <chenm003 at 163.com> wrote:
> The code is correct.
> If possible, I suggest doing "dec srcq" before the loop so that [srcq - 1]
> can be changed to [srcq]; it is faster and one byte shorter.
>
> At 2013-11-12 23:06:55,nabajit at multicorewareinc.com wrote:
>
> ># HG changeset patch
> ># User Nabajit Deka
> ># Date 1384268074 -19800
> ># Tue Nov 12 20:24:34 2013 +0530
> ># Node ID c9851effbce88c9a70f712fbfaf7e83616c5615f
> ># Parent 968f6df6d50f70d2a4cf569a8c0426f65d927b00
>
> >asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes.
> >
> >diff -r 968f6df6d50f -r c9851effbce8 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm Tue Nov 12 17:34:19 2013 +0530
> >+++ b/source/common/x86/ipfilter8.asm Tue Nov 12 20:24:34 2013 +0530
> >@@ -3417,3 +3417,186 @@
> > FILTER_VER_CHROMA_SP_W8_H2 8, 8
> > FILTER_VER_CHROMA_SP_W8_H2 8, 16
> > FILTER_VER_CHROMA_SP_W8_H2 8, 32
> >+
> >+%macro PROCESS_CHROMA_W2 3 ; filter two rows of a 2-wide block; %1, %2 = scratch registers, %3 = register subtracted from the sums
> >+ movh %2, [srcq - 1] ; row 0: load 8 bytes starting one byte before srcq
> >+ pshufb %2, %2, Tm0 ; rearrange the source bytes with the Tm0 shuffle mask
> >+ movh %1, [srcq + srcstrideq - 1] ; row 1: same load, one source stride further on
> >+ pshufb %1, %1, Tm0
> >+ punpcklqdq %2, %1 ; %2 = low qword of row 0 | low qword of row 1
> >+ pmaddubsw %2, coef2 ; multiply bytes by coef2 and add adjacent pairs into words
> >+ phaddw %2, %2 ; add adjacent words; the four sums appear in both halves
> >+ psubw %2, %3 ; subtract %3 from every word
> >+ movd [dstq], %2 ; low dword (two words) -> dst row 0
> >+ pshufd %2, %2, 1 ; move dword 1 down to the low dword
> >+ movd [dstq + dststrideq], %2 ; two words -> dst row 1
> >+%endmacro
> >+
>
> >+;-------------------------------------------------------------------------------------------------------------
>
> >+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>
> >+;-------------------------------------------------------------------------------------------------------------
> >+%macro FILTER_HORIZ_CHROMA_2xN 2 ; %1 = block width, %2 = block height in rows; emits interp_4tap_horiz_ps_%1x%2
> >+INIT_XMM sse4
>
> >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
> >+%define coef2 m5 ; filter coefficients, replicated to every dword below
> >+%define Tm0 m4 ; shuffle mask loaded from tab_Tm
> >+%define Tm1 m3 ; NOTE(review): not referenced by this macro or by PROCESS_CHROMA_W2
> >+%define t2 m2 ; constant loaded from tab_c_8192, subtracted from every sum
> >+%define t1 m1 ; scratch
> >+%define t0 m0 ; scratch
> >+
> >+mov r4d, r4m ; fetch the fifth argument (coeffIdx in the prototype comment)
> >+add dststrided, dststrided ; double dststride; presumably element count -> bytes for 16-bit dst, confirm with callers
> >+
> >+%ifdef PIC
> >+lea r5, [tab_ChromaCoeff] ; PIC build: take the table base into a register first
> >+movd coef2, [r5 + r4 * 4] ; load the 4-byte table entry selected by r4
> >+%else
> >+movd coef2, [tab_ChromaCoeff + r4 * 4]
> >+%endif
> >+
> >+pshufd coef2, coef2, 0 ; replicate the low dword across the whole register
> >+mova t2, [tab_c_8192]
> >+mova Tm0, [tab_Tm]
> >+
> >+%rep %2/2
> >+PROCESS_CHROMA_W2 t0, t1, t2 ; unrolled at assembly time: each repetition handles two rows
> >+lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
> >+lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
> >+%endrep
> >+
> >+RET
> >+%endmacro
> >+
> >+FILTER_HORIZ_CHROMA_2xN 2, 4
> >+FILTER_HORIZ_CHROMA_2xN 2, 8
> >+
> >+
> >+%macro PROCESS_CHROMA_W4 3 ; filter two rows of a 4-wide block; %1, %2 = scratch registers, %3 = register subtracted from the sums
> >+ movh %2, [srcq - 1] ; row 0: load 8 bytes starting one byte before srcq
> >+ pshufb %2, %2, Tm0 ; rearrange the source bytes with the Tm0 shuffle mask
> >+ pmaddubsw %2, coef2 ; multiply bytes by coef2 and add adjacent pairs into words
> >+ movh %1, [srcq + srcstrideq - 1] ; row 1: same steps, one source stride further on
> >+ pshufb %1, %1, Tm0
> >+ pmaddubsw %1, coef2
> >+ phaddw %2, %1 ; add adjacent words: low qword = row 0 sums, high qword = row 1 sums
> >+ psubw %2, %3 ; subtract %3 from every word
> >+ movlps [dstq], %2 ; low qword (four words) -> dst row 0
> >+ movhps [dstq + dststrideq], %2 ; high qword (four words) -> dst row 1
> >+%endmacro
> >+
>
> >+;-------------------------------------------------------------------------------------------------------------
>
> >+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>
> >+;-------------------------------------------------------------------------------------------------------------
> >+%macro FILTER_HORIZ_CHROMA_4xN 2 ; %1 = block width, %2 = block height in rows; emits interp_4tap_horiz_ps_%1x%2
> >+INIT_XMM sse4
>
> >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
> >+%define coef2 m5 ; filter coefficients, replicated to every dword below
> >+%define Tm0 m4 ; shuffle mask loaded from tab_Tm
> >+%define Tm1 m3 ; NOTE(review): not referenced by this macro or by PROCESS_CHROMA_W4
> >+%define t2 m2 ; constant loaded from tab_c_8192, subtracted from every sum
> >+%define t1 m1 ; scratch
> >+%define t0 m0 ; scratch
> >+
> >+mov r4d, r4m ; fetch the fifth argument (coeffIdx in the prototype comment)
> >+add dststrided, dststrided ; double dststride; presumably element count -> bytes for 16-bit dst, confirm with callers
> >+
> >+%ifdef PIC
> >+lea r5, [tab_ChromaCoeff] ; PIC build: take the table base into a register first
> >+movd coef2, [r5 + r4 * 4] ; load the 4-byte table entry selected by r4
> >+%else
> >+movd coef2, [tab_ChromaCoeff + r4 * 4]
> >+%endif
> >+
> >+pshufd coef2, coef2, 0 ; replicate the low dword across the whole register
> >+mova t2, [tab_c_8192]
> >+mova Tm0, [tab_Tm]
> >+
> >+%rep %2/2
> >+PROCESS_CHROMA_W4 t0, t1, t2 ; unrolled at assembly time: each repetition handles two rows
> >+lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
> >+lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
> >+%endrep
> >+
> >+RET
> >+%endmacro
> >+
> >+FILTER_HORIZ_CHROMA_4xN 4, 2
> >+FILTER_HORIZ_CHROMA_4xN 4, 4
> >+FILTER_HORIZ_CHROMA_4xN 4, 8
> >+FILTER_HORIZ_CHROMA_4xN 4, 16
> >+
> >+
> >+%macro PROCESS_CHROMA_W6 3 ; filter one row of a 6-wide block; %1, %2 = scratch registers, %3 = register subtracted from the sums
> >+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
> >+ pshufb %2, %1, Tm0 ; first shuffle of the loaded bytes (Tm0 mask)
> >+ pmaddubsw %2, coef2 ; multiply bytes by coef2 and add adjacent pairs into words
> >+ pshufb %1, %1, Tm1 ; second shuffle of the same bytes (Tm1 mask)
> >+ pmaddubsw %1, coef2
> >+ phaddw %2, %1 ; add adjacent words: low qword from the Tm0 half, high qword from the Tm1 half
> >+ psubw %2, %3 ; subtract %3 from every word
> >+ movh [dstq], %2 ; store the first four words
> >+ pshufd %2, %2, 2 ; move dword 2 down to the low dword
> >+ movd [dstq + 8], %2 ; store two more words at byte offset 8 (six words in total)
> >+%endmacro
> >+
> >+; PROCESS_CHROMA_W12 tmpA, tmpB, offset
> >+; Filters one row of a 12-wide block and writes twelve 16-bit results to dstq.
> >+;   %1, %2 = scratch registers (both clobbered)
> >+;   %3     = register subtracted from every filtered sum
> >+; Uses srcq, dstq, coef2, Tm0 and Tm1, which the invoking function sets up.
> >+%macro PROCESS_CHROMA_W12 3
> >+ ; first eight results: two shuffles of one 16-byte load
> >+ movu %1, [srcq - 1]
> >+ pshufb %2, %1, Tm0
> >+ pmaddubsw %2, coef2 ; multiply bytes by coef2 and add adjacent pairs into words
> >+ pshufb %1, %1, Tm1
> >+ pmaddubsw %1, coef2
> >+ phaddw %2, %1 ; add adjacent words: low qword from Tm0, high qword from Tm1
> >+ psubw %2, %3
> >+ movu [dstq], %2 ; store eight words
> >+ ; last four results: only a Tm0 shuffle is applied, and an 8-byte load is
> >+ ; what PROCESS_CHROMA_W4 uses for the same step, so load 8 bytes (movh)
> >+ ; rather than 16 to avoid reading past the bytes the filter needs
> >+ movh %1, [srcq - 1 + 8]
> >+ pshufb %1, %1, Tm0
> >+ pmaddubsw %1, coef2
> >+ phaddw %1, %1 ; four sums, present in both halves
> >+ psubw %1, %3
> >+ movh [dstq + 16], %1 ; store four words at byte offset 16
> >+%endmacro
> >+
>
> >+;-----------------------------------------------------------------------------
>
> >+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>
> >+;-----------------------------------------------------------------------------
> >+; FILTER_HORIZ_CHROMA width, height
> >+; Emits interp_4tap_horiz_ps_<width>x<height>. Each loop pass filters one
> >+; row with PROCESS_CHROMA_W<width>, so a macro of that name must exist for
> >+; every width instantiated below.
> >+%macro FILTER_HORIZ_CHROMA 2
> >+INIT_XMM sse4
>
> >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
> >+%define coef2 m5 ; filter coefficients, replicated to every dword below
> >+%define Tm0 m4 ; shuffle mask loaded from tab_Tm
> >+%define Tm1 m3 ; shuffle mask loaded from tab_Tm + 16
> >+%define t2 m2 ; constant loaded from tab_c_8192, subtracted from every sum
> >+%define t1 m1 ; scratch
> >+%define t0 m0 ; scratch
> >+
> >+mov r4d, r4m ; fetch the fifth argument (coeffIdx in the prototype comment)
> >+add dststrided, dststrided ; double dststride; presumably element count -> bytes for 16-bit dst, confirm with callers
> >+
> >+%ifdef PIC
> >+lea r5, [tab_ChromaCoeff] ; PIC build: take the table base into a register first
> >+movd coef2, [r5 + r4 * 4] ; load the 4-byte table entry selected by r4
> >+%else
> >+movd coef2, [tab_ChromaCoeff + r4 * 4]
> >+%endif
> >+
> >+mov r5d, %2 ; r5 = rows remaining (the table base in r5 is no longer needed)
> >+
> >+pshufd coef2, coef2, 0 ; replicate the low dword across the whole register
> >+mova t2, [tab_c_8192]
> >+mova Tm0, [tab_Tm]
> >+mova Tm1, [tab_Tm + 16]
> >+
> >+.loop: ; colon added: a bare label draws NASM's orphan-label warning
> >+PROCESS_CHROMA_W%1 t0, t1, t2
> >+add srcq, srcstrideq ; next source row
> >+add dstq, dststrideq ; next destination row
> >+
> >+dec r5d
> >+jnz .loop
> >+
> >+RET
> >+%endmacro
> >+
> >+FILTER_HORIZ_CHROMA 6, 8
> >+FILTER_HORIZ_CHROMA 12, 16
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131113/79d1c8ec/attachment.html>
More information about the x265-devel
mailing list