[x265] [PATCH 1 of 3] asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes
chen
chenm003 at 163.com
Wed Nov 13 04:42:48 CET 2013
The code is correct.
If possible, I suggest doing "dec srcq" once before the loop so that [srcq - 1] can be changed to [srcq]; that is faster and one byte shorter.
At 2013-11-12 23:06:55,nabajit at multicorewareinc.com wrote:
># HG changeset patch
># User Nabajit Deka
># Date 1384268074 -19800
># Tue Nov 12 20:24:34 2013 +0530
># Node ID c9851effbce88c9a70f712fbfaf7e83616c5615f
># Parent 968f6df6d50f70d2a4cf569a8c0426f65d927b00
>asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes.
>
>diff -r 968f6df6d50f -r c9851effbce8 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Tue Nov 12 17:34:19 2013 +0530
>+++ b/source/common/x86/ipfilter8.asm Tue Nov 12 20:24:34 2013 +0530
>@@ -3417,3 +3417,186 @@
> FILTER_VER_CHROMA_SP_W8_H2 8, 8
> FILTER_VER_CHROMA_SP_W8_H2 8, 16
> FILTER_VER_CHROMA_SP_W8_H2 8, 32
>+
>+%macro PROCESS_CHROMA_W2 3
>+ movh %2, [srcq - 1]
>+ pshufb %2, %2, Tm0
>+ movh %1, [srcq + srcstrideq - 1]
>+ pshufb %1, %1, Tm0
>+ punpcklqdq %2, %1
>+ pmaddubsw %2, coef2
>+ phaddw %2, %2
>+ psubw %2, %3
>+ movd [dstq], %2
>+ pshufd %2, %2, 1
>+ movd [dstq + dststrideq], %2
>+%endmacro
>+
>+;-------------------------------------------------------------------------------------------------------------
>+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>+;-------------------------------------------------------------------------------------------------------------
>+%macro FILTER_HORIZ_CHROMA_2xN 2
>+INIT_XMM sse4
>+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
>+%define coef2 m5
>+%define Tm0 m4
>+%define Tm1 m3
>+%define t2 m2
>+%define t1 m1
>+%define t0 m0
>+
>+mov r4d, r4m
>+add dststrided, dststrided
>+
>+%ifdef PIC
>+lea r5, [tab_ChromaCoeff]
>+movd coef2, [r5 + r4 * 4]
>+%else
>+movd coef2, [tab_ChromaCoeff + r4 * 4]
>+%endif
>+
>+pshufd coef2, coef2, 0
>+mova t2, [tab_c_8192]
>+mova Tm0, [tab_Tm]
>+
>+%rep %2/2
>+PROCESS_CHROMA_W2 t0, t1, t2
>+lea srcq, [srcq + srcstrideq * 2]
>+lea dstq, [dstq + dststrideq * 2]
>+%endrep
>+
>+RET
>+%endmacro
>+
>+FILTER_HORIZ_CHROMA_2xN 2, 4
>+FILTER_HORIZ_CHROMA_2xN 2, 8
>+
>+
>+%macro PROCESS_CHROMA_W4 3
>+ movh %2, [srcq - 1]
>+ pshufb %2, %2, Tm0
>+ pmaddubsw %2, coef2
>+ movh %1, [srcq + srcstrideq - 1]
>+ pshufb %1, %1, Tm0
>+ pmaddubsw %1, coef2
>+ phaddw %2, %1
>+ psubw %2, %3
>+ movlps [dstq], %2
>+ movhps [dstq + dststrideq], %2
>+%endmacro
>+
>+;-------------------------------------------------------------------------------------------------------------
>+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>+;-------------------------------------------------------------------------------------------------------------
>+%macro FILTER_HORIZ_CHROMA_4xN 2
>+INIT_XMM sse4
>+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
>+%define coef2 m5
>+%define Tm0 m4
>+%define Tm1 m3
>+%define t2 m2
>+%define t1 m1
>+%define t0 m0
>+
>+mov r4d, r4m
>+add dststrided, dststrided
>+
>+%ifdef PIC
>+lea r5, [tab_ChromaCoeff]
>+movd coef2, [r5 + r4 * 4]
>+%else
>+movd coef2, [tab_ChromaCoeff + r4 * 4]
>+%endif
>+
>+pshufd coef2, coef2, 0
>+mova t2, [tab_c_8192]
>+mova Tm0, [tab_Tm]
>+
>+%rep %2/2
>+PROCESS_CHROMA_W4 t0, t1, t2
>+lea srcq, [srcq + srcstrideq * 2]
>+lea dstq, [dstq + dststrideq * 2]
>+%endrep
>+
>+RET
>+%endmacro
>+
>+FILTER_HORIZ_CHROMA_4xN 4, 2
>+FILTER_HORIZ_CHROMA_4xN 4, 4
>+FILTER_HORIZ_CHROMA_4xN 4, 8
>+FILTER_HORIZ_CHROMA_4xN 4, 16
>+
>+
>+%macro PROCESS_CHROMA_W6 3
>+ movu %1, [srcq - 1]
>+ pshufb %2, %1, Tm0
>+ pmaddubsw %2, coef2
>+ pshufb %1, %1, Tm1
>+ pmaddubsw %1, coef2
>+ phaddw %2, %1
>+ psubw %2, %3
>+ movh [dstq], %2
>+ pshufd %2, %2, 2
>+ movd [dstq + 8], %2
>+%endmacro
>+
>+%macro PROCESS_CHROMA_W12 3
>+ movu %1, [srcq - 1]
>+ pshufb %2, %1, Tm0
>+ pmaddubsw %2, coef2
>+ pshufb %1, %1, Tm1
>+ pmaddubsw %1, coef2
>+ phaddw %2, %1
>+ psubw %2, %3
>+ movu [dstq], %2
>+ movu %1, [srcq - 1 + 8]
>+ pshufb %1, %1, Tm0
>+ pmaddubsw %1, coef2
>+ phaddw %1, %1
>+ psubw %1, %3
>+ movh [dstq + 16], %1
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_HORIZ_CHROMA 2
>+INIT_XMM sse4
>+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
>+%define coef2 m5
>+%define Tm0 m4
>+%define Tm1 m3
>+%define t2 m2
>+%define t1 m1
>+%define t0 m0
>+
>+mov r4d, r4m
>+add dststrided, dststrided
>+
>+%ifdef PIC
>+lea r5, [tab_ChromaCoeff]
>+movd coef2, [r5 + r4 * 4]
>+%else
>+movd coef2, [tab_ChromaCoeff + r4 * 4]
>+%endif
>+
>+mov r5d, %2
>+
>+pshufd coef2, coef2, 0
>+mova t2, [tab_c_8192]
>+mova Tm0, [tab_Tm]
>+mova Tm1, [tab_Tm + 16]
>+
>+.loop
>+PROCESS_CHROMA_W%1 t0, t1, t2
>+add srcq, srcstrideq
>+add dstq, dststrideq
>+
>+dec r5d
>+jnz .loop
>+
>+RET
>+%endmacro
>+
>+FILTER_HORIZ_CHROMA 6, 8
>+FILTER_HORIZ_CHROMA 12, 16
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131113/8a0b98b2/attachment-0001.html>
More information about the x265-devel
mailing list