[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2
chen
chenm003 at 163.com
Tue Apr 28 05:42:29 CEST 2015
At 2015-04-28 09:05:06,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen dtyx265 at gmail.com>
># Date 1430182995 25200
># Node ID 31b76bd430a47411f7b2ebaa7cfbb44e25c5ff60
># Parent 68a13226d586b335c02cade9311e093f0149c42a
>asm: interp_8tap_horiz pp and ps sse2
>
>This replaces c code and covers
>
>diff -r 68a13226d586 -r 31b76bd430a4 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Sat Apr 25 01:39:55 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Mon Apr 27 18:03:15 2015 -0700
>@@ -1340,6 +1340,10 @@
> CHROMA_420_VSP_FILTERS(_sse2);
> CHROMA_422_VSP_FILTERS(_sse2);
> CHROMA_444_VSP_FILTERS(_sse2);
>+ ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
>+ p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
>+ ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
>+ p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
>
> //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
> p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>diff -r 68a13226d586 -r 31b76bd430a4 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Sat Apr 25 01:39:55 2015 -0500
>+++ b/source/common/x86/ipfilter8.asm Mon Apr 27 18:03:15 2015 -0700
>@@ -160,6 +160,11 @@
> db -1, 4, -11, 40, 40, -11, 4, -1
> db 0, 1, -5, 17, 58, -10, 4, -1
>
>+tabw_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
>+ dw -1, 4, -10, 58, 17, -5, 1, 0
>+ dw -1, 4, -11, 40, 40, -11, 4, -1
>+ dw 0, 1, -5, 17, 58, -10, 4, -1
>+
> tab_LumaCoeffV: times 4 dw 0, 0
> times 4 dw 0, 64
> times 4 dw 0, 0
>@@ -825,6 +830,230 @@
> IPFILTER_CHROMA_W_sse3 48, 64
> IPFILTER_CHROMA_W_sse3 64, 16
>
>+%macro FILTER_H8_W8_sse2 0
>+ movh m1, [r0 + r5 - 3]
The constant offset (-3) can be avoided by preparing r0 in advance (adjust r0 once instead of adding -3 to every load address).
>+ movh m4, [r0 + r5 - 3 + 1]
>+ punpcklbw m1, m6
>+ punpcklbw m4, m6
>+ movh m5, [r0 + r5 - 3 + 2]
>+ movh m0, [r0 + r5 - 3 + 3]
>+ punpcklbw m5, m6
>+ punpcklbw m0, m6
>+ pmaddwd m1, m3
>+ pmaddwd m4, m3
>+ pmaddwd m5, m3
>+ pmaddwd m0, m3
>+ packssdw m1, m4
>+ packssdw m5, m0
>+ pshuflw m4, m1, q2301
>+ pshufhw m4, m4, q2301
>+ pshuflw m0, m5, q2301
>+ pshufhw m0, m0, q2301
>+ paddw m1, m4
>+ paddw m5, m0
>+ psrldq m1, 2
>+ psrldq m5, 2
>+ pshufd m1, m1, q3120
>+ pshufd m5, m5, q3120
>+ punpcklqdq m1, m5
m1 becomes valid here.
>+ movh m7, [r0 + r5 - 3 + 4]
>+ movh m4, [r0 + r5 - 3 + 5]
>+ punpcklbw m7, m6
>+ punpcklbw m4, m6
>+ movh m5, [r0 + r5 - 3 + 6]
>+ movh m0, [r0 + r5 - 3 + 7]
>+ punpcklbw m5, m6
>+ punpcklbw m0, m6
>+ pmaddwd m7, m3
>+ pmaddwd m4, m3
>+ pmaddwd m5, m3
>+ pmaddwd m0, m3
>+ packssdw m7, m4
>+ packssdw m5, m0
>+ pshuflw m4, m7, q2301
>+ pshufhw m4, m4, q2301
>+ pshuflw m0, m5, q2301
>+ pshufhw m0, m0, q2301
>+ paddw m7, m4
>+ paddw m5, m0
>+ psrldq m7, 2
>+ psrldq m5, 2
>+ pshufd m7, m7, q3120
>+ pshufd m5, m5, q3120
>+ punpcklqdq m7, m5
>+ pshuflw m4, m1, q2301
m1 is first used here, a long distance from the point where it was produced.
>+ pshufhw m4, m4, q2301
>+ pshuflw m0, m7, q2301
>+ pshufhw m0, m0, q2301
>+ paddw m1, m4
>+ paddw m7, m0
>+ psrldq m1, 2
>+ psrldq m7, 2
>+ pshufd m1, m1, q3120
>+ pshufd m7, m7, q3120
>+ punpcklqdq m1, m7
>+%endmacro
>+
>+;----------------------------------------------------------------------------------------------------------------------------
>+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>+;----------------------------------------------------------------------------------------------------------------------------
>+%macro IPFILTER_LUMA_sse2 3
>+INIT_XMM sse2
>+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
>+
>+ mov r4d, r4m
>+ add r4d, r4d
>+ pxor m6, m6
>+%ifdef PIC
>+ lea r6, [tabw_LumaCoeff]
>+ mova m3, [r6 + r4 * 8]
>+%else
>+ mova m3, [tabw_LumaCoeff + r4 * 8]
>+%endif
>+
>+ mov r4d, %2
>+%ifidn %3, pp
>+ mova m2, [pw_32]
>+%else
>+ mova m2, [pw_2000]
>+ add r3d, r3d
>+ cmp r5m, byte 0
>+ je .loopH
>+ lea r6, [r1 + 2 * r1]
>+ sub r0d, r6d
>+ add r4d, 7
>+%endif
>+
>+.loopH:
>+ xor r5d, r5d
>+%rep %1 / 8
Since you use %rep here, why do you need the physical register r5?
>+ FILTER_H8_W8_sse2
>+ %ifidn %3, pp
>+ paddw m1, m2
>+ psraw m1, 6
>+ packuswb m1, m1
>+ movh [r2 + r5], m1
>+ %else
>+ psubw m1, m2
>+ movu [r2 + 2 * r5], m1
>+ %endif
>+ add r5d, 8
>+%endrep
>+
>+%rep (%1 % 8) / 4
>+ FILTER_H8_W4_sse2
>+ %ifidn %3, pp
>+ paddw m1, m2
>+ psraw m1, 6
>+ packuswb m1, m1
>+ movd [r2 + r5], m1
>+ %else
>+ psubw m1, m2
>+ movh [r2 + 2 * r5], m1
>+ %endif
>+%endrep
>+
>+ add r0d, r1d
>+ add r2d, r3d
>+
>+ dec r4d
>+ jnz .loopH
>+
There should be no extra blank line here.
>+ RET
>+
>+%endmacro
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150428/c8d6feda/attachment.html>
More information about the x265-devel
mailing list