[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2
dave
dtyx265 at gmail.com
Tue Apr 28 06:13:54 CEST 2015
On 04/27/2015 08:42 PM, chen wrote:
>
> At 2015-04-28 09:05:06,dtyx265 at gmail.com wrote:
> ># HG changeset patch
> ># User David T Yuen <dtyx265 at gmail.com>
> ># Date 1430182995 25200
> ># Node ID 31b76bd430a47411f7b2ebaa7cfbb44e25c5ff60
> ># Parent 68a13226d586b335c02cade9311e093f0149c42a
> >asm: interp_8tap_horiz pp and ps sse2
> >
> >This replaces c code and covers
> >
> <mailto:dtyx265 at gmail.com>
> >diff -r 68a13226d586 -r 31b76bd430a4 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp Sat Apr 25 01:39:55 2015 -0500
> >+++ b/source/common/x86/asm-primitives.cpp Mon Apr 27 18:03:15 2015 -0700
> >@@ -1340,6 +1340,10 @@
> > CHROMA_420_VSP_FILTERS(_sse2);
> > CHROMA_422_VSP_FILTERS(_sse2);
> > CHROMA_444_VSP_FILTERS(_sse2);
> >+ ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
> >+ p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
> >+ ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> >+ p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> >
> > //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
> > p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> >diff -r 68a13226d586 -r 31b76bd430a4 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm Sat Apr 25 01:39:55 2015 -0500
> >+++ b/source/common/x86/ipfilter8.asm Mon Apr 27 18:03:15 2015 -0700
> >@@ -160,6 +160,11 @@
> > db -1, 4, -11, 40, 40, -11, 4, -1
> > db 0, 1, -5, 17, 58, -10, 4, -1
> >
> >+tabw_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
> >+ dw -1, 4, -10, 58, 17, -5, 1, 0
> >+ dw -1, 4, -11, 40, 40, -11, 4, -1
> >+ dw 0, 1, -5, 17, 58, -10, 4, -1
> >+
> > tab_LumaCoeffV: times 4 dw 0, 0
> > times 4 dw 0, 64
> > times 4 dw 0, 0
> >@@ -825,6 +830,230 @@
> > IPFILTER_CHROMA_W_sse3 48, 64
> > IPFILTER_CHROMA_W_sse3 64, 16
> >
> >+%macro FILTER_H8_W8_sse2 0
> >+ movh m1, [r0 + r5 - 3]
> The offset field (-3) can be avoided by preparing r0 in advance.
I could also use r5 - 3, r5 - 2, r5 - 1... instead of r5 - 3 + n.
> >+ movh m4, [r0 + r5 - 3 + 1]
> >+ punpcklbw m1, m6
> >+ punpcklbw m4, m6
> >+ movh m5, [r0 + r5 - 3 + 2]
> >+ movh m0, [r0 + r5 - 3 + 3]
> >+ punpcklbw m5, m6
> >+ punpcklbw m0, m6
> >+ pmaddwd m1, m3
> >+ pmaddwd m4, m3
> >+ pmaddwd m5, m3
> >+ pmaddwd m0, m3
> >+ packssdw m1, m4
> >+ packssdw m5, m0
> >+ pshuflw m4, m1, q2301
> >+ pshufhw m4, m4, q2301
> >+ pshuflw m0, m5, q2301
> >+ pshufhw m0, m0, q2301
> >+ paddw m1, m4
> >+ paddw m5, m0
> >+ psrldq m1, 2
> >+ psrldq m5, 2
> >+ pshufd m1, m1, q3120
> >+ pshufd m5, m5, q3120
> >+ punpcklqdq m1, m5
> m1 is valid here
> >+ movh m7, [r0 + r5 - 3 + 4]
> >+ movh m4, [r0 + r5 - 3 + 5]
> >+ punpcklbw m7, m6
> >+ punpcklbw m4, m6
> >+ movh m5, [r0 + r5 - 3 + 6]
> >+ movh m0, [r0 + r5 - 3 + 7]
> >+ punpcklbw m5, m6
> >+ punpcklbw m0, m6
> >+ pmaddwd m7, m3
> >+ pmaddwd m4, m3
> >+ pmaddwd m5, m3
> >+ pmaddwd m0, m3
> >+ packssdw m7, m4
> >+ packssdw m5, m0
> >+ pshuflw m4, m7, q2301
> >+ pshufhw m4, m4, q2301
> >+ pshuflw m0, m5, q2301
> >+ pshufhw m0, m0, q2301
> >+ paddw m7, m4
> >+ paddw m5, m0
> >+ psrldq m7, 2
> >+ psrldq m5, 2
> >+ pshufd m7, m7, q3120
> >+ pshufd m5, m5, q3120
> >+ punpcklqdq m7, m5
> >+ pshuflw m4, m1, q2301
> m1 is used here, a long distance from where it became valid
I am not sure what is wrong with this.
> >+ pshufhw m4, m4, q2301
> >+ pshuflw m0, m7, q2301
> >+ pshufhw m0, m0, q2301
> >+ paddw m1, m4
> >+ paddw m7, m0
> >+ psrldq m1, 2
> >+ psrldq m7, 2
> >+ pshufd m1, m1, q3120
> >+ pshufd m7, m7, q3120
> >+ punpcklqdq m1, m7
> >+%endmacro
> >+
>
> >+;----------------------------------------------------------------------------------------------------------------------------
> >+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> >+;----------------------------------------------------------------------------------------------------------------------------
> >+%macro IPFILTER_LUMA_sse2 3
> >+INIT_XMM sse2
> >+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
> >+
> >+ mov r4d, r4m
> >+ add r4d, r4d
> >+ pxor m6, m6
> >+%ifdef PIC
> >+ lea r6, [tabw_LumaCoeff]
> >+ mova m3, [r6 + r4 * 8]
> >+%else
> >+ mova m3, [tabw_LumaCoeff + r4 * 8]
> >+%endif
> >+
> >+ mov r4d, %2
> >+%ifidn %3, pp
> >+ mova m2, [pw_32]
> >+%else
> >+ mova m2, [pw_2000]
> >+ add r3d, r3d
> >+ cmp r5m, byte 0
> >+ je .loopH
> >+ lea r6, [r1 + 2 * r1]
> >+ sub r0d, r6d
> >+ add r4d, 7
> >+%endif
> >+
> >+.loopH:
> >+ xor r5d, r5d
> >+%rep %1 / 8
> %rep is used here, so why do you need the physical register r5?
Each iteration of the loop completes a row. r5 is the index across the
row and must be reset at the beginning of each loop iteration.
> >+ FILTER_H8_W8_sse2
> >+ %ifidn %3, pp
> >+ paddw m1, m2
> >+ psraw m1, 6
> >+ packuswb m1, m1
> >+ movh [r2 + r5], m1
> >+ %else
> >+ psubw m1, m2
> >+ movu [r2 + 2 * r5], m1
> >+ %endif
> >+ add r5d, 8
> >+%endrep
> >+
> >+%rep (%1 % 8) / 4
> >+ FILTER_H8_W4_sse2
> >+ %ifidn %3, pp
> >+ paddw m1, m2
> >+ psraw m1, 6
> >+ packuswb m1, m1
> >+ movd [r2 + r5], m1
> >+ %else
> >+ psubw m1, m2
> >+ movh [r2 + 2 * r5], m1
> >+ %endif
> >+%endrep
> >+
> >+ add r0d, r1d
> >+ add r2d, r3d
> >+
> >+ dec r4d
> >+ jnz .loopH
> >+
> no extra blank line here
> will remove...
> >+ RET
> >+
> >+%endmacro
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150427/910c5de9/attachment-0001.html>
More information about the x265-devel
mailing list