[x265] [PATCH] asm: interp_4tap_vert_pX_4xN sse2
Steve Borho
steve at borho.org
Wed May 20 17:54:29 CEST 2015
On 05/19, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1432085346 25200
> # Node ID e096c40ce8ff9c170bdb8caa094f53b30ebd7db7
> # Parent 3e07cba4b2034db2b819b2e11e98ee4b851d52b5
> asm: interp_4tap_vert_pX_4xN sse2
>
> Improved register usage for addressing of output. This improves performance on 64-bit builds by 0.7% to 2.5%.
> Also added interp_4tap_vert_ps_4x32 in primitives setup.
queued
> diff -r 3e07cba4b203 -r e096c40ce8ff source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue May 19 14:27:04 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp Tue May 19 18:29:06 2015 -0700
> @@ -1482,6 +1482,7 @@
> p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_sse2;
> p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_sse2;
> p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_sse2;
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = x265_interp_4tap_vert_ps_4x32_sse2;
> p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_sse2;
> p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_sse2;
> p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_sse2;
> diff -r 3e07cba4b203 -r e096c40ce8ff source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Tue May 19 14:27:04 2015 -0700
> +++ b/source/common/x86/ipfilter8.asm Tue May 19 18:29:06 2015 -0700
> @@ -1139,6 +1139,7 @@
> %endif
>
> lea r5, [3 * r1]
> + lea r4, [3 * r3]
> punpcklqdq m0, m0
>
> %assign x 1
> @@ -1243,11 +1244,10 @@
> movd [r2], m2
> psrldq m2, 4
> movd [r2 + r3], m2
> - lea r2, [r2 + 2 * r3]
> psrldq m2, 4
> - movd [r2], m2
> + movd [r2 + 2 * r3], m2
> psrldq m2, 4
> - movd [r2 + r3], m2
> + movd [r2 + r4], m2
> %elifidn %1,ps
> psrldq m4, 2
> psrldq m5, 2
> @@ -1255,13 +1255,12 @@
> pshufd m5, m5, q3120
> punpcklqdq m4, m5
> psubw m4, m1
> - lea r2, [r2 + 2 * r3]
> - movh [r2], m4
> - movhps [r2 + r3], m4
> + movh [r2 + 2 * r3], m4
> + movhps [r2 + r4], m4
> %endif
>
> %if x < %2/4
> - lea r2, [r2 + 2 * r3]
> + lea r2, [r2 + 4 * r3]
> %endif
>
> %assign x x+1
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list