[x265] [PATCH] asm: interp_4tap_vert_pX_4xN sse2

Steve Borho steve at borho.org
Wed May 20 17:54:29 CEST 2015


On 05/19, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1432085346 25200
> # Node ID e096c40ce8ff9c170bdb8caa094f53b30ebd7db7
> # Parent  3e07cba4b2034db2b819b2e11e98ee4b851d52b5
> asm: interp_4tap_vert_pX_4xN sse2
> 
> Improved register usage for output addressing.  This improves 64-bit performance by 0.7% to 2.5%.
> Also added interp_4tap_vert_ps_4x32 to the primitives setup.

queued

> diff -r 3e07cba4b203 -r e096c40ce8ff source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Tue May 19 14:27:04 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp	Tue May 19 18:29:06 2015 -0700
> @@ -1482,6 +1482,7 @@
>          p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_sse2;
>          p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_sse2;
>          p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_sse2;
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = x265_interp_4tap_vert_ps_4x32_sse2;
>          p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_sse2;
>          p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_sse2;
>          p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_sse2;
> diff -r 3e07cba4b203 -r e096c40ce8ff source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm	Tue May 19 14:27:04 2015 -0700
> +++ b/source/common/x86/ipfilter8.asm	Tue May 19 18:29:06 2015 -0700
> @@ -1139,6 +1139,7 @@
>  %endif
>  
>      lea         r5,        [3 * r1]
> +    lea         r4,        [3 * r3]
>      punpcklqdq  m0,        m0
>  
>  %assign x 1
> @@ -1243,11 +1244,10 @@
>      movd        [r2],      m2
>      psrldq      m2,        4
>      movd        [r2 + r3], m2
> -    lea         r2,        [r2 + 2 * r3]
>      psrldq      m2,        4
> -    movd        [r2],      m2
> +    movd        [r2 + 2 * r3],      m2
>      psrldq      m2,        4
> -    movd        [r2 + r3], m2
> +    movd        [r2 + r4], m2
>  %elifidn %1,ps
>      psrldq      m4,        2
>      psrldq      m5,        2
> @@ -1255,13 +1255,12 @@
>      pshufd      m5,        m5, q3120
>      punpcklqdq  m4,        m5
>      psubw       m4,        m1
> -    lea         r2,        [r2 + 2 * r3]
> -    movh        [r2],      m4
> -    movhps      [r2 + r3], m4
> +    movh        [r2 + 2 * r3],      m4
> +    movhps      [r2 + r4], m4
>  %endif
>  
>  %if x < %2/4
> -    lea         r2,        [r2 + 2 * r3]
> +    lea         r2,        [r2 + 4 * r3]
>  %endif
>  
>  %assign x x+1
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list