[x265] [PATCH] asm: interp_8tap_hv_pp_8x8 sse3
Steve Borho
steve at borho.org
Fri May 1 19:03:09 CEST 2015
On 04/29, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1430361608 25200
> # Node ID f95cc094467c844c6607c67d330748d171d26483
> # Parent 9a1b8b71bc997547044f42992e1eb7f3572f03f1
> asm: interp_8tap_hv_pp_8x8 sse3
These two patches are queued for testing; the larger one was still
crashing for me.
> This replaces c code
>
> 64-bit
>
> ./test/TestBench --testbench interp | grep hv
> luma_hv [ 8x8] 2.53x 14225.03 35970.65
>
> 32-bit
>
> ./test/TestBench --testbench interp | grep hv
> luma_hv [ 8x8] 2.50x 14367.40 35917.48
>
> diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Apr 29 08:23:45 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp Wed Apr 29 19:40:08 2015 -0700
> @@ -1347,6 +1347,7 @@
> p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
> ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> + p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
>
> //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
> p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Wed Apr 29 08:23:45 2015 -0700
> +++ b/source/common/x86/ipfilter8.asm Wed Apr 29 19:40:08 2015 -0700
> @@ -3464,6 +3464,78 @@
> RET
>
> ;-----------------------------------------------------------------------------
> +; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse3
> +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
> + mov r4d, r4m
> + mov r5d, r5m
> + add r4d, r4d
> + pxor m6, m6
> +
> +%ifdef PIC
> + lea r6, [tabw_LumaCoeff]
> + mova m3, [r6 + r4 * 8]
> +%else
> + mova m3, [tabw_LumaCoeff + r4 * 8]
> +%endif
> +
> + ; move to row -3
> + lea r6, [r1 + r1 * 2]
> + sub r0, r6
> +
> + mov r4, rsp
> +
> +%assign x 0 ;needed for FILTER_H8_W8_sse2 macro
> +%assign y 1
> +%rep 15
> + FILTER_H8_W8_sse2
> + psubw m1, [pw_2000]
> + mova [r4], m1
> +
> +%if y < 15
> + add r0, r1
> + add r4, 16
> +%endif
> +%assign y y+1
> +%endrep
> +
> + ; ready to phase V
> + ; Here all of mN is free
> +
> + ; load coeff table
> + shl r5, 6
> + lea r6, [tab_LumaCoeffV]
> + lea r5, [r5 + r6]
> +
> + ; load intermedia buffer
> + mov r0, rsp
> +
> + ; register mapping
> + ; r0 - src
> + ; r5 - coeff
> +
> + ; let's go
> +%assign y 1
> +%rep 4
> + FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
> + FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
> + FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
> + FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
> + FILTER_HV8_END m3, m0, m4, m1
> +
> + movh [r2], m3
> + movhps [r2 + r3], m3
> +
> +%if y < 4
> + lea r0, [r0 + 16 * 2]
> + lea r2, [r2 + r3 * 2]
> +%endif
> +%assign y y+1
> +%endrep
> + RET
> +
> +;-----------------------------------------------------------------------------
> ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h Wed Apr 29 08:23:45 2015 -0700
> +++ b/source/common/x86/ipfilter8.h Wed Apr 29 19:40:08 2015 -0700
> @@ -900,6 +900,7 @@
> void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_hv_pp_8x8_sse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
> #undef LUMA_FILTERS
> #undef LUMA_SP_FILTERS
> #undef LUMA_SS_FILTERS
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list