[x265] [PATCH] asm: interp_8tap_hv_pp_8x8 sse3

Steve Borho steve at borho.org
Fri May 1 19:03:09 CEST 2015


On 04/29, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1430361608 25200
> # Node ID f95cc094467c844c6607c67d330748d171d26483
> # Parent  9a1b8b71bc997547044f42992e1eb7f3572f03f1
> asm: interp_8tap_hv_pp_8x8 sse3

These two patches are queued for testing; the larger one was still
crashing for me.

> This replaces c code
> 
> 64-bit
> 
> ./test/TestBench --testbench interp | grep hv
> luma_hv [  8x8]		2.53x 	 14225.03 	 35970.65
> 
> 32-bit
> 
> ./test/TestBench --testbench interp | grep hv
> luma_hv [  8x8]		2.50x 	 14367.40 	 35917.48
> 
> diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Wed Apr 29 08:23:45 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp	Wed Apr 29 19:40:08 2015 -0700
> @@ -1347,6 +1347,7 @@
>          p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
>          ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
>          p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> +        p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
>  
>          //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
>          p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm	Wed Apr 29 08:23:45 2015 -0700
> +++ b/source/common/x86/ipfilter8.asm	Wed Apr 29 19:40:08 2015 -0700
> @@ -3464,6 +3464,78 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> +; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse3
> +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16    ; 15*16 bytes of stack: one 16-byte slot per horizontally filtered row
> +    mov         r4d,        r4m                     ; r4d = idxX
> +    mov         r5d,        r5m                     ; r5d = idxY
> +    add         r4d,        r4d                     ; r4 = idxX * 2, so r4 * 8 below is a byte offset of idxX * 16
> +    pxor        m6,         m6                      ; m6 = 0 (presumably consumed by FILTER_H8_W8_sse2 -- confirm)
> +
> +%ifdef PIC
> +    lea         r6,         [tabw_LumaCoeff]        ; PIC: take the table address first, then index it
> +    mova        m3,         [r6 + r4 * 8]           ; m3 = tabw_LumaCoeff entry selected by idxX
> +%else
> +    mova        m3,         [tabw_LumaCoeff + r4 * 8]   ; m3 = tabw_LumaCoeff entry selected by idxX
> +%endif
> +
> +    ; move to row -3 (the first of the 15 input rows read below)
> +    lea         r6,         [r1 + r1 * 2]           ; r6 = 3 * srcStride
> +    sub         r0,         r6                      ; src -= 3 * srcStride
> +
> +    mov         r4,         rsp                     ; r4 = write pointer into the stack buffer
> +
> +%assign x 0     ;needed for FILTER_H8_W8_sse2 macro
> +%assign y 1
> +%rep 15                                             ; rows -3..11: 8 output rows plus 7 extra for the 8 vertical taps
> +    FILTER_H8_W8_sse2                               ; result is read from m1 below
> +    psubw       m1,         [pw_2000]               ; subtract the pw_2000 constant from every word
> +    mova        [r4],       m1                      ; store this row in the stack buffer
> +
> +%if y < 15                                          ; no pointer advance after the last row
> +    add         r0,         r1                      ; next source row
> +    add         r4,         16                      ; next 16-byte slot in the stack buffer
> +%endif
> +%assign y y+1
> +%endrep
> +
> +    ; horizontal pass done, ready for the vertical (V) phase
> +    ; all mN registers are free here
> +
> +    ; load coeff table
> +    shl         r5,         6                       ; r5 = idxY * 64 (byte offset into tab_LumaCoeffV)
> +    lea         r6,         [tab_LumaCoeffV]
> +    lea         r5,         [r5 + r6]               ; r5 = tab_LumaCoeffV + idxY * 64
> +
> +    ; load intermediate buffer
> +    mov         r0,         rsp                     ; r0 = first row stored by the horizontal pass
> +
> +    ; register mapping
> +    ; r0 - src (intermediate rows on the stack)
> +    ; r5 - coeff
> +
> +    ; let's go
> +%assign y 1
> +%rep 4                                              ; 4 iterations, each storing 2 output rows
> +    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
> +    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
> +    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
> +    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
> +    FILTER_HV8_END      m3, m0, m4, m1              ; output is read from m3 below
> +
> +    movh        [r2],       m3                      ; low 8 bytes of m3 -> dst row
> +    movhps      [r2 + r3],  m3                      ; high 8 bytes of m3 -> following dst row
> +
> +%if y < 4                                           ; no pointer advance after the last pair of rows
> +    lea         r0,         [r0 + 16 * 2]           ; skip 2 intermediate rows (16 bytes each)
> +    lea         r2,         [r2 + r3 * 2]           ; dst += 2 * dstStride
> +%endif
> +%assign y y+1
> +%endrep
> +    RET
> +
> +;-----------------------------------------------------------------------------
>  ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h	Wed Apr 29 08:23:45 2015 -0700
> +++ b/source/common/x86/ipfilter8.h	Wed Apr 29 19:40:08 2015 -0700
> @@ -900,6 +900,7 @@
>  void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
>  void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
>  void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_hv_pp_8x8_sse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
>  #undef LUMA_FILTERS
>  #undef LUMA_SP_FILTERS
>  #undef LUMA_SS_FILTERS
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list