[x264-devel] [PATCH 17/24] arm: Implement x264_deblock_h_chroma_mbaff_neon

Janne Grunau janne-x264 at jannau.net
Thu Aug 20 17:46:53 CEST 2015


On 2015-08-13 23:59:38 +0300, Martin Storsjö wrote:
> checkasm timing        Cortex-A7      A8     A9
> deblock_chroma_420_mbaff_c    1944    1706   1526
> deblock_chroma_420_mbaff_neon 1210    873    865
> ---
>  common/arm/deblock-a.S |   57 ++++++++++++++++++++++++++++++++++++++++++++++++
>  common/deblock.c       |    4 ++--
>  2 files changed, 59 insertions(+), 2 deletions(-)
> 
> diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
> index 26e95ed..d5210e0 100644
> --- a/common/arm/deblock-a.S
> +++ b/common/arm/deblock-a.S
> @@ -309,6 +309,63 @@ function x264_deblock_h_chroma_422_neon
>      pop             {pc}
>  endfunc
>  
> +.macro h264_loop_filter_chroma8     @ filters p0 (d16) / q0 (d0) using p1 (d18), q1 (d2), alpha (r2), beta (r3); d24 is read as an input
> +    vdup.8          d22, r2         @ d22 = alpha (low byte of r2) in every byte lane
> +    vmovl.u8        q12, d24        @ widen the bytes of d24 to u16 in q12 (d24/d25); d24 presumably holds tc0 - TODO confirm
> +    vabd.u8         d26, d16, d0    @ d26 = abs(p0 - q0)
> +    vabd.u8         d28, d18, d16   @ d28 = abs(p1 - p0)
> +    vsubl.u8        q2,  d0,  d16   @ q2 = q0 - p0, widened to 16 bits
> +    vsli.16         d24, d24, #8    @ copy each lane's low byte into its high byte: one clip value per pair of bytes
> +    vshl.i16        q2,  q2,  #2    @ q2 = (q0 - p0) * 4
> +    vabd.u8         d30, d2,  d0    @ d30 = abs(q1 - q0)
> +    vaddw.u8        q2,  q2,  d18   @ q2 += p1
> +    vclt.u8         d26, d26, d22   @ d26 = mask: abs(p0 - q0) < alpha
> +    vsubw.u8        q2,  q2,  d2    @ q2 -= q1
> +    vdup.8          d22, r3         @ d22 = beta (low byte of r3) in every byte lane
> +    vclt.s8         d20, d24, #0    @ d20 = mask: clip value in d24 is negative
> +    vrshrn.i16      d4,  q2,  #3    @ d4 = delta = (q2 + 4) >> 3, narrowed to 8 bits
> +    vclt.u8         d28, d28, d22   @ d28 = mask: abs(p1 - p0) < beta
> +    vbic            d26, d26, d20   @ remove lanes with a negative clip value from the filter mask
> +    vclt.u8         d30, d30, d22   @ d30 = mask: abs(q1 - q0) < beta
> +    vand            d26, d26, d28   @ mask &= abs(p1 - p0) < beta
> +    vneg.s8         d20, d24        @ d20 = -d24, the lower clip bound
> +    vand            d26, d26, d30   @ mask &= abs(q1 - q0) < beta
> +    vmin.s8         d4,  d4,  d24   @ clip from above: delta = min(delta, d24)
> +    vmovl.u8        q14, d16        @ q14 = p0 widened to 16 bits (reuses d28, already folded into d26)
> +    vand            d4,  d4,  d26   @ zero delta in lanes where the filter mask is clear
> +    vmax.s8         d4,  d4,  d20   @ clip from below: delta = max(delta, -d24)
> +    vmovl.u8        q11, d0         @ q11 = q0 widened to 16 bits (reuses d22, beta no longer needed)
> +    vaddw.s8        q14, q14, d4    @ q14 = p0 + delta
> +    vsubw.s8        q11, q11, d4    @ q11 = q0 - delta
> +    vqmovun.s16     d16, q14        @ d16 = new p0, saturated to 0..255
> +    vqmovun.s16     d0,  q11        @ d0 = new q0, saturated to 0..255
> +.endm
> +
> +function x264_deblock_h_chroma_mbaff_neon   @ C prototype: (uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0); r0 = pix, r1 = stride
> +    h264_loop_filter_start          @ macro defined outside this hunk; presumably loads tc0 into d24 and may return early - TODO confirm
> +
> +    sub             r0,  r0,  #4    @ start each row load 4 bytes to the left of pix
> +    vld1.8          {d18}, [r0], r1 @ row 0: load 8 bytes, then advance r0 by stride
> +    vld1.8          {d16}, [r0], r1 @ row 1
> +    vld1.8          {d0},  [r0], r1 @ row 2
> +    vld1.8          {d2},  [r0], r1 @ row 3
> +
> +    TRANSPOSE4x4_16 d18, d16, d0, d2    @ macro defined outside this hunk; presumably leaves d18=p1 d16=p0 d0=q0 d2=q1, one 16-bit pair per row - TODO confirm
> +
> +    h264_loop_filter_chroma8        @ filter the edge: new p0 in d16, new q0 in d0
> +
> +    vtrn.16         d16, d0         @ interleave 16-bit lanes: d16 = {p0,q0} of lanes 0 and 2, d0 = {p0,q0} of lanes 1 and 3
> +
> +    sub             r0,  r0,  r1, lsl #2    @ rewind r0 by 4 * stride, back to row 0
> +    add             r0,  r0,  #2    @ skip the first 2 bytes of the row; only the 4 bytes holding p0,q0 are written back
> +    vst1.32         {d16[0]}, [r0], r1  @ row 0: store 4 bytes, then advance r0 by stride
> +    vst1.32         {d0[0]},  [r0], r1  @ row 1
> +    vst1.32         {d16[1]}, [r0], r1  @ row 2
> +    vst1.32         {d0[1]},  [r0]      @ row 3 (no post-increment needed)
> +
> +    bx              lr              @ return to caller
> +endfunc
> +
>  function x264_deblock_strength_neon
>      ldr             ip,  [sp]
>      vmov.i8         q8,  #0
> diff --git a/common/deblock.c b/common/deblock.c
> index 83bda62..1d398ad 100644
> --- a/common/deblock.c
> +++ b/common/deblock.c
> @@ -740,8 +740,8 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
>                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
>                                   int mvy_limit, int bframe );
>  void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
> -#if ARCH_AARCH64
>  void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
> +#if ARCH_AARCH64
>  void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
>  void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
>  void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> @@ -874,8 +874,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
>          pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
>          pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
>          pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
> -#if ARCH_AARCH64
>          pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
> +#if ARCH_AARCH64
>          pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
>          pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
>          pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;

ok

Janne


More information about the x264-devel mailing list