[x264-devel] [PATCH 13/24] arm: Implement x264_denoise_dct_neon

Tue Aug 18 11:55:02 CEST 2015

On 2015-08-13 23:59:34 +0300, Martin Storsjö wrote:
> checkasm timing       Cortex-A7      A8     A9
> denoise_dct_c                6605    5515   5950
> denoise_dct_neon             1885    1178   1887
> ---
>  common/arm/quant-a.S |   31 +++++++++++++++++++++++++++++++
>  common/arm/quant.h   |    2 ++
>  common/quant.c       |    2 +-
>  3 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
> index ad8d8f8..e3d5cd2 100644
> --- a/common/arm/quant-a.S
> +++ b/common/arm/quant-a.S
> @@ -4,6 +4,7 @@
>   * Copyright (C) 2009-2015 x264 project
>   *
>   * Authors: David Conrad <lessen42 at gmail.com>
> + *          Janne Grunau <janne-x264 at jannau.net>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -404,3 +405,33 @@ function x264_coeff_last64_neon
>      movlt       r0,  #0
>      bx          lr
>  endfunc
> +
> +function x264_denoise_dct_neon
> +    vpush       {q4-q7}

After a cursory look, it should be no problem to do the same computation in
12 128-bit registers.

> +1:  subs        r3,  r3,  #16
> +    vld1.16     {q0, q1}, [r0]
> +    vld1.32     {q4, q5}, [r1]!
> +    vld1.32     {q6, q7}, [r1]
> +    sub         r1,  #32
> +    vabs.s16    q8,  q0
> +    vabs.s16    q9,  q1
> +    vld1.16     {q2, q3}, [r2]!
> +    vclt.s16    q10, q0,  #0
> +    vclt.s16    q11, q1,  #0

q0 and q1 are unused after this

> +    vaddw.u16   q4,  q4,  d16
> +    vaddw.u16   q5,  q5,  d17
> +    vqsub.u16   q12, q8,  q2
> +    vqsub.u16   q13, q9,  q3

q2, q3, q8 and q9 are unused after this; that should free up enough
registers to keep q4-q7 unmodified

> +    vaddw.u16   q6,  q6,  d18
> +    vaddw.u16   q7,  q7,  d19
> +    vneg.s16    q14, q12
> +    vneg.s16    q15, q13
> +    vbsl        q10, q14, q12
> +    vbsl        q11, q15, q13
> +    vst1.32     {q4, q5}, [r1]!
> +    vst1.32     {q6, q7}, [r1]!
> +    vst1.16     {q10, q11}, [r0]!
> +    bgt         1b
> +    vpop        {q4-q7}
> +    bx          lr
> +endfunc
> diff --git a/common/arm/quant.h b/common/arm/quant.h
> index 8ea179a..78178e8 100644
> --- a/common/arm/quant.h
> +++ b/common/arm/quant.h
> @@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
>  int x264_coeff_last16_neon( int16_t * );
>  int x264_coeff_last64_neon( int16_t * );
>  
> +void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
> +
>  #endif
> diff --git a/common/quant.c b/common/quant.c
> index bc9e8d7..f8279a7 100644
> --- a/common/quant.c
> +++ b/common/quant.c
> @@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
>          pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
>          pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
>          pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
> +        pf->denoise_dct = x264_denoise_dct_neon;
>      }
>  #endif
>  #if ARCH_AARCH64
> @@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
>          pf->decimate_score15 = x264_decimate_score15_neon;
>          pf->decimate_score16 = x264_decimate_score16_neon;
>          pf->decimate_score64 = x264_decimate_score64_neon;
> -        pf->denoise_dct = x264_denoise_dct_neon;
>      }
>  #endif

ok otherwise

Janne