[x264-devel] [PATCH 13/24] arm: Implement x264_denoise_dct_neon
Janne Grunau
janne-x264 at jannau.net
Tue Aug 18 11:55:02 CEST 2015
On 2015-08-13 23:59:34 +0300, Martin Storsjö wrote:
> checkasm timing Cortex-A7 A8 A9
> denoise_dct_c 6605 5515 5950
> denoise_dct_neon 1885 1178 1887
> ---
> common/arm/quant-a.S | 31 +++++++++++++++++++++++++++++++
> common/arm/quant.h | 2 ++
> common/quant.c | 2 +-
> 3 files changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
> index ad8d8f8..e3d5cd2 100644
> --- a/common/arm/quant-a.S
> +++ b/common/arm/quant-a.S
> @@ -4,6 +4,7 @@
> * Copyright (C) 2009-2015 x264 project
> *
> * Authors: David Conrad <lessen42 at gmail.com>
> + * Janne Grunau <janne-x264 at jannau.net>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -404,3 +405,33 @@ function x264_coeff_last64_neon
> movlt r0, #0
> bx lr
> endfunc
> +
> +function x264_denoise_dct_neon
> + vpush {q4-q7}
After a cursory look, it should be no problem to do the same computation in
12 128-bit registers.
> +1: subs r3, r3, #16
> + vld1.16 {q0, q1}, [r0]
> + vld1.32 {q4, q5}, [r1]!
> + vld1.32 {q6, q7}, [r1]
> + sub r1, #32
> + vabs.s16 q8, q0
> + vabs.s16 q9, q1
> + vld1.16 {q2, q3}, [r2]!
> + vclt.s16 q10, q0, #0
> + vclt.s16 q11, q1, #0
q0 and q1 are unused after this
> + vaddw.u16 q4, q4, d16
> + vaddw.u16 q5, q5, d17
> + vqsub.u16 q12, q8, q2
> + vqsub.u16 q13, q9, q3
q2, q3, q8 and q9 are unused after this; that should free enough
registers to keep q4-q7 unmodified.
> + vaddw.u16 q6, q6, d18
> + vaddw.u16 q7, q7, d19
> + vneg.s16 q14, q12
> + vneg.s16 q15, q13
> + vbsl q10, q14, q12
> + vbsl q11, q15, q13
> + vst1.32 {q4, q5}, [r1]!
> + vst1.32 {q6, q7}, [r1]!
> + vst1.16 {q10, q11}, [r0]!
> + bgt 1b
> + vpop {q4-q7}
> + bx lr
> +endfunc
> diff --git a/common/arm/quant.h b/common/arm/quant.h
> index 8ea179a..78178e8 100644
> --- a/common/arm/quant.h
> +++ b/common/arm/quant.h
> @@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
> int x264_coeff_last16_neon( int16_t * );
> int x264_coeff_last64_neon( int16_t * );
>
> +void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
> +
> #endif
> diff --git a/common/quant.c b/common/quant.c
> index bc9e8d7..f8279a7 100644
> --- a/common/quant.c
> +++ b/common/quant.c
> @@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
> pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
> pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
> pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
> + pf->denoise_dct = x264_denoise_dct_neon;
> }
> #endif
> #if ARCH_AARCH64
> @@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
> pf->decimate_score15 = x264_decimate_score15_neon;
> pf->decimate_score16 = x264_decimate_score16_neon;
> pf->decimate_score64 = x264_decimate_score64_neon;
> - pf->denoise_dct = x264_denoise_dct_neon;
> }
> #endif
ok otherwise
Janne
More information about the x264-devel
mailing list