[x264-devel] [PATCH 04/11] arm: Implement x264_denoise_dct_neon
Martin Storsjö
martin at martin.st
Tue Aug 25 13:38:13 CEST 2015
checkasm timing Cortex-A7 A8 A9
denoise_dct_c 6604 5510 5858
denoise_dct_neon 1774 1139 1614
---
Use other registers to avoid having to push/pop registers.
---
common/arm/quant-a.S | 29 +++++++++++++++++++++++++++++
common/arm/quant.h | 2 ++
common/quant.c | 2 +-
3 files changed, 32 insertions(+), 1 deletion(-)
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index ad8d8f8..e63170e 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2015 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -404,3 +405,31 @@ function x264_coeff_last64_neon
movlt r0, #0
bx lr
endfunc
+
+function x264_denoise_dct_neon // args per prototype: ( dctcoef *, uint32_t *, udctcoef *, int ); presumably r0-r3 under AAPCS - confirm
+1: subs r3, r3, #16 // 16 elements per iteration (body runs at least once); flags set here feed the bgt below
+ vld1.16 {q0, q1}, [r0] // load 16 16-bit coefficients; no writeback, r0 advances at the final store
+ vld1.32 {q12, q13}, [r1]! // load 32-bit sums for elements 0-7, r1 += 32
+ vld1.32 {q14, q15}, [r1] // load 32-bit sums for elements 8-15, no writeback
+ sub r1, #32 // rewind r1 to the first sum for the stores below (no s suffix, flags untouched)
+ vabs.s16 q8, q0 // q8 = abs(coef 0-7)
+ vabs.s16 q9, q1 // q9 = abs(coef 8-15)
+ vld1.16 {q2, q3}, [r2]! // load 16 16-bit offsets, r2 += 32
+ vclt.s16 q10, q0, #0 // q10 = all-ones lanes where coef 0-7 is negative
+ vclt.s16 q11, q1, #0 // q11 = all-ones lanes where coef 8-15 is negative
+ vaddw.u16 q12, q12, d16 // sum[0-3] += abs(coef), widened to 32 bits
+ vaddw.u16 q13, q13, d17 // sum[4-7] += abs(coef)
+ vqsub.u16 q0, q8, q2 // q0 = abs(coef) - offset, unsigned saturating so it clamps at 0
+ vqsub.u16 q1, q9, q3 // same for elements 8-15
+ vaddw.u16 q14, q14, d18 // sum[8-11] += abs(coef)
+ vaddw.u16 q15, q15, d19 // sum[12-15] += abs(coef)
+ vneg.s16 q8, q0 // q8 = negated reduced magnitude, elements 0-7
+ vneg.s16 q9, q1 // q9 = negated reduced magnitude, elements 8-15
+ vbsl q10, q8, q0 // restore sign: take q8 where the mask is set, else q0; result replaces the mask in q10
+ vbsl q11, q9, q1 // same for elements 8-15, result in q11
+ vst1.32 {q12, q13}, [r1]! // write back sums 0-7, r1 += 32
+ vst1.32 {q14, q15}, [r1]! // write back sums 8-15, r1 now points at the next group of 16
+ vst1.16 {q10, q11}, [r0]! // write back the 16 coefficients in place, r0 += 32
+ bgt 1b // loop while the count left by subs is still > 0
+ bx lr // return; only q0-q3 and q8-q15 were used (commit note: chosen to avoid push/pop)
+endfunc
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 8ea179a..78178e8 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
#endif
diff --git a/common/quant.c b/common/quant.c
index bc9e8d7..f8279a7 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+ pf->denoise_dct = x264_denoise_dct_neon;
}
#endif
#if ARCH_AARCH64
@@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
- pf->denoise_dct = x264_denoise_dct_neon;
}
#endif
--
1.7.10.4
More information about the x264-devel
mailing list