[x264-devel] [PATCH 13/24] arm: Implement x264_denoise_dct_neon

Thu Aug 13 22:59:34 CEST 2015

checkasm timing       Cortex-A7      A8     A9
denoise_dct_c                6605    5515   5950
denoise_dct_neon             1885    1178   1887
---
 common/arm/quant-a.S |   31 +++++++++++++++++++++++++++++++
 common/arm/quant.h   |    2 ++
 common/quant.c       |    2 +-
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index ad8d8f8..e3d5cd2 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad <lessen42 at gmail.com>
+ *          Janne Grunau <janne-x264 at jannau.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -404,3 +405,33 @@ function x264_coeff_last64_neon
     movlt       r0,  #0
     bx          lr
 endfunc
+
+function x264_denoise_dct_neon  // void f( dctcoef *r0, uint32_t *r1, udctcoef *r2, int r3 ): per coefficient, add |coef| to the r1 accumulator and shrink |coef| toward zero by the r2 value
+    vpush       {q4-q7}           // q4-q7 (d8-d15) must be preserved for the caller under the AAPCS
+1:  subs        r3,  r3,  #16     // 16 coefficients per pass; flags are consumed by the bgt below. Body runs at least once - presumably r3 is a positive multiple of 16 (TODO confirm)
+    vld1.16     {q0, q1}, [r0]    // q0-q1 = 16 signed 16-bit coefficients; r0 is not advanced until the final store
+    vld1.32     {q4, q5}, [r1]!   // q4-q5 = 32-bit accumulators 0..7
+    vld1.32     {q6, q7}, [r1]    // q6-q7 = 32-bit accumulators 8..15
+    sub         r1,  #32          // rewind r1 to accumulator 0 so the stores below write back in place
+    vabs.s16    q8,  q0           // q8 = |coef| for lanes 0..7
+    vabs.s16    q9,  q1           // q9 = |coef| for lanes 8..15
+    vld1.16     {q2, q3}, [r2]!   // q2-q3 = 16 unsigned 16-bit values from r2; r2 advances by 32 bytes
+    vclt.s16    q10, q0,  #0      // q10 = all-ones lane mask where coef < 0 (lanes 0..7)
+    vclt.s16    q11, q1,  #0      // q11 = same mask for lanes 8..15
+    vaddw.u16   q4,  q4,  d16     // accumulators 0..3  += |coef|, widened to 32 bits
+    vaddw.u16   q5,  q5,  d17     // accumulators 4..7  += |coef|
+    vqsub.u16   q12, q8,  q2      // q12 = |coef| - r2 value, unsigned saturating, so it clamps at 0
+    vqsub.u16   q13, q9,  q3      // same for lanes 8..15
+    vaddw.u16   q6,  q6,  d18     // accumulators 8..11  += |coef|
+    vaddw.u16   q7,  q7,  d19     // accumulators 12..15 += |coef|
+    vneg.s16    q14, q12          // q14 = negated reduced magnitude (lanes 0..7)
+    vneg.s16    q15, q13          // q15 = negated reduced magnitude (lanes 8..15)
+    vbsl        q10, q14, q12     // restore sign: take q14 where the mask is set, q12 elsewhere
+    vbsl        q11, q15, q13     // same for lanes 8..15
+    vst1.32     {q4, q5}, [r1]!   // write back accumulators 0..7
+    vst1.32     {q6, q7}, [r1]!   // write back accumulators 8..15; r1 has now advanced 64 bytes
+    vst1.16     {q10, q11}, [r0]! // overwrite the 16 coefficients in place; r0 advances by 32 bytes
+    bgt         1b                // loop while the count left by subs is still > 0
+    vpop        {q4-q7}           // restore the callee-saved registers pushed on entry
+    bx          lr                // return; the prototype is void, so no result register is set
+endfunc
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 8ea179a..78178e8 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
 #endif
diff --git a/common/quant.c b/common/quant.c
index bc9e8d7..f8279a7 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 #if ARCH_AARCH64
@@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
-        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 
-- 
1.7.10.4