[vlc-devel] [PATCH] add ARM/NEON conversions for audio_filter/channel_mixer/simple

David Geldreich david.geldreich at free.fr
Wed Apr 4 14:16:29 CEST 2012


Write an ARM/NEON inline assembly version of most of the conversion cases of
audio_filter/channel_mixer/simple.

The inline assembly is kept in separate functions for clarity; they will be inlined by the compiler.

For example, 5.x->2 conversion gets a 8x speedup on iPad1 and 3x on iPad2

I could provide a test program that shows that these routines:
- give the same result (modulo epsilon) as the original ones
- work for any alignment of src/dst
- work for any buffer size

---
 modules/audio_filter/channel_mixer/simple.c |  269 ++++++++++++++++++++++++++-
 1 files changed, 268 insertions(+), 1 deletions(-)

diff --git a/modules/audio_filter/channel_mixer/simple.c b/modules/audio_filter/channel_mixer/simple.c
index b48eede..71d7f26 100644
--- a/modules/audio_filter/channel_mixer/simple.c
+++ b/modules/audio_filter/channel_mixer/simple.c
@@ -1,10 +1,11 @@
 /*****************************************************************************
  * simple.c : simple channel mixer plug-in
  *****************************************************************************
- * Copyright (C) 2002, 2004, 2006-2009 the VideoLAN team
+ * Copyright (C) 2002, 2004, 2006-2009, 2012 the VideoLAN team
  * $Id$
  *
  * Authors: Gildas Bazin <gbazin at videolan.org>
+ *          David Geldreich <david.geldreich at free.fr>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -55,6 +56,237 @@ static bool IsSupported( const audio_format_t *p_input, const audio_format_t *p_
 
 static block_t *Filter( filter_t *, block_t * );
 
+#ifdef __ARM_NEON__
+/*****************************************************************************
+ * ARM/NEON buffer conversions
+ *****************************************************************************/
+static void convert_7to2_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 7 floats (8 when lfeChannel is set; the extra float is skipped) down to 2 floats per frame.
+    const float coeff[4] = {0.5f,0.5f,0.25f, 0.25f}; // weights for src[0..3]; the upper pair (d1) is reused for src[4..5]
+
+    __asm__ volatile (                           // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {q0},[%[coeff]]\n"// load the weights once: d0 = {0.5, 0.5}, d1 = {0.25, 0.25}
+                      "0:\n"                     // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q2},[%[src]]!\n" // load src[0..3], advance src by 4 floats
+                      "vmul.f32 q2,q2,q0\n"      // 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
+                      "vld1.32 {d6},[%[src]]!\n" // load src[4..5], advance src by 2 floats
+                      "vmul.f32 d6,d6,d1\n"      // 0.25*src[4] 0.25*src[5]
+                      "vadd.f32 d4,d4,d5\n"      // 0.5*src[0] + 0.25*src[2]
+                                                 // 0.5*src[1] + 0.25*src[3]
+                      "vadd.f32 d4,d4,d6\n"      // 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
+                                                 // 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
+
+                      "flds s14,[%[src]]\n"      // load src[6] into the low lane of d7 (src not advanced yet)
+                      "vdup.32 d7,d7[0]\n"       // copy src[6] into both lanes of d7
+                      "teq %[lfeChannel],0\n"    // Z flag set when lfeChannel is 0
+                      "ite eq\n"                 // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n" // no LFE: step over src[6] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[6] and skip the lfe channel
+
+                      "vadd.f32 d4,d4,d7\n"      // 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
+                                                 // 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
+                      "vst1.32 d4, [%[dst]]!\n"  // store the 2 output floats, advance dst by 2 floats
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"q0","q2","q3","cc","memory");                          // d0-d1, d4-d7, flags, and the buffers
+}
+
+static void convert_5to2_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 5 floats (6 when lfeChannel is set; the extra float is skipped) down to 2 floats per frame.
+    const float coeff[4] = {0.5f,0.5f,0.33f, 0.33f}; // weights applied lane by lane to src[0..3]
+
+    __asm__ volatile (                            // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {q0},[%[coeff]]\n" // load the weights once
+                      "0:\n"                      // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q1},[%[src]]!\n"  // load src[0..3], advance src by 4 floats
+                      "flds s8,[%[src]]\n"        // load src[4] into the low lane of d4 (src not advanced yet)
+                      "vdup.32 d4,d4[0]\n"        // copy src[4] into both lanes of d4
+                      "teq %[lfeChannel],0\n"     // Z flag set when lfeChannel is 0
+                      "ite eq\n"                  // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n"  // no LFE: step over src[4] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[4] and skip the lfe channel
+                      "vmul.f32 q1,q1,q0\n"      // 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
+                      "vadd.f32 d2,d2,d3\n"      // 0.5*src[0] + 0.33*src[2]
+                                                 // 0.5*src[1] + 0.33*src[3]
+                      "vadd.f32 d2,d2,d4\n"      // 0.5*src[0] + 0.33*src[2] + src[4]
+                                                 // 0.5*src[1] + 0.33*src[3] + src[4]
+                      "vst1.32 d2,[%[dst]]!\n"   // store the 2 output floats, advance dst by 2 floats
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"q0","q1","d4","cc","memory");                          // d0-d4, flags, and the buffers
+}
+
+static void convert_4to2_neon_asm( float *dst, const float *src, int num )
+{   // Mixes num frames of 4 floats down to 2 floats per frame: dst[i] = 0.5*src[i] + src[3] + src[2] for i = 0,1.
+    const float coeff[2] = {0.5f,0.5f}; // weight applied to src[0] and src[1]
+
+    __asm__ volatile (                            // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {d0},[%[coeff]]\n" // load the weights once
+                      "0:\n"                      // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q1},[%[src]]!\n"  // load src[0..3] (d2 = src[0..1], d3 = src[2..3]), advance src by 4 floats
+                      "vmul.f32 d2,d2,d0\n"      // 0.5*src[0] 0.5*src[1]
+                      "vdup.32 d4,d3[0]\n"       // dup src[2]
+                      "vdup.32 d3,d3[1]\n"       // dup src[3]
+                      "vadd.f32 d2,d2,d3\n"      // +src[3]
+                      "vadd.f32 d2,d2,d4\n"      // +src[2]
+                      "vst1.32 d2,[%[dst]]!\n"   // store the 2 output floats, advance dst by 2 floats
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff)                                     // read-only input
+                      :"q0","q1","q2","cc","memory");                          // covers d0, d2-d4, flags, and the buffers
+}
+
+static void convert_3to2_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 3 floats (4 when lfeChannel is set; the extra float is skipped) down to 2 floats per frame.
+    const float coeff[2] = {0.5f,0.5f}; // weight applied to src[0] and src[1]
+
+    __asm__ volatile (                            // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {d0},[%[coeff]]\n" // load the weights once
+                      "0:\n"                      // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {d1},[%[src]]!\n"  // load src[0..1], advance src by 2 floats
+                      "flds s4,[%[src]]\n"        // load src[2] into the low lane of d2 (src not advanced yet)
+                      "vdup.32 d2,d2[0]\n"        // copy src[2] into both lanes of d2
+                      "teq %[lfeChannel],0\n"     // Z flag set when lfeChannel is 0
+                      "ite eq\n"                  // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n"  // no LFE: step over src[2] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[2] and skip the lfe channel
+                      "vmul.f32 d1,d1,d0\n"      // 0.5*src[0] 0.5*src[1]
+                      "vadd.f32 d1,d1,d2\n"      // 0.5*src[0] + src[2]
+                                                 // 0.5*src[1] + src[2]
+                      "vst1.32 d1,[%[dst]]!\n"   // store the 2 output floats, advance dst by 2 floats
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"d0","d1","d2","cc","memory");                          // NEON registers used, flags, and the buffers
+}
+
+static void convert_7to1_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 7 floats (8 when lfeChannel is set; the extra float is skipped) down to 1 float per frame.
+    const float coeff[4] = {0.25f,0.25f,0.125f, 0.125f}; // weights for src[0..3]; the upper pair (d1) is reused for src[4..5]
+
+    __asm__ volatile (                           // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {q0},[%[coeff]]\n"// load the weights once: d0 = {0.25, 0.25}, d1 = {0.125, 0.125}
+                      "0:\n"                     // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q1},[%[src]]!\n" // load src[0..3], advance src by 4 floats
+                      "vmul.f32 q1,q1,q0\n"      // 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
+                      "vld1.32 {d4},[%[src]]!\n" // load src[4..5], advance src by 2 floats
+                      "vmul.f32 d4,d4,d1\n"      // 0.125*src[4] 0.125*src[5]
+                      "vadd.f32 d2,d2,d3\n"      // 0.25*src[0] + 0.125*src[2] | 0.25*src[1] + 0.125*src[3]
+                      "vadd.f32 d2,d2,d4\n"      // lane 0 += 0.125*src[4] | lane 1 += 0.125*src[5]
+
+                      "flds s10,[%[src]]\n"      // load src[6] (src not advanced yet)
+                      "teq %[lfeChannel],0\n"    // Z flag set when lfeChannel is 0
+                      "ite eq\n"                 // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n" // no LFE: step over src[6] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[6] and skip the lfe channel
+
+                      "vadd.f32 s4,s4,s5\n"      // add the two lanes of d2 together
+                      "vadd.f32 s4,s4,s10\n"     // + src[6] (unweighted)
+                      "fsts s4,[%[dst]]\n"       // store the single output float
+                      "add %[dst],%[dst],#4\n"   // advance dst by 1 float
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"q0","q1","q2","cc","memory");                          // d0-d5 (s10 is in d5), flags, and the buffers
+}
+
+static void convert_5to1_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 5 floats (6 when lfeChannel is set; the extra float is skipped) down to 1 float per frame.
+    const float coeff[4] = {0.25f, 0.25f, 1.f/6.f, 1.f/6.f}; // weights applied lane by lane to src[0..3]
+
+    __asm__ volatile (                           // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {q0},[%[coeff]]\n"// load the weights once
+                      "0:\n"                     // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q1},[%[src]]!\n" // load src[0..3], advance src by 4 floats
+                      "vmul.f32 q1,q1,q0\n"      // 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
+                      "vadd.f32 d2,d2,d3\n"      // 0.25*src[0] + src[2]/6 | 0.25*src[1] + src[3]/6
+
+                      "flds s10,[%[src]]\n"      // load src[4] (src not advanced yet)
+                      "teq %[lfeChannel],0\n"    // Z flag set when lfeChannel is 0
+                      "ite eq\n"                 // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n" // no LFE: step over src[4] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[4] and skip the lfe channel
+
+                      "vadd.f32 s4,s4,s5\n"      // add the two lanes of d2 together
+                      "vadd.f32 s4,s4,s10\n"     // + src[4] (unweighted)
+                      "fsts s4,[%[dst]]\n"       // store the single output float
+                      "add %[dst],%[dst],#4\n"   // advance dst by 1 float
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"q0","q1","q2","cc","memory");                          // covers d0-d3 and s10 (in d5), flags, and the buffers
+}
+
+static void convert_7to4_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 7 floats (8 when lfeChannel is set; the extra float is skipped) down to 4 floats per frame.
+    const float coeff[4] = {0.5f, 0.5f, 1.f/6.f, 1.f/6.f}; // weights applied lane by lane to src[0..3]
+
+    __asm__ volatile (                           // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {q0},[%[coeff]]\n"// load the weights once
+                      "0:\n"                     // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q1},[%[src]]!\n" // load src[0..3], advance src by 4 floats
+                      "vmul.f32 q1,q1,q0\n"      // 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
+                      "vld1.32 {d5},[%[src]]!\n" // load src[4..5] into the upper half of q2, advance src by 2 floats
+                      "flds s14,[%[src]]\n"      // load src[6] into the low lane of d7 (src not advanced yet)
+                      "vadd.f32 d2,d2,d3\n"      // 0.5*src[0] + src[2]/6
+                                                 // 0.5*src[1] + src[3]/6
+                      "vdup.32 d4,d7[0]\n"       // so q2 : src[6] src[6] src[4] src[5]
+                      "vadd.f32 q2,q2,q1\n"      // src[6] + 0.5*src[0] + src[2]/6
+                                                 // src[6] + 0.5*src[1] + src[3]/6
+                                                 // src[4] + src[2]/6
+                                                 // src[5] + src[3]/6
+
+                      "teq %[lfeChannel],0\n"    // Z flag set when lfeChannel is 0
+                      "ite eq\n"                 // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n" // no LFE: step over src[6] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[6] and skip the lfe channel
+
+                      "vst1.32 {q2}, [%[dst]]!\n"// store the 4 output floats, advance dst by 4 floats
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"q0","q1","q2","q3","cc","memory");                     // d0-d5 and d7 (in q3), flags, and the buffers
+}
+
+static void convert_5to4_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{   // Mixes num frames of 5 floats (6 when lfeChannel is set; the extra float is skipped) down to 4 floats per frame.
+    const float coeff[2] = {0.5f, 0.5f}; // weight applied to src[0] and src[1]
+
+    __asm__ volatile (                           // the loop body runs before num is tested, so num must be >= 1
+                      "vld1.32 {d0},[%[coeff]]\n"// load the weights once
+                      "0:\n"                     // loop start (numeric local label, one frame per pass)
+                      "vld1.32 {q1},[%[src]]!\n" // load src[0..3] (d2 = src[0..1], d3 = src[2..3]), advance src by 4 floats
+                      "vmul.f32 d2,d2,d0\n"      // 0.5*src[0] 0.5*src[1]
+                      "flds s8,[%[src]]\n"       // load src[4] into the low lane of d4 (src not advanced yet)
+                      "vdup.32 d4,d4[0]\n"       // copy src[4] into both lanes of d4
+                      "vadd.f32 d2,d2,d4\n"      // 0.5*src[0] + src[4]
+                                                 // 0.5*src[1] + src[4]
+                                                 // src[2]  (d3 is left untouched)
+                                                 // src[3]
+
+                      "teq %[lfeChannel],0\n"    // Z flag set when lfeChannel is 0
+                      "ite eq\n"                 // IT block covering the two conditional adds below
+                      "addeq %[src],%[src],#4\n" // no LFE: step over src[4] only
+                      "addne %[src],%[src],#8\n" // LFE present: step over src[4] and skip the lfe channel
+
+                      "vst1.32 {q1}, [%[dst]]!\n"// store the 4 output floats, advance dst by 4 floats
+                      "subs %[num],%[num],#1\n"  // one frame done; sets the flags for the branch
+                      "bne 0b\n"                 // repeat until num reaches 0
+                      :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst)    // read and updated by the loop
+                      :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel)      // read-only inputs
+                      :"q0","q1","q2","cc","memory");                          // covers d0, d2-d4, flags, and the buffers
+}
+#endif // __ARM_NEON__
+
 /*****************************************************************************
  * DoWork: convert a buffer
  *****************************************************************************/
@@ -81,8 +313,24 @@ static void DoWork( filter_t * p_filter,
     p_out_buf->i_nb_samples = p_in_buf->i_nb_samples;
     p_out_buf->i_buffer = p_in_buf->i_buffer * i_output_nb / i_input_nb;
 
+    if ( p_in_buf->i_nb_samples <= 0 )
+        return;
+
     if( p_filter->fmt_out.audio.i_physical_channels == AOUT_CHANS_2_0 )
     {
+#ifdef __ARM_NEON__
+        if( b_input_7_0 )
+            convert_7to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+        else if( b_input_5_0 )
+            convert_5to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+        else if( b_input_3_0 )
+            convert_3to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+        else if( b_input_4_center_rear )
+            convert_4to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples );
+#else
         if( b_input_7_0 )
         for( i = p_in_buf->i_nb_samples; i--; )
         {
@@ -128,9 +376,18 @@ static void DoWork( filter_t * p_filter,
           p_dest++;
           p_src += 4;
         }
+#endif // __ARM_NEON__
     }
     else if( p_filter->fmt_out.audio.i_physical_channels == AOUT_CHAN_CENTER )
     {
+#ifdef __ARM_NEON__
+        if( b_input_7_0 )
+            convert_7to1_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+        else if( b_input_5_0 )
+            convert_5to1_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+#else
         if( b_input_7_0 )
         for( i = p_in_buf->i_nb_samples; i--; )
         {
@@ -151,6 +408,7 @@ static void DoWork( filter_t * p_filter,
 
             if( p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE ) p_src++;
         }
+#endif // __ARM_NEON__
         else if( b_input_3_0 )
         for( i = p_in_buf->i_nb_samples; i--; )
         {
@@ -175,6 +433,14 @@ static void DoWork( filter_t * p_filter,
         assert( p_filter->fmt_out.audio.i_physical_channels == AOUT_CHANS_4_0 );
         assert( b_input_7_0 || b_input_5_0 );
 
+#ifdef __ARM_NEON__
+        if( b_input_7_0 )
+            convert_7to4_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+        else
+            convert_5to4_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+                                   p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+#else
         if( b_input_7_0 )
         for( i = p_in_buf->i_nb_samples; i--; )
         {
@@ -207,6 +473,7 @@ static void DoWork( filter_t * p_filter,
 
             if( p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE ) p_src++;
         }
+#endif // __ARM_NEON__
     }
 }
 
-- 
1.7.7.5 (Apple Git-26)




More information about the vlc-devel mailing list