[vlc-devel] [PATCH] add ARM/NEON conversions for audio_filter/channel_mixer/simple
David Geldreich
david.geldreich at free.fr
Wed Apr 4 14:16:29 CEST 2012
Write an ARM/NEON inline assembly version of most of the conversion cases of
audio_filter/channel_mixer/simple
The inline assembly is kept in separate functions for clarity; they will be inlined by the compiler.
For example, the 5.x->2 conversion gets an 8x speedup on iPad1 and 3x on iPad2.
I could provide a test program showing that these routines:
- give the same result (modulo epsilon) as the original one
- work for any alignment of src/dst
- work for any buffer size
---
modules/audio_filter/channel_mixer/simple.c | 269 ++++++++++++++++++++++++++-
1 files changed, 268 insertions(+), 1 deletions(-)
diff --git a/modules/audio_filter/channel_mixer/simple.c b/modules/audio_filter/channel_mixer/simple.c
index b48eede..71d7f26 100644
--- a/modules/audio_filter/channel_mixer/simple.c
+++ b/modules/audio_filter/channel_mixer/simple.c
@@ -1,10 +1,11 @@
/*****************************************************************************
* simple.c : simple channel mixer plug-in
*****************************************************************************
- * Copyright (C) 2002, 2004, 2006-2009 the VideoLAN team
+ * Copyright (C) 2002, 2004, 2006-2009, 2012 the VideoLAN team
* $Id$
*
* Authors: Gildas Bazin <gbazin at videolan.org>
+ * David Geldreich <david.geldreich at free.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -55,6 +56,237 @@ static bool IsSupported( const audio_format_t *p_input, const audio_format_t *p_
static block_t *Filter( filter_t *, block_t * );
+#ifdef __ARM_NEON__
+/*****************************************************************************
+ * ARM/NEON buffer conversions
+ *****************************************************************************/
+static void convert_7to2_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[4] = {0.5f,0.5f,0.25f, 0.25f};
+ /* Convert num frames of interleaved floats at src into 2-float frames at dst.
+  * Each input frame holds 7 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
+  *   dst[1] = 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {q0},[%[coeff]]\n"// load constants: d0 = 0.5 0.5, d1 = 0.25 0.25
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q2},[%[src]]!\n" // load 0,1,2,3 and advance src by 4 floats
+ "vmul.f32 q2,q2,q0\n" // 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
+ "vld1.32 {d6},[%[src]]!\n" // load 4,5 and advance src by 2 floats
+ "vmul.f32 d6,d6,d1\n" // 0.25*src[4] 0.25*src[5]
+ "vadd.f32 d4,d4,d5\n" // 0.5*src[0] + 0.25*src[2]
+ // 0.5*src[1] + 0.25*src[3]
+ "vadd.f32 d4,d4,d6\n" // 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
+ // 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
+
+ "flds s14,[%[src]]\n" // load 6 (s14 is lane 0 of d7); src not advanced yet
+ "vdup.32 d7,d7[0]\n" // d7 = src[6] src[6]
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[6] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+
+ "vadd.f32 d4,d4,d7\n" // 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
+ // 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
+ "vst1.32 d4, [%[dst]]!\n" // store both output samples and advance dst
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"q0","q2","q3","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_5to2_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[4] = {0.5f,0.5f,0.33f, 0.33f};
+ /* Convert num frames of interleaved floats at src into 2-float frames at dst.
+  * Each input frame holds 5 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = 0.5*src[0] + 0.33*src[2] + src[4]
+  *   dst[1] = 0.5*src[1] + 0.33*src[3] + src[4]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {q0},[%[coeff]]\n" // load constants: 0.5 0.5 0.33 0.33
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q1},[%[src]]!\n" // load 0,1,2,3 and advance src by 4 floats
+ "flds s8,[%[src]]\n" // load 4 (s8 is lane 0 of d4); src not advanced yet
+ "vdup.32 d4,d4[0]\n" // d4 = src[4] src[4]
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[4] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+ "vmul.f32 q1,q1,q0\n" // 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
+ "vadd.f32 d2,d2,d3\n" // 0.5*src[0] + 0.33*src[2]
+ // 0.5*src[1] + 0.33*src[3]
+ "vadd.f32 d2,d2,d4\n" // 0.5*src[0] + 0.33*src[2] + src[4]
+ // 0.5*src[1] + 0.33*src[3] + src[4]
+ "vst1.32 d2,[%[dst]]!\n" // store both output samples and advance dst
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"q0","q1","d4","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_4to2_neon_asm( float *dst, const float *src, int num )
+{
+ const float coeff[2] = {0.5f,0.5f};
+ /* Convert num frames of 4 interleaved floats at src into 2-float frames at
+  * dst. Per frame:
+  *   dst[0] = 0.5*src[0] + src[3] + src[2]
+  *   dst[1] = 0.5*src[1] + src[3] + src[2]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {d0},[%[coeff]]\n" // load constants: d0 = 0.5 0.5
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q1},[%[src]]!\n" // load 0,1,2,3 (d2 = 0,1  d3 = 2,3), advance src
+ "vmul.f32 d2,d2,d0\n" // 0.5*src[0] 0.5*src[1]
+ "vdup.32 d4,d3[0]\n" // dup src[2]
+ "vdup.32 d3,d3[1]\n" // dup src[3]
+ "vadd.f32 d2,d2,d3\n" // +src[3]
+ "vadd.f32 d2,d2,d4\n" // +src[2]
+ "vst1.32 d2,[%[dst]]!\n" // store both output samples and advance dst
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff) // read-only input
+ :"q0","q1","q2","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_3to2_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[2] = {0.5f,0.5f};
+ /* Convert num frames of interleaved floats at src into 2-float frames at dst.
+  * Each input frame holds 3 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = 0.5*src[0] + src[2]
+  *   dst[1] = 0.5*src[1] + src[2]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {d0},[%[coeff]]\n" // load constants: d0 = 0.5 0.5
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {d1},[%[src]]!\n" // load 0,1 and advance src by 2 floats
+ "flds s4,[%[src]]\n" // load 2 (s4 is lane 0 of d2); src not advanced yet
+ "vdup.32 d2,d2[0]\n" // d2 = src[2] src[2]
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[2] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+ "vmul.f32 d1,d1,d0\n" // 0.5*src[0] 0.5*src[1]
+ "vadd.f32 d1,d1,d2\n" // 0.5*src[0] + src[2]
+ // 0.5*src[1] + src[2]
+ "vst1.32 d1,[%[dst]]!\n" // store both output samples and advance dst
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"d0","d1","d2","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_7to1_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[4] = {0.25f,0.25f,0.125f, 0.125f};
+ /* Convert num frames of interleaved floats at src into single floats at dst.
+  * Each input frame holds 7 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = 0.25*src[0] + 0.25*src[1]
+  *          + 0.125*src[2] + 0.125*src[3] + 0.125*src[4] + 0.125*src[5]
+  *          + src[6]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {q0},[%[coeff]]\n"// load constants: d0 = 0.25 0.25, d1 = 0.125 0.125
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q1},[%[src]]!\n" // load 0,1,2,3 and advance src by 4 floats
+ "vmul.f32 q1,q1,q0\n" // 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
+ "vld1.32 {d4},[%[src]]!\n" // load 4,5 and advance src by 2 floats
+ "vmul.f32 d4,d4,d1\n" // 0.125*src[4] 0.125*src[5]
+ "vadd.f32 d2,d2,d3\n" // 0.25*src[0] + 0.125*src[2]
+ // 0.25*src[1] + 0.125*src[3]
+ "vadd.f32 d2,d2,d4\n" // 0.25*src[0] + 0.125*src[2] + 0.125*src[4]
+ // 0.25*src[1] + 0.125*src[3] + 0.125*src[5]
+
+ "flds s10,[%[src]]\n" // load 6 (s10 is lane 0 of d5); src not advanced yet
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[6] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+
+ "vadd.f32 s4,s4,s5\n" // add the two lanes of d2 together
+ "vadd.f32 s4,s4,s10\n" // + src[6]
+ "fsts s4,[%[dst]]\n" // store the single output sample
+ "add %[dst],%[dst],#4\n" // advance dst by one float
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"q0","q1","q2","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_5to1_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[4] = {0.25f, 0.25f, 1.f/6.f, 1.f/6.f};
+ /* Convert num frames of interleaved floats at src into single floats at dst.
+  * Each input frame holds 5 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = 0.25*src[0] + 0.25*src[1] + src[2]/6 + src[3]/6 + src[4]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {q0},[%[coeff]]\n"// load constants: 0.25 0.25 1/6 1/6
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q1},[%[src]]!\n" // load 0,1,2,3 and advance src by 4 floats
+ "vmul.f32 q1,q1,q0\n" // 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
+ "vadd.f32 d2,d2,d3\n" // 0.25*src[0] + src[2]/6
+ // 0.25*src[1] + src[3]/6
+
+ "flds s10,[%[src]]\n" // load 4 (s10 is lane 0 of d5); src not advanced yet
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[4] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+
+ "vadd.f32 s4,s4,s5\n" // add the two lanes of d2 together
+ "vadd.f32 s4,s4,s10\n" // + src[4]
+ "fsts s4,[%[dst]]\n" // store the single output sample
+ "add %[dst],%[dst],#4\n" // advance dst by one float
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"q0","q1","q2","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_7to4_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[4] = {0.5f, 0.5f, 1.f/6.f, 1.f/6.f};
+ /* Convert num frames of interleaved floats at src into 4-float frames at dst.
+  * Each input frame holds 7 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = src[6] + 0.5*src[0] + src[2]/6
+  *   dst[1] = src[6] + 0.5*src[1] + src[3]/6
+  *   dst[2] = src[4] + src[2]/6
+  *   dst[3] = src[5] + src[3]/6
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {q0},[%[coeff]]\n"// load constants: 0.5 0.5 1/6 1/6
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q1},[%[src]]!\n" // load 0,1,2,3 and advance src by 4 floats
+ "vmul.f32 q1,q1,q0\n" // 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
+ "vld1.32 {d5},[%[src]]!\n" // load 4,5 into the upper half of q2, advance src
+ "flds s14,[%[src]]\n" // load 6 (s14 is lane 0 of d7); src not advanced yet
+ "vadd.f32 d2,d2,d3\n" // 0.5*src[0] + src[2]/6
+ // 0.5*src[1] + src[3]/6
+ "vdup.32 d4,d7[0]\n" // so q2 : src[6] src[6] src[4] src[5]
+ "vadd.f32 q2,q2,q1\n" // src[6] + 0.5*src[0] + src[2]/6
+ // src[6] + 0.5*src[1] + src[3]/6
+ // src[4] + src[2]/6
+ // src[5] + src[3]/6
+
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[6] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+
+ "vst1.32 {q2}, [%[dst]]!\n" // store the 4 output samples and advance dst
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"q0","q1","q2","q3","cc","memory"); // NEON registers used above, flags, memory
+}
+
+static void convert_5to4_neon_asm( float *dst, const float *src, int num, bool lfeChannel )
+{
+ const float coeff[2] = {0.5f, 0.5f};
+ /* Convert num frames of interleaved floats at src into 4-float frames at dst.
+  * Each input frame holds 5 samples, followed by one extra sample that is
+  * skipped (never read) when lfeChannel is true. Per frame:
+  *   dst[0] = 0.5*src[0] + src[4]
+  *   dst[1] = 0.5*src[1] + src[4]
+  *   dst[2] = src[2]
+  *   dst[3] = src[3]
+  * The frame counter is only tested after a frame has been processed, so num
+  * must be at least 1. */
+ __asm__ volatile (
+ "vld1.32 {d0},[%[coeff]]\n"// load constants: d0 = 0.5 0.5
+ "0:\n" // loop start (numeric local label)
+ "vld1.32 {q1},[%[src]]!\n" // load 0,1,2,3 and advance src by 4 floats
+ "vmul.f32 d2,d2,d0\n" // 0.5*src[0] 0.5*src[1]
+ "flds s8,[%[src]]\n" // load 4 (s8 is lane 0 of d4); src not advanced yet
+ "vdup.32 d4,d4[0]\n" // d4 = src[4] src[4]
+ "vadd.f32 d2,d2,d4\n" // q1 now holds: 0.5*src[0] + src[4]
+ // 0.5*src[1] + src[4]
+ // src[2]
+ // src[3]
+
+ "teq %[lfeChannel],0\n" // Z flag set when lfeChannel is false
+ "ite eq\n" // IT block for the conditional pair below
+ "addeq %[src],%[src],#4\n" // no lfe: step over src[4] only
+ "addne %[src],%[src],#8\n" // skip the lfe channel
+
+ "vst1.32 {q1}, [%[dst]]!\n" // store the 4 output samples and advance dst
+ "subs %[num],%[num],#1\n" // one frame done, update flags
+ "bne 0b\n" // loop until num reaches 0
+ :[src] "+r" (src), [num] "+r" (num), [dst] "+r" (dst) // read-write: modified by the loop
+ :[coeff] "r" (coeff), [lfeChannel] "r" (lfeChannel) // read-only inputs
+ :"q0","q1","q2","cc","memory"); // NEON registers used above, flags, memory
+}
+#endif // __ARM_NEON__
+
/*****************************************************************************
* DoWork: convert a buffer
*****************************************************************************/
@@ -81,8 +313,24 @@ static void DoWork( filter_t * p_filter,
p_out_buf->i_nb_samples = p_in_buf->i_nb_samples;
p_out_buf->i_buffer = p_in_buf->i_buffer * i_output_nb / i_input_nb;
+ if ( p_in_buf->i_nb_samples <= 0 )
+ return;
+
if( p_filter->fmt_out.audio.i_physical_channels == AOUT_CHANS_2_0 )
{
+#ifdef __ARM_NEON__
+ if( b_input_7_0 )
+ convert_7to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+ else if( b_input_5_0 )
+ convert_5to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+ else if( b_input_3_0 )
+ convert_3to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+ else if( b_input_4_center_rear )
+ convert_4to2_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples );
+#else
if( b_input_7_0 )
for( i = p_in_buf->i_nb_samples; i--; )
{
@@ -128,9 +376,18 @@ static void DoWork( filter_t * p_filter,
p_dest++;
p_src += 4;
}
+#endif // __ARM_NEON__
}
else if( p_filter->fmt_out.audio.i_physical_channels == AOUT_CHAN_CENTER )
{
+#ifdef __ARM_NEON__
+ if( b_input_7_0 )
+ convert_7to1_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+ else if( b_input_5_0 )
+ convert_5to1_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+#else
if( b_input_7_0 )
for( i = p_in_buf->i_nb_samples; i--; )
{
@@ -151,6 +408,7 @@ static void DoWork( filter_t * p_filter,
if( p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE ) p_src++;
}
+#endif // __ARM_NEON__
else if( b_input_3_0 )
for( i = p_in_buf->i_nb_samples; i--; )
{
@@ -175,6 +433,14 @@ static void DoWork( filter_t * p_filter,
assert( p_filter->fmt_out.audio.i_physical_channels == AOUT_CHANS_4_0 );
assert( b_input_7_0 || b_input_5_0 );
+#ifdef __ARM_NEON__
+ if( b_input_7_0 )
+ convert_7to4_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+ else
+ convert_5to4_neon_asm( p_dest, p_src, p_in_buf->i_nb_samples,
+ p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );
+#else
if( b_input_7_0 )
for( i = p_in_buf->i_nb_samples; i--; )
{
@@ -207,6 +473,7 @@ static void DoWork( filter_t * p_filter,
if( p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE ) p_src++;
}
+#endif // __ARM_NEON__
}
}
--
1.7.7.5 (Apple Git-26)
More information about the vlc-devel
mailing list