[vlc-devel] [PATCH] add ARM/NEON version of simple channel mixer

Thu Oct 4 20:17:03 CEST 2012

From: David Geldreich <david.geldreich at free.fr>

Signed-off-by: Jean-Baptiste Kempf <jb at videolan.org>
Signed-off-by: Rafaël Carré <funman at videolan.org>
---
 modules/LIST                            |    1 +
 modules/arm_neon/Modules.am             |    8 +
 modules/arm_neon/simple_channel_mixer.S |  279 +++++++++++++++++++++++++++++++
 modules/arm_neon/simple_channel_mixer.c |  175 +++++++++++++++++++
 4 files changed, 463 insertions(+)
 create mode 100644 modules/arm_neon/simple_channel_mixer.S
 create mode 100644 modules/arm_neon/simple_channel_mixer.c

diff --git a/modules/LIST b/modules/LIST
index 100375f..33050b5 100644
--- a/modules/LIST
+++ b/modules/LIST
@@ -289,6 +289,7 @@ $Id$
  * shm: Shared memory framebuffer access module
  * sid: Sidplay demuxer
  * simple_channel_mixer: channel mixer
+ * simple_channel_mixer_neon: channel mixer using NEON assembly
  * skins2: Skinnable interface, new generation
  * smf: Standard MIDI file demuxer
  * smooth: Microsoft Smooth Streaming input
diff --git a/modules/arm_neon/Modules.am b/modules/arm_neon/Modules.am
index 498368a..4f2af9e 100644
--- a/modules/arm_neon/Modules.am
+++ b/modules/arm_neon/Modules.am
@@ -8,6 +8,13 @@ libaudio_format_neon_plugin_la_SOURCES = \
 libaudio_format_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
 libaudio_format_neon_plugin_la_LIBADD = $(AM_LIBADD)
 
+libsimple_channel_mixer_neon_plugin_la_SOURCES = \
+	simple_channel_mixer.S \
+	simple_channel_mixer.c
+libsimple_channel_mixer_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
+libsimple_channel_mixer_neon_plugin_la_LIBADD = $(AM_LIBADD)
+libsimple_channel_mixer_neon_plugin_la_DEPENDENCIES =
+
 libchroma_yuv_neon_plugin_la_SOURCES = \
 	i420_yuyv.S \
 	i422_yuyv.S \
@@ -30,6 +37,7 @@ libyuv_rgb_neon_plugin_la_LIBADD = $(AM_LIBADD)
 
 libvlc_LTLIBRARIES += \
 	libaudio_format_neon_plugin.la \
+	libsimple_channel_mixer_neon_plugin.la \
 	libchroma_yuv_neon_plugin.la \
 	libvolume_neon_plugin.la \
 	libyuv_rgb_neon_plugin.la \
diff --git a/modules/arm_neon/simple_channel_mixer.S b/modules/arm_neon/simple_channel_mixer.S
new file mode 100644
index 0000000..16d6702
--- /dev/null
+++ b/modules/arm_neon/simple_channel_mixer.S
@@ -0,0 +1,279 @@
+ @*****************************************************************************
+ @ simple_channel_mixer.S : ARM NEON channel mixer
+ @*****************************************************************************
+ @ Copyright (C) 2012 David Geldreich <david.geldreich at free.fr>
+ @                    Sébastien Toque
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.fpu neon
+	.text
+	.align
+
+#define DST		r0	/* 1st argument: float *dst, advanced as frames are written */
+#define SRC		r1	/* 2nd argument: const float *src, advanced as frames are read */
+#define NUM		r2	/* 3rd argument: int num, frames left to convert */
+#define LFE		r3	/* 4th argument: bool lfeChannel, non-zero if src has an LFE */
+#define COEFF	r4	/* scratch: address of the weight table, saved by each routine */
+
+coeff_7to2:                                       @ weights for src[0..3]; the 0.25 pair is reused for src[4..5]
+	.float 0.5
+	.float 0.5
+	.float 0.25
+	.float 0.25
+	.global convert_7to2_neon_asm
+	.type	convert_7to2_neon_asm, %function
+convert_7to2_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_7to2                         @ address of the weight table
+	vld1.32 {q0},[COEFF]                          @ q0 = 0.5 0.5 0.25 0.25
+0:                                                @ use local label
+	vld1.32 {q2},[SRC]!                           @ load 0,1,2,3
+	vmul.f32 q2,q2,q0                             @ 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
+	vld1.32 {d6},[SRC]!                           @ load 4,5
+	vmul.f32 d6,d6,d1                             @ 0.25*src[4] 0.25*src[5]
+	vadd.f32 d4,d4,d5                             @ 0.5*src[0] + 0.25*src[2]
+                                                  @ 0.5*src[1] + 0.25*src[3]
+	vadd.f32 d4,d4,d6                             @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
+                                                  @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
+	flds s14,[SRC]                                @ load 6 (s14 is the low lane of d7)
+	vdup.32 d7,d7[0]                              @ d7 = src[6] src[6]
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[6] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vadd.f32 d4,d4,d7                             @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
+                                                  @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
+	vst1.32 d4, [DST]!                            @ store the two output samples, advance dst
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_5to2:                                       @ weights for src[0..3]
+	.float 0.5
+	.float 0.5
+	.float 0.33
+	.float 0.33
+	.global convert_5to2_neon_asm
+	.type	convert_5to2_neon_asm, %function
+convert_5to2_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_5to2                         @ address of the weight table
+	vld1.32 {q0},[COEFF]                          @ load constants
+0:                                                @ use local label
+	vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
+	flds s8,[SRC]                                 @ load 4
+	vdup.32 d4,d4[0]                              @ d4 = src[4] src[4]
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[4] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vmul.f32 q1,q1,q0                             @ 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
+	vadd.f32 d2,d2,d3                             @ 0.5*src[0] + 0.33*src[2]
+                                                  @ 0.5*src[1] + 0.33*src[3]
+	vadd.f32 d2,d2,d4                             @ 0.5*src[0] + 0.33*src[2] + src[4]
+                                                  @ 0.5*src[1] + 0.33*src[3] + src[4]
+	vst1.32 d2,[DST]!                             @ store the two output samples, advance dst
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_4to2:                                       @ weights for src[0..1]
+	.float 0.5
+	.float 0.5
+	.global convert_4to2_neon_asm
+	.type	convert_4to2_neon_asm, %function
+convert_4to2_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe); lfe is never read
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_4to2                         @ address of the weight table
+	vld1.32 {d0},[COEFF]                          @ load constants
+0:                                                @ use local label
+	vld1.32 {q1},[SRC]!                           @ load 0,1,2,3 (always 4 floats per frame)
+	vmul.f32 d2,d2,d0                             @ 0.5*src[0] 0.5*src[1]
+	vdup.32 d4,d3[0]                              @ dup src[2]
+	vdup.32 d3,d3[1]                              @ dup src[3]
+	vadd.f32 d2,d2,d3                             @ +src[3]
+	vadd.f32 d2,d2,d4                             @ +src[2]
+	vst1.32 d2,[DST]!                             @ store the two output samples, advance dst
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_3to2:                                       @ weights for src[0..1]
+	.float 0.5
+	.float 0.5
+	.global convert_3to2_neon_asm
+	.type	convert_3to2_neon_asm, %function
+convert_3to2_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_3to2                         @ address of the weight table
+	vld1.32 {d0},[COEFF]                          @ load constants
+0:                                                @ use local label
+	vld1.32 {d1},[SRC]!                           @ load 0,1
+	flds s4,[SRC]                                 @ load 2
+	vdup.32 d2,d2[0]                              @ d2 = src[2] src[2]
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[2] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vmul.f32 d1,d1,d0                             @ 0.5*src[0] 0.5*src[1]
+	vadd.f32 d1,d1,d2                             @ 0.5*src[0] + src[2]
+                                                  @ 0.5*src[1] + src[2]
+	vst1.32 d1,[DST]!                             @ store the two output samples, advance dst
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_7to1:                                       @ weights for src[0..3]; the 0.125 pair is reused for src[4..5]
+	.float 0.25
+	.float 0.25
+	.float 0.125
+	.float 0.125
+	.global convert_7to1_neon_asm
+	.type	convert_7to1_neon_asm, %function
+convert_7to1_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_7to1                         @ address of the weight table
+	vld1.32 {q0},[COEFF]                          @ q0 = 0.25 0.25 0.125 0.125
+0:                                                @ use local label
+	vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
+	vmul.f32 q1,q1,q0                             @ 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
+	vld1.32 {d4},[SRC]!                           @ load 4,5
+	vmul.f32 d4,d4,d1                             @ 0.125*src[4] 0.125*src[5]
+	vadd.f32 d2,d2,d3                             @ lane0: 0.25*src[0] + 0.125*src[2], lane1: 0.25*src[1] + 0.125*src[3]
+	vadd.f32 d2,d2,d4                             @ lane0 += 0.125*src[4], lane1 += 0.125*src[5]
+	flds s10,[SRC]                                @ load 6
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[6] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vadd.f32 s4,s4,s5                             @ fold the two lanes of d2 into one sample
+	vadd.f32 s4,s4,s10                            @ + src[6]
+	fsts s4,[DST]                                 @ store the single output sample
+	add DST,DST,#4                                @ advance dst by one float
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_5to1:                                       @ weights for src[0..3]
+	.float 0.25
+	.float 0.25
+	.float 0.16666667
+	.float 0.16666667
+	.global convert_5to1_neon_asm
+	.type	convert_5to1_neon_asm, %function
+convert_5to1_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_5to1                         @ address of the weight table
+	vld1.32 {q0},[COEFF]                          @ q0 = 0.25 0.25 1/6 1/6
+0:                                                @ use local label
+	vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
+	vmul.f32 q1,q1,q0                             @ 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
+	vadd.f32 d2,d2,d3                             @ lane0: 0.25*src[0] + src[2]/6, lane1: 0.25*src[1] + src[3]/6
+	flds s10,[SRC]                                @ load 4
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[4] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vadd.f32 s4,s4,s5                             @ fold the two lanes of d2 into one sample
+	vadd.f32 s4,s4,s10                            @ + src[4]
+	fsts s4,[DST]                                 @ store the single output sample
+	add DST,DST,#4                                @ advance dst by one float
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_7to4:                                       @ weights for src[0..3]; src[4..6] are used unscaled
+	.float 0.5
+	.float 0.5
+	.float 0.16666667
+	.float 0.16666667
+	.global convert_7to4_neon_asm
+	.type	convert_7to4_neon_asm, %function
+convert_7to4_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_7to4                         @ address of the weight table
+	vld1.32 {q0},[COEFF]                          @ q0 = 0.5 0.5 1/6 1/6
+0:                                                @ use local label
+	vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
+	vmul.f32 q1,q1,q0                             @ 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
+	vld1.32 {d5},[SRC]!                           @ load 4,5
+	flds s14,[SRC]                                @ load 6
+	vadd.f32 d2,d2,d3                             @ 0.5*src[0] + src[2]/6
+                                                  @ 0.5*src[1] + src[3]/6
+	vdup.32 d4,d7[0]                              @ so q2 : src[6] src[6] src[4] src[5]
+	vadd.f32 q2,q2,q1                             @ src[6] + 0.5*src[0] + src[2]/6
+                                                  @ src[6] + 0.5*src[1] + src[3]/6
+                                                  @ src[4] + src[2]/6
+                                                  @ src[5] + src[3]/6
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[6] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vst1.32 {q2}, [DST]!                          @ store the four output samples, advance dst
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
+
+
+coeff_5to4:                                       @ weights for src[0..1]; src[2..4] are used unscaled
+	.float 0.5
+	.float 0.5
+	.global convert_5to4_neon_asm
+	.type	convert_5to4_neon_asm, %function
+convert_5to4_neon_asm:                            @ (float *dst, const float *src, int num, bool lfe)
+	push {r4,lr}                                  @ save r4 (weight table pointer) and the return address
+
+	adr COEFF, coeff_5to4                         @ address of the weight table
+	vld1.32 {d0},[COEFF]                          @ d0 = 0.5 0.5
+0:                                                @ use local label
+	vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
+	vmul.f32 d2,d2,d0                             @ 0.5*src[0] 0.5*src[1]
+	flds s8,[SRC]                                 @ load 4
+	vdup.32 d4,d4[0]                              @ d4 = src[4] src[4]
+	vadd.f32 d2,d2,d4                             @ 0.5*src[0] + src[4]
+                                                  @ 0.5*src[1] + src[4]
+                                                  @ src[2]
+                                                  @ src[3]
+	teq LFE,#0                                    @ does the input carry an lfe channel?
+	ite eq                                        @ IT block for the two conditional adds (Thumb-2)
+	addeq SRC,SRC,#4                              @ no lfe: step over src[4] only
+	addne SRC,SRC,#8                              @ skip the lfe channel
+	vst1.32 {q1}, [DST]!                          @ store the four output samples, advance dst
+	subs NUM,NUM,#1                               @ one frame done
+	bne 0b                                        @ loop until num reaches 0 (num must be > 0 on entry)
+
+	pop {r4,pc}                                   @ restore r4 and return
diff --git a/modules/arm_neon/simple_channel_mixer.c b/modules/arm_neon/simple_channel_mixer.c
new file mode 100644
index 0000000..687ae2b
--- /dev/null
+++ b/modules/arm_neon/simple_channel_mixer.c
@@ -0,0 +1,175 @@
+/*****************************************************************************
+ * simple_channel_mixer.c : simple channel mixer plug-in using NEON assembly
+ *****************************************************************************
+ * Copyright (C) 2002, 2004, 2006-2009, 2012 the VideoLAN team
+ * $Id$
+ *
+ * Authors: Gildas Bazin <gbazin at videolan.org>
+ *          David Geldreich <david.geldreich at free.fr>
+ *          Sébastien Toque
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+/*****************************************************************************
+ * Preamble
+ *****************************************************************************/
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_plugin.h>
+#include <vlc_aout.h>
+#include <vlc_filter.h>
+#include <vlc_block.h>
+#include <vlc_cpu.h>
+#include <assert.h>
+
+/*****************************************************************************
+ * Module descriptor
+ *****************************************************************************/
+static int  OpenFilter( vlc_object_t * );
+
+vlc_module_begin ()
+    set_description( N_("Audio filter for simple channel mixing using NEON assembly") )
+    set_category( CAT_AUDIO )
+    set_subcategory( SUBCAT_AUDIO_MISC )
+    set_capability( "audio filter", 20 ) /* NOTE(review): 20 is presumably the module score - confirm */
+    set_callbacks( OpenFilter, NULL ) /* NULL second callback; OpenFilter allocates nothing */
+vlc_module_end ()
+
+#define FILTER_WRAPPER(in, out) /* asm prototype + Filter_NtoM wrapper */        \
+    void convert_##in##to##out##_neon_asm(float *dst, const float *src, int num, bool lfeChannel); \
+    static block_t *Filter_##in##to##out (filter_t *p_filter, block_t *p_block)  \
+    {                                                                            \
+        block_t *p_out;                                                          \
+        if (!FilterInit( p_filter, p_block, &p_out ))                            \
+            return NULL; /* p_block was already released by FilterInit */        \
+        const float *p_src = (const float *)p_block->p_buffer;                   \
+        float *p_dest = (float *)p_out->p_buffer;                                \
+        convert_##in##to##out##_neon_asm( p_dest, p_src, p_block->i_nb_samples,  \
+                  p_filter->fmt_in.audio.i_physical_channels & AOUT_CHAN_LFE );  \
+        block_Release( p_block ); /* input consumed, p_out is returned */        \
+        return p_out;                                                            \
+    }
+
+#define TRY_FILTER(in, out)                                \
+    if ( b_input_##in && b_output_##out )                  \
+    {                                                      \
+        p_filter->pf_audio_filter = Filter_##in##to##out ; \
+        return VLC_SUCCESS; /* returns from the caller */  \
+    }
+
+/*****************************************************************************
+ * FilterInit: get an output block sized and timestamped like the input block
+ *****************************************************************************/
+static bool FilterInit( filter_t *p_filter, block_t *p_block, block_t **pp_out )
+{
+    if( p_block == NULL )
+        return false;
+    if( p_block->i_nb_samples == 0 )
+    {   /* empty block: release it, there is nothing to convert */
+        block_Release( p_block );
+        return false;
+    }
+
+    const audio_format_t *p_fmt_out = &p_filter->fmt_out.audio;
+    size_t i_bytes = p_block->i_nb_samples * p_fmt_out->i_bitspersample *
+                     p_fmt_out->i_channels / 8;
+    block_t *p_new = filter_NewAudioBuffer( p_filter, i_bytes );
+    if( p_new == NULL )
+    {
+        msg_Warn( p_filter, "can't get output buffer" );
+        block_Release( p_block );
+        return false;
+    }
+
+    /* Same sample count and timing as the input block. */
+    p_new->i_nb_samples = p_block->i_nb_samples;
+    p_new->i_dts = p_block->i_dts;
+    p_new->i_pts = p_block->i_pts;
+    p_new->i_length = p_block->i_length;
+
+    int i_in_ch = aout_FormatNbChannels( &p_filter->fmt_in.audio );
+    int i_out_ch = aout_FormatNbChannels( &p_filter->fmt_out.audio );
+    p_new->i_buffer = p_block->i_buffer * i_out_ch / i_in_ch;
+    *pp_out = p_new;
+    return true;
+}
+
+FILTER_WRAPPER(7,2) /* Filter_7to2 calls convert_7to2_neon_asm */
+FILTER_WRAPPER(5,2) /* Filter_5to2 calls convert_5to2_neon_asm */
+FILTER_WRAPPER(4,2) /* Filter_4to2 calls convert_4to2_neon_asm */
+FILTER_WRAPPER(3,2) /* Filter_3to2 calls convert_3to2_neon_asm */
+FILTER_WRAPPER(7,1) /* Filter_7to1 calls convert_7to1_neon_asm */
+FILTER_WRAPPER(5,1) /* Filter_5to1 calls convert_5to1_neon_asm */
+FILTER_WRAPPER(7,4) /* Filter_7to4 calls convert_7to4_neon_asm */
+FILTER_WRAPPER(5,4) /* Filter_5to4 calls convert_5to4_neon_asm */
+
+/*****************************************************************************
+ * OpenFilter: pick the NEON routine matching the input/output channel layouts
+ *****************************************************************************/
+static int OpenFilter( vlc_object_t *p_this )
+{
+    filter_t *p_filter = (filter_t *)p_this;
+
+    if (!vlc_CPU_ARM_NEON())
+        return VLC_EGENERIC;
+
+    audio_format_t fmt_in  = p_filter->fmt_in.audio;
+    audio_format_t fmt_out = p_filter->fmt_out.audio;
+
+    fmt_in.i_format = p_filter->fmt_in.i_codec;
+    fmt_out.i_format = p_filter->fmt_out.i_codec;
+
+    if( fmt_in.i_format != VLC_CODEC_FL32 ||
+        fmt_in.i_format != fmt_out.i_format ||
+        fmt_in.i_rate != fmt_out.i_rate )
+    {
+        return VLC_EGENERIC;
+    }
+
+    if( fmt_in.i_physical_channels == fmt_out.i_physical_channels &&
+        fmt_in.i_original_channels == fmt_out.i_original_channels )
+    {
+        return VLC_EGENERIC;
+    }
+
+    const bool b_input_7 = (fmt_in.i_physical_channels & ~AOUT_CHAN_LFE) == AOUT_CHANS_7_0;
+    const bool b_input_5 = ( (fmt_in.i_physical_channels & ~AOUT_CHAN_LFE) == AOUT_CHANS_5_0 ||
+                             (fmt_in.i_physical_channels & ~AOUT_CHAN_LFE) == AOUT_CHANS_5_0_MIDDLE );
+    const bool b_input_4 = fmt_in.i_physical_channels == AOUT_CHANS_4_CENTER_REAR; /* asm ignores lfe */
+    const bool b_input_3 = (fmt_in.i_physical_channels & ~AOUT_CHAN_LFE) == AOUT_CHANS_3_0;
+
+    const bool b_output_1 = fmt_out.i_physical_channels == AOUT_CHAN_CENTER;
+    const bool b_output_2 = fmt_out.i_physical_channels == AOUT_CHANS_2_0;
+    const bool b_output_4 = fmt_out.i_physical_channels == AOUT_CHANS_4_0;
+
+    /* Only conversion to Mono, Stereo and 4.0 right now.
+     * Only from 7/7.1/5/5.1/4 (center + rear center)/3/3.1, matched exactly: each
+     * asm routine has a fixed frame stride. 5.X rear and middle are handled alike */
+
+    TRY_FILTER(7,2)
+    TRY_FILTER(5,2)
+    TRY_FILTER(4,2)
+    TRY_FILTER(3,2)
+    TRY_FILTER(7,1)
+    TRY_FILTER(5,1)
+    TRY_FILTER(7,4)
+    TRY_FILTER(5,4)
+
+    return VLC_EGENERIC;
+}
-- 
1.7.10.4