[vlc-devel] commit: NEON converter: unroll fi32->s16n conversion ( Rémi Denis-Courmont )
git version control
git at videolan.org
Tue Sep 29 21:07:53 CEST 2009
vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Mon Sep 28 22:59:01 2009 +0300| [56bbc3365ba0ec871b68564ab9b6b15db46194b9] | committer: Rémi Denis-Courmont
NEON converter: unroll fi32->s16n conversion
The main loop now burns 3 cycles per 8 values, if I have computed correctly.
This is not quite the main bottleneck, but it is pretty much always used
(since we output 16-bit PCM).
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=56bbc3365ba0ec871b68564ab9b6b15db46194b9
---
modules/audio_filter/converter/Modules.am | 4 +-
modules/audio_filter/converter/neon.c | 67 +++++------------
modules/audio_filter/converter/neon_s32_s16.S | 100 +++++++++++++++++++++++++
3 files changed, 122 insertions(+), 49 deletions(-)
diff --git a/modules/audio_filter/converter/Modules.am b/modules/audio_filter/converter/Modules.am
index 6306500..1f0f611 100644
--- a/modules/audio_filter/converter/Modules.am
+++ b/modules/audio_filter/converter/Modules.am
@@ -1,6 +1,8 @@
SOURCES_converter_fixed = fixed.c
SOURCES_converter_float = float.c
-SOURCES_converter_neon = neon.c
+SOURCES_converter_neon = \
+ neon_s32_s16.S \
+ neon.c
SOURCES_a52tospdif = a52tospdif.c
SOURCES_a52tofloat32 = a52tofloat32.c
SOURCES_dtstospdif = dtstospdif.c
diff --git a/modules/audio_filter/converter/neon.c b/modules/audio_filter/converter/neon.c
index 21fd13d..e1c682a 100644
--- a/modules/audio_filter/converter/neon.c
+++ b/modules/audio_filter/converter/neon.c
@@ -28,6 +28,8 @@
#include <vlc_filter.h>
#include <vlc_cpu.h>
+#include <assert.h>
+
static int Open (vlc_object_t *);
vlc_module_begin ()
@@ -129,63 +131,32 @@ static block_t *Do_F32_S32 (filter_t *filter, block_t *inbuf)
return inbuf;
}
+void s32_s16_neon_unaligned (int16_t *out, const int32_t *in, unsigned nb);
+void s32_s16_neon (int16_t *out, const int32_t *in, unsigned nb);
+
/**
* Signed 32-bits fixed point to signed 16-bits integer
*/
static block_t *Do_S32_S16 (filter_t *filter, block_t *inbuf)
{
- unsigned nb_samples = inbuf->i_nb_samples
- * aout_FormatNbChannels (&filter->fmt_in.audio);
- int32_t *inp = (int32_t *)inbuf->p_buffer;
- const int32_t *endp = inp + nb_samples;
- int16_t *outp = (int16_t *)inp;
-
- while (nb_samples & 3)
- {
- const int16_t roundup = 1 << 12;
- asm volatile (
- "qadd r0, %[inv], %[roundup]\n"
- "ssat %[outv], #16, r0, asr #13\n"
- : [outv] "=r" (*outp)
- : [inv] "r" (*inp), [roundup] "r" (roundup)
- : "r0");
- inp++;
- outp++;
- nb_samples--;
- }
+ const int32_t *in = (int32_t *)inbuf->p_buffer;
+ int16_t *out = (int16_t *)in;
+ unsigned nb;
- if (nb_samples & 4)
- asm volatile (
- "vld1.s32 {q0}, [%[inp]]!\n"
- "vrshrn.i32 d0, q0, #13\n"
- "vst1.s16 {d0}, [%[outp]]!\n"
- : [outp] "+r" (outp), [inp] "+r" (inp)
- :
- : "q0", "memory");
+ nb = ((-(uintptr_t)in) & 12) >> 2;
+ out += nb; /* fix up misalignment */
+ inbuf->p_buffer += 2 * nb;
- if (nb_samples & 8)
- asm volatile (
- "vld1.s32 {q0-q1}, [%[inp]]!\n"
- "vrshrn.i32 d0, q0, #13\n"
- "vrshrn.i32 d1, q1, #13\n"
- "vst1.s16 {q0}, [%[outp]]!\n"
- : [outp] "+r" (outp), [inp] "+r" (inp)
- :
- : "q0", "q1", "memory");
+ s32_s16_neon_unaligned (out, in, nb);
+ in += nb;
+ out += nb;
- while (inp != endp)
- asm volatile (
- "vld1.s32 {q0-q1}, [%[inp]]!\n"
- "vld1.s32 {q2-q3}, [%[inp]]!\n"
- "vrshrn.s32 d0, q0, #13\n"
- "vrshrn.s32 d1, q1, #13\n"
- "vrshrn.s32 d2, q2, #13\n"
- "vrshrn.s32 d3, q3, #13\n"
- "vst1.s16 {q0-q1}, [%[outp]]!\n"
- : [outp] "+r" (outp), [inp] "+r" (inp)
- :
- : "q0", "q1", "q2", "q3", "memory");
+ nb = inbuf->i_nb_samples
+ * aout_FormatNbChannels (&filter->fmt_in.audio) - nb;
+ assert (!(((uintptr_t)in) & 15));
+ assert (!(((uintptr_t)out) & 15));
+ s32_s16_neon (out, in, nb);
inbuf->i_buffer /= 2;
return inbuf;
}
diff --git a/modules/audio_filter/converter/neon_s32_s16.S b/modules/audio_filter/converter/neon_s32_s16.S
new file mode 100644
index 0000000..88effca
--- /dev/null
+++ b/modules/audio_filter/converter/neon_s32_s16.S
@@ -0,0 +1,100 @@
+ @*****************************************************************************
+ @ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+ .fpu neon
+ .text
+
+#define OUT r0
+#define IN r1
+#define N r2
+#define BUF r3
+#define HALF ip
+
+ .align
+ .global s32_s16_neon
+ .type s32_s16_neon, %function
+ @ Converts fixed-point 32-bits to signed 16-bits
+ @ Input and output must be on 128-bits boundary
+s32_s16_neon:
+ pld [IN]
+2:
+ cmp N, #8
+ blt s32_s16_neon_unaligned
+ vld1.s32 {q8-q9}, [IN,:128]!
+
+3: @ Main loop
+ pld [IN, #64]
+ sub N, #8
+ vqrshrn.s32 d16, q8, #13
+ vqrshrn.s32 d17, q9, #13
+ cmp N, #8
+ blt 4f
+ vld1.s32 {q10-q11}, [IN,:128]!
+ sub N, #8
+ vqrshrn.s32 d18, q10, #13
+ vqrshrn.s32 d19, q11, #13
+ cmp N, #8
+ blt 5f
+ vld1.s32 {q12-q13}, [IN,:128]!
+ sub N, #8
+ vqrshrn.s32 d20, q12, #13
+ vqrshrn.s32 d21, q13, #13
+ vst1.s16 {d16-d19}, [OUT,:128]!
+ cmp N, #8
+ blt 6f
+ vld1.s32 {q8-q9}, [IN,:128]!
+ vst1.s16 {d20-d21}, [OUT,:128]!
+ b 3b
+4:
+ vst1.s16 {d16-d17}, [OUT,:128]!
+ b 7f
+5:
+ vst1.s16 {d16-d19}, [OUT,:128]!
+ b 7f
+6:
+ vst1.s16 {d20-d21}, [OUT,:128]!
+7:
+ cmp N, #4
+ blt s32_s16_neon_unaligned
+ vld1.s32 {q8}, [IN,:128]!
+ sub N, #4
+ vqrshrn.s32 d16, q8, #13
+ vst1.s16 {d16}, [OUT,:64]!
+
+ @ Fall through for last 0-3 samples
+
+ .global s32_s16_neon_unaligned
+ .type s32_s16_neon_unaligned, %function
+ @ Converts fixed-point 32-bits to signed 16-bits
+ @ Input must be on 32-bits boundary, output on 16-bits
+s32_s16_neon_unaligned:
+ mov HALF, #4096
+1:
+ cmp N, #0
+ bxeq lr
+
+ ldr BUF, [IN]
+ add IN, #4
+ add OUT, #2
+ qadd BUF, HALF, BUF
+ sub N, #1
+ ssat BUF, #16, BUF, asr #13
+ strh BUF, [OUT, #-2]
+ b 1b
More information about the vlc-devel
mailing list