[vlc-devel] [PATCH 1/1] deinterlace: arm64 NEON merge asm
Janne Grunau
janne-vlc at jannau.net
Tue Aug 9 23:37:29 CEST 2016
Approximately a factor of 2 faster.
Also adds build system support and CPU "detection" for arm64 NEON.
Advanced SIMD (NEON) is mandatory for general-purpose ARMv8-A CPUs, so the
CPU feature detection is a constant 1.
---
configure.ac | 20 +++++
include/vlc_cpu.h | 2 +
modules/video_filter/Makefile.am | 4 +
modules/video_filter/deinterlace/deinterlace.c | 5 ++
modules/video_filter/deinterlace/merge.h | 9 +++
modules/video_filter/deinterlace/merge_arm64.S | 102 +++++++++++++++++++++++++
6 files changed, 142 insertions(+)
create mode 100644 modules/video_filter/deinterlace/merge_arm64.S
diff --git a/configure.ac b/configure.ac
index e83b0a5..e063f0b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1432,6 +1432,26 @@ asm volatile("vqmovun.s64 d0, q1":::"d0");
])
AM_CONDITIONAL(HAVE_NEON, [test "${ac_cv_arm_neon}" = "yes"])
+AC_ARG_ENABLE(arm64,
+ [AS_HELP_STRING([--disable-arm64],
+ [disable arm 64-bit optimizations (default auto)])],, [
+ AS_IF([test "${host_cpu}" = "aarch64"], [enable_arm64="yes"] ,[enable_arm64="no"])
+])
+AS_IF([test "${enable_arm64}" != "no"], [
+ AC_CACHE_CHECK([if $CCAS groks ARM 64 SIMD assembly], [ac_cv_arm64], [
+ AC_COMPILE_IFELSE([
+ AC_LANG_PROGRAM(,[[
+asm volatile("uhadd v0.8b, v0.8b, v1.8b":::"v0");
+]])
+ ], [
+ ac_cv_arm64="yes"
+ ], [
+ ac_cv_arm64="no"
+ ])
+ ])
+])
+AM_CONDITIONAL(HAVE_ARM64, [test "${ac_cv_arm64}" = "yes"])
+
AC_ARG_ENABLE(altivec,
[AS_HELP_STRING([--disable-altivec],
diff --git a/include/vlc_cpu.h b/include/vlc_cpu.h
index 910900a..8c520a0 100644
--- a/include/vlc_cpu.h
+++ b/include/vlc_cpu.h
@@ -178,6 +178,8 @@ VLC_API unsigned vlc_CPU(void);
# elif defined (__aarch64__)
# define HAVE_FPU 1
+// NEON is mandatory for general purpose ARMv8-a CPUs
+# define vlc_CPU_ARM64_NEON() (1)
# elif defined (__sparc__)
# define HAVE_FPU 1
diff --git a/modules/video_filter/Makefile.am b/modules/video_filter/Makefile.am
index d853717..5d5fdaf 100644
--- a/modules/video_filter/Makefile.am
+++ b/modules/video_filter/Makefile.am
@@ -124,6 +124,10 @@ if HAVE_NEON
libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_arm.S
libdeinterlace_plugin_la_CFLAGS += -DCAN_COMPILE_ARM
endif
+if HAVE_ARM64
+libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_arm64.S
+libdeinterlace_plugin_la_CFLAGS += -DCAN_COMPILE_ARM64
+endif
video_filter_LTLIBRARIES += libdeinterlace_plugin.la
libdynamicoverlay_plugin_la_SOURCES = \
diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c
index b1676a4..bcc44b6 100644
--- a/modules/video_filter/deinterlace/deinterlace.c
+++ b/modules/video_filter/deinterlace/deinterlace.c
@@ -700,6 +700,11 @@ notsupp:
p_sys->pf_merge = pixel_size == 1 ? merge8_armv6 : merge16_armv6;
else
#endif
+#if defined(CAN_COMPILE_ARM64)
+ if( vlc_CPU_ARM64_NEON() )
+ p_sys->pf_merge = pixel_size == 1 ? merge8_arm64_neon : merge16_arm64_neon;
+ else
+#endif
{
p_sys->pf_merge = pixel_size == 1 ? Merge8BitGeneric : Merge16BitGeneric;
#if defined(__i386__) || defined(__x86_64__)
diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h
index d342522..74b5ab5 100644
--- a/modules/video_filter/deinterlace/merge.h
+++ b/modules/video_filter/deinterlace/merge.h
@@ -172,6 +172,15 @@ void merge8_armv6 (void *, const void *, const void *, size_t);
void merge16_armv6 (void *, const void *, const void *, size_t);
#endif
+#if defined(CAN_COMPILE_ARM64)
+/**
+ * ARM64 NEON routine to blend pixels from two picture lines.
+ */
+void merge8_arm64_neon (void *, const void *, const void *, size_t);
+void merge16_arm64_neon (void *, const void *, const void *, size_t);
+
+#endif
+
/*****************************************************************************
* EndMerge routines
*****************************************************************************/
diff --git a/modules/video_filter/deinterlace/merge_arm64.S b/modules/video_filter/deinterlace/merge_arm64.S
new file mode 100644
index 0000000..ad898a3
--- /dev/null
+++ b/modules/video_filter/deinterlace/merge_arm64.S
@@ -0,0 +1,102 @@
+ //*****************************************************************************
+ // merge_arm64.S : ARM64 NEON mean
+ //*****************************************************************************
+ // Copyright (C) 2009-2012 Rémi Denis-Courmont
+ // Copyright (C) 2016- Janne Grunau
+ //
+ // This program is free software; you can redistribute it and/or modify
+ // it under the terms of the GNU Lesser General Public License as published by
+ // the Free Software Foundation; either version 2.1 of the License, or
+ // (at your option) any later version.
+ //
+ // This program is distributed in the hope that it will be useful,
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ // GNU Lesser General Public License for more details.
+ //
+ // You should have received a copy of the GNU Lesser General Public License
+ // along with this program; if not, write to the Free Software Foundation,
+ // Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ //****************************************************************************/
+
+ .text
+
+ // Register aliases shared by both merge routines in this file.
+ // They name the first four integer argument registers in the order of
+ // the C prototype: (void *, const void *, const void *, size_t).
+ // Comments are kept on their own lines so that nothing but the register
+ // name ends up in the macro expansion.
+ //
+ // DEST: pointer that the averaged output is stored through
+#define DEST x0
+ // SRC1: pointer that the first input line is loaded from
+#define SRC1 x1
+ // SRC2: pointer that the second input line is loaded from
+#define SRC2 x2
+ // SIZE: amount of data to process, counted in bytes by the routines below
+#define SIZE x3
+
+ //-----------------------------------------------------------------------
+ // void merge8_arm64_neon(void *dest, const void *src1,
+ //                        const void *src2, size_t size)
+ //
+ // Writes the truncating unsigned average (a + b) >> 1 of every byte
+ // pair from src1 and src2 to dest.
+ //
+ // In:    DEST (x0) = output pointer
+ //        SRC1 (x1), SRC2 (x2) = input pointers
+ //        SIZE (x3) = length in bytes
+ // Out:   nothing
+ // Clobb: x0-x2, x5, x10-x12, v0-v7, flags
+ //
+ // Data is consumed in chunks of 64, then 32, then 16 bytes. Any bytes
+ // past the last multiple of 16 are not processed.
+ //-----------------------------------------------------------------------
+        .align  2
+        .global merge8_arm64_neon
+        .type   merge8_arm64_neon, %function
+ // NOTE: Offset and pitch must be multiple of 16-bytes in VLC.
+merge8_arm64_neon:
+        // x5 = number of bytes covered by the 64-byte main loop
+        ands    x5,  SIZE, #~63
+        b.eq    2f
+        // Two pointer pairs, 32 bytes apart, each stepping by 64 bytes, so
+        // that the loads of both halves of a chunk are independent.
+        mov     x10, #64
+        add     x11, SRC1, #32
+        add     x12, SRC2, #32
+1:
+        ld1     {v0.16b,v1.16b}, [SRC1], x10
+        ld1     {v4.16b,v5.16b}, [SRC2], x10
+        ld1     {v2.16b,v3.16b}, [x11],  x10
+        uhadd   v0.16b, v0.16b, v4.16b
+        ld1     {v6.16b,v7.16b}, [x12],  x10
+        // The flags set here stay live until b.gt: the vector ops and
+        // stores below do not modify them.
+        subs    x5,  x5,  #64
+        // Each vector pair is averaged exactly once.
+        uhadd   v1.16b, v1.16b, v5.16b
+        uhadd   v2.16b, v2.16b, v6.16b
+        uhadd   v3.16b, v3.16b, v7.16b
+        st1     {v0.16b,v1.16b}, [DEST], #32
+        st1     {v2.16b,v3.16b}, [DEST], #32
+        b.gt    1b
+2:
+        // tbz takes a bit index: bit 5 set means a 32-byte chunk remains.
+        tbz     SIZE, #5,  3f
+        ld1     {v0.16b,v1.16b}, [SRC1], #32
+        ld1     {v4.16b,v5.16b}, [SRC2], #32
+        uhadd   v0.16b, v0.16b, v4.16b
+        uhadd   v1.16b, v1.16b, v5.16b
+        st1     {v0.16b,v1.16b}, [DEST], #32
+3:
+        // Bit 4 set means a final 16-byte chunk remains.
+        tbz     SIZE, #4,  4f
+        ld1     {v0.16b}, [SRC1]
+        ld1     {v4.16b}, [SRC2]
+        uhadd   v0.16b, v0.16b, v4.16b
+        st1     {v0.16b}, [DEST]
+4:
+        ret
+
+ //-----------------------------------------------------------------------
+ // void merge16_arm64_neon(void *dest, const void *src1,
+ //                         const void *src2, size_t size)
+ //
+ // Writes the truncating unsigned average (a + b) >> 1 of every 16-bit
+ // element pair from src1 and src2 to dest.
+ //
+ // In:    DEST (x0) = output pointer
+ //        SRC1 (x1), SRC2 (x2) = input pointers
+ //        SIZE (x3) = length in bytes (not in elements)
+ // Out:   nothing
+ // Clobb: x0-x2, x5, v0-v7, flags
+ //
+ // Data is consumed in chunks of 64, then 32, then 16 bytes. Any bytes
+ // past the last multiple of 16 are not processed.
+ //-----------------------------------------------------------------------
+        .align  2
+        .global merge16_arm64_neon
+        .type   merge16_arm64_neon, %function
+merge16_arm64_neon:
+        // x5 = number of bytes covered by the 64-byte main loop
+        ands    x5,  SIZE, #~63
+        b.eq    2f
+1:
+        ld1     {v0.8h,v1.8h}, [SRC1], #32
+        ld1     {v4.8h,v5.8h}, [SRC2], #32
+        ld1     {v2.8h,v3.8h}, [SRC1], #32
+        uhadd   v0.8h,  v0.8h,  v4.8h
+        ld1     {v6.8h,v7.8h}, [SRC2], #32
+        uhadd   v1.8h,  v1.8h,  v5.8h
+        uhadd   v2.8h,  v2.8h,  v6.8h
+        uhadd   v3.8h,  v3.8h,  v7.8h
+        st1     {v0.8h,v1.8h}, [DEST], #32
+        st1     {v2.8h,v3.8h}, [DEST], #32
+        subs    x5,  x5,  #64
+        b.gt    1b
+2:
+        // tbz takes a bit index: bit 5 set means a 32-byte chunk remains.
+        tbz     SIZE, #5,  3f
+        ld1     {v0.8h,v1.8h}, [SRC1], #32
+        ld1     {v4.8h,v5.8h}, [SRC2], #32
+        uhadd   v0.8h,  v0.8h,  v4.8h
+        uhadd   v1.8h,  v1.8h,  v5.8h
+        st1     {v0.8h,v1.8h}, [DEST], #32
+3:
+        // Bit 4 set means a final 16-byte chunk remains.
+        tbz     SIZE, #4,  4f
+        ld1     {v0.8h}, [SRC1]
+        ld1     {v4.8h}, [SRC2]
+        uhadd   v0.8h,  v0.8h,  v4.8h
+        st1     {v0.8h}, [DEST]
+4:
+        ret
--
2.9.2
More information about the vlc-devel
mailing list