[vlc-devel] [PATCH 1/1] deinterlace: arm64 NEON merge asm

Felix Paul Kühne fkuehne at videolan.org
Sun Aug 14 00:12:27 CEST 2016


Hey Janne,

Merged, thanks a lot!

Felix

> On 09 Aug 2016, at 23:37, Janne Grunau <janne-vlc at jannau.net> wrote:
> 
> Approximately factor 2 faster.
> 
> Also adds build system support / cpu "detection" for arm64 neon.
> Advanced SIMD (neon) is mandatory for general purpose ARMv8-a CPU so the
> CPU feature detection is a constant 1.
> ---
> configure.ac                                   |  20 +++++
> include/vlc_cpu.h                              |   2 +
> modules/video_filter/Makefile.am               |   4 +
> modules/video_filter/deinterlace/deinterlace.c |   5 ++
> modules/video_filter/deinterlace/merge.h       |   9 +++
> modules/video_filter/deinterlace/merge_arm64.S | 102 +++++++++++++++++++++++++
> 6 files changed, 142 insertions(+)
> create mode 100644 modules/video_filter/deinterlace/merge_arm64.S
> 
> diff --git a/configure.ac b/configure.ac
> index e83b0a5..e063f0b 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -1432,6 +1432,26 @@ asm volatile("vqmovun.s64 d0, q1":::"d0");
> ])
> AM_CONDITIONAL(HAVE_NEON, [test "${ac_cv_arm_neon}" = "yes"])
> 
> +dnl ARM 64-bit SIMD optimizations. Without an explicit --enable-arm64 or
> +dnl --disable-arm64, enable_arm64 is "yes" only when host_cpu is aarch64.
> +AC_ARG_ENABLE(arm64,
> +  [AS_HELP_STRING([--disable-arm64],
> +    [disable arm 64-bit optimizations (default auto)])],, [
> +  AS_IF([test "${host_cpu}" = "aarch64"], [enable_arm64="yes"] ,[enable_arm64="no"])
> +])
> +dnl Unless disabled, probe the toolchain by compiling a single UHADD
> +dnl instruction as inline asm; the cached result is ac_cv_arm64. When the
> +dnl probe is skipped, ac_cv_arm64 is not set here, so the HAVE_ARM64 test
> +dnl below is false.
> +dnl NOTE(review): the message names $CCAS, but AC_COMPILE_IFELSE drives the
> +dnl C compiler - confirm that this is the intended check.
> +AS_IF([test "${enable_arm64}" != "no"], [
> +  AC_CACHE_CHECK([if $CCAS groks ARM 64 SIMD assembly], [ac_cv_arm64], [
> +    AC_COMPILE_IFELSE([
> +      AC_LANG_PROGRAM(,[[
> +asm volatile("uhadd v0.8b, v0.8b, v1.8b":::"v0");
> +]])
> +    ], [
> +      ac_cv_arm64="yes"
> +    ], [
> +      ac_cv_arm64="no"
> +    ])
> +  ])
> +])
> +AM_CONDITIONAL(HAVE_ARM64, [test "${ac_cv_arm64}" = "yes"])
> +
> 
> AC_ARG_ENABLE(altivec,
>   [AS_HELP_STRING([--disable-altivec],
> diff --git a/include/vlc_cpu.h b/include/vlc_cpu.h
> index 910900a..8c520a0 100644
> --- a/include/vlc_cpu.h
> +++ b/include/vlc_cpu.h
> @@ -178,6 +178,8 @@ VLC_API unsigned vlc_CPU(void);
> 
> # elif defined (__aarch64__)
> #  define HAVE_FPU 1
> +// NEON is mandatory for general purpose ARMv8-a CPUs
> +#  define vlc_CPU_ARM64_NEON() (1)
> 
> # elif defined (__sparc__)
> #  define HAVE_FPU 1
> diff --git a/modules/video_filter/Makefile.am b/modules/video_filter/Makefile.am
> index d853717..5d5fdaf 100644
> --- a/modules/video_filter/Makefile.am
> +++ b/modules/video_filter/Makefile.am
> @@ -124,6 +124,10 @@ if HAVE_NEON
> libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_arm.S
> libdeinterlace_plugin_la_CFLAGS += -DCAN_COMPILE_ARM
> endif
> +if HAVE_ARM64
> +libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_arm64.S
> +libdeinterlace_plugin_la_CFLAGS += -DCAN_COMPILE_ARM64
> +endif
> video_filter_LTLIBRARIES += libdeinterlace_plugin.la
> 
> libdynamicoverlay_plugin_la_SOURCES = \
> diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c
> index b1676a4..bcc44b6 100644
> --- a/modules/video_filter/deinterlace/deinterlace.c
> +++ b/modules/video_filter/deinterlace/deinterlace.c
> @@ -700,6 +700,11 @@ notsupp:
>         p_sys->pf_merge = pixel_size == 1 ? merge8_armv6 : merge16_armv6;
>     else
> #endif
> +#if defined(CAN_COMPILE_ARM64)
> +    if( vlc_CPU_ARM64_NEON() )
> +        p_sys->pf_merge = pixel_size == 1 ? merge8_arm64_neon : merge16_arm64_neon;
> +    else
> +#endif
>     {
>         p_sys->pf_merge = pixel_size == 1 ? Merge8BitGeneric : Merge16BitGeneric;
> #if defined(__i386__) || defined(__x86_64__)
> diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h
> index d342522..74b5ab5 100644
> --- a/modules/video_filter/deinterlace/merge.h
> +++ b/modules/video_filter/deinterlace/merge.h
> @@ -172,6 +172,15 @@ void merge8_armv6 (void *, const void *, const void *, size_t);
> void merge16_armv6 (void *, const void *, const void *, size_t);
> #endif
> 
> +#if defined(CAN_COMPILE_ARM64)
> +/**
> + * ARM64 NEON routines to blend pixels from two picture lines.
> + *
> + * Arguments, in order: destination line, first source line, second source
> + * line, and the size in bytes. merge8_arm64_neon averages 8-bit samples and
> + * merge16_arm64_neon averages 16-bit samples (see merge_arm64.S).
> + */
> +void merge8_arm64_neon (void *, const void *, const void *, size_t);
> +void merge16_arm64_neon (void *, const void *, const void *, size_t);
> +
> +#endif
> +
> /*****************************************************************************
>  * EndMerge routines
>  *****************************************************************************/
> diff --git a/modules/video_filter/deinterlace/merge_arm64.S b/modules/video_filter/deinterlace/merge_arm64.S
> new file mode 100644
> index 0000000..ad898a3
> --- /dev/null
> +++ b/modules/video_filter/deinterlace/merge_arm64.S
> @@ -0,0 +1,102 @@
> + //*****************************************************************************
> + // merge_arm64.S : ARM64 NEON mean
> + //*****************************************************************************
> + // Copyright (C) 2009-2012 Rémi Denis-Courmont
> + // Copyright (C) 2016-	   Janne Grunau
> + //
> + // This program is free software; you can redistribute it and/or modify
> + // it under the terms of the GNU Lesser General Public License as published by
> + // the Free Software Foundation; either version 2.1 of the License, or
> + // (at your option) any later version.
> + //
> + // This program is distributed in the hope that it will be useful,
> + // but WITHOUT ANY WARRANTY; without even the implied warranty of
> + // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + // GNU Lesser General Public License for more details.
> + //
> + // You should have received a copy of the GNU Lesser General Public License
> + // along with this program; if not, write to the Free Software Foundation,
> + // Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + //****************************************************************************/
> +
> +	.text
> +
> +	// Register aliases shared by both merge routines in this file. They
> +	// follow the order of the C prototype
> +	//   (void *, const void *, const void *, size_t):
> +	//   x0 = line that is written, x1 and x2 = lines that are read,
> +	//   x3 = size in bytes.
> +	// Keep comments off the define lines so they cannot end up inside a
> +	// macro expansion.
> +#define	DEST	x0
> +#define	SRC1	x1
> +#define	SRC2	x2
> +#define	SIZE	x3
> +
> +	.align 2
> +	.global merge8_arm64_neon
> +	.type	merge8_arm64_neon, %function
> +	// void merge8_arm64_neon(void *dest, const void *src1,
> +	//                        const void *src2, size_t size)
> +	//
> +	// Stores to dest the unsigned halving add, (a + b) >> 1 per byte,
> +	// of the bytes read from src1 and src2.
> +	//
> +	// In:    x0 = dest, x1 = src1, x2 = src2, x3 = size in bytes
> +	// Clobb: x0-x2 (advanced), x5, x10-x12, v0-v7, condition flags
> +	//
> +	// size is consumed as whole 64-byte blocks, then an optional 32-byte
> +	// tail (bit 5 of size) and an optional 16-byte tail (bit 4 of size).
> +	// The low four bits of size are never examined.
> +	// NOTE: Offset and pitch must be multiple of 16-bytes in VLC.
> +merge8_arm64_neon:
> +	ands		x5, SIZE, #~63		// x5 = bytes covered by whole 64-byte blocks
> +	b.eq		2f			// no full block: go straight to the tails
> +	mov		x10, #64		// post-increment stride of all four loads
> +	add		x11, SRC1, #32		// x11/x12 walk the upper 32 bytes of each block
> +	add		x12, SRC2, #32
> +1:
> +	ld1		{v0.16b,v1.16b}, [SRC1], x10
> +	ld1		{v4.16b,v5.16b}, [SRC2], x10
> +	ld1		{v2.16b,v3.16b}, [x11], x10
> +	uhadd		v0.16b, v0.16b, v4.16b
> +	ld1		{v6.16b,v7.16b}, [x12], x10
> +	subs		x5,  x5,  #64		// flags are read by b.gt; NEON ops below leave them alone
> +	// Each register pair is averaged exactly once.
> +	uhadd		v1.16b, v1.16b, v5.16b
> +	uhadd		v2.16b, v2.16b, v6.16b
> +	uhadd		v3.16b, v3.16b, v7.16b
> +	st1		{v0.16b,v1.16b}, [DEST], #32
> +	st1		{v2.16b,v3.16b}, [DEST], #32
> +	b.gt		1b
> +2:
> +	tbz		SIZE, #5, 3f		// bit 5 (value 32) clear: no 32-byte tail
> +	ld1		{v0.16b,v1.16b}, [SRC1], #32
> +	ld1		{v4.16b,v5.16b}, [SRC2], #32
> +	uhadd		v0.16b, v0.16b, v4.16b
> +	uhadd		v1.16b, v1.16b, v5.16b
> +	st1		{v0.16b,v1.16b}, [DEST], #32
> +3:
> +	tbz		SIZE, #4, 4f		// bit 4 (value 16) clear: no 16-byte tail
> +	ld1		{v0.16b},  [SRC1]
> +	ld1		{v4.16b},  [SRC2]
> +	uhadd		v0.16b, v0.16b, v4.16b
> +	st1		{v0.16b},  [DEST]
> +4:
> +	ret
> +
> +	.align 2
> +	.global merge16_arm64_neon
> +	.type	merge16_arm64_neon, %function
> +	// void merge16_arm64_neon(void *dest, const void *src1,
> +	//                         const void *src2, size_t size)
> +	//
> +	// Stores to dest the unsigned halving add, (a + b) >> 1 per 16-bit
> +	// lane, of the data read from src1 and src2.
> +	//
> +	// In:    x0 = dest, x1 = src1, x2 = src2, x3 = size in bytes
> +	// Clobb: x0-x2 (advanced), x5, v0-v7, condition flags
> +	//
> +	// size is consumed as whole 64-byte blocks, then an optional 32-byte
> +	// tail (bit 5 of size) and an optional 16-byte tail (bit 4 of size).
> +	// The low four bits of size are never examined.
> +merge16_arm64_neon:
> +	ands		x5, SIZE, #~63		// x5 = bytes covered by whole 64-byte blocks
> +	b.eq		2f			// no full block: go straight to the tails
> +1:
> +	ld1		{v0.8h,v1.8h}, [SRC1], #32
> +	ld1		{v4.8h,v5.8h}, [SRC2], #32
> +	ld1		{v2.8h,v3.8h}, [SRC1], #32
> +	uhadd		v0.8h,  v0.8h,  v4.8h
> +	ld1		{v6.8h,v7.8h}, [SRC2], #32
> +	uhadd		v1.8h,  v1.8h,  v5.8h
> +	uhadd		v2.8h,  v2.8h,  v6.8h
> +	uhadd		v3.8h,  v3.8h,  v7.8h
> +	st1		{v0.8h,v1.8h}, [DEST], #32
> +	st1		{v2.8h,v3.8h}, [DEST], #32
> +	subs		x5,  x5,  #64		// one 64-byte block done
> +	b.gt		1b
> +2:
> +	tbz		SIZE, #5, 3f		// bit 5 (value 32) clear: no 32-byte tail
> +	ld1		{v0.8h,v1.8h}, [SRC1], #32
> +	ld1		{v4.8h,v5.8h}, [SRC2], #32
> +	uhadd		v0.8h,  v0.8h,  v4.8h
> +	uhadd		v1.8h,  v1.8h,  v5.8h
> +	st1		{v0.8h,v1.8h}, [DEST], #32
> +3:
> +	tbz		SIZE, #4, 4f		// bit 4 (value 16) clear: no 16-byte tail
> +	ld1		{v0.8h},  [SRC1]
> +	ld1		{v4.8h},  [SRC2]
> +	uhadd		v0.8h,  v0.8h,  v4.8h
> +	st1		{v0.8h},  [DEST]
> +4:
> +	ret
> -- 
> 2.9.2
> 
> _______________________________________________
> vlc-devel mailing list
> To unsubscribe or modify your subscription options:
> https://mailman.videolan.org/listinfo/vlc-devel



More information about the vlc-devel mailing list