[vlc-devel] [RFC] [PATCH] deinterlace: rewrite ARM optimizations for 8-bit merge

Fri Aug 3 09:34:08 CEST 2012

 - Assembler code out of line
 - ARM NEON run-time detection
 - Better choice of registers
 - Prefetching
 - ARMv6 SIMD optimizations where Advanced SIMD is not available
   (not yet in use)

Scheduling is not completely optimal.
A 16-bit merge could easily be added later.
---
 modules/video_filter/Modules.am              |    3 +
 modules/video_filter/deinterlace/merge.c     |   63 ++---------------
 modules/video_filter/deinterlace/merge_arm.S |   94 ++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 58 deletions(-)
 create mode 100644 modules/video_filter/deinterlace/merge_arm.S

diff --git a/modules/video_filter/Modules.am b/modules/video_filter/Modules.am
index 2d2e2a2..1301123 100644
--- a/modules/video_filter/Modules.am
+++ b/modules/video_filter/Modules.am
@@ -28,6 +28,9 @@ libdeinterlace_plugin_la_SOURCES = \
 	deinterlace/yadif.h deinterlace/yadif_template.h \
 	deinterlace/algo_phosphor.c deinterlace/algo_phosphor.h \
 	deinterlace/algo_ivtc.c deinterlace/algo_ivtc.h
+if HAVE_NEON
+libdeinterlace_plugin_la_SOURCES += deinterlace/merge_arm.S
+endif
 libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
 libdeinterlace_plugin_la_LIBADD = $(AM_LIBADD)
 libdeinterlace_plugin_la_DEPENDENCIES =
diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c
index 06c0334..be86450 100644
--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -243,64 +243,11 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
 }
 #endif
 
-#ifdef __ARM_NEON__
-void MergeNEON (void *restrict out, const void *in1,
-                const void *in2, size_t n)
-{
-    uint8_t *outp = out;
-    const uint8_t *in1p = in1;
-    const uint8_t *in2p = in2;
-    size_t mis = __MIN((16 - ((uintptr_t)outp & 15)) & 15, n);
-
-    if (mis)
-    {
-        Merge8BitGeneric (outp, in1p, in2p, mis);
-        outp += mis;
-        in1p += mis;
-        in2p += mis;
-        n -= mis;
-    }
-
-    uint8_t *end = outp + (n & ~15);
-
-    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
-        while (outp < end)
-            asm volatile (
-                "vld1.u8  {q0-q1}, [%[in1]]!\n"
-                "vld1.u8  {q2-q3}, [%[in2]]!\n"
-                "vhadd.u8 q4, q0, q2\n"
-                "vld1.u8  {q6-q7}, [%[in1]]!\n"
-                "vhadd.u8 q5, q1, q3\n"
-                "vld1.u8  {q8-q9}, [%[in2]]!\n"
-                "vhadd.u8 q10, q6, q8\n"
-                "vhadd.u8 q11, q7, q9\n"
-                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
-                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
-                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-                  "q8", "q9", "q10", "q11", "memory");
-    else
-         while (outp < end)
-            asm volatile (
-                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
-                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
-                "vhadd.u8 q4, q0, q2\n"
-                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
-                "vhadd.u8 q5, q1, q3\n"
-                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
-                "vhadd.u8 q10, q6, q8\n"
-                "vhadd.u8 q11, q7, q9\n"
-                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
-                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
-                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-                  "q8", "q9", "q10", "q11", "memory");
-    n &= 15;
-    if (n)
-        Merge8BitGeneric (outp, in1p, in2p, n);
-}
+#ifdef __arm__ /* out-of-line routines implemented in deinterlace/merge_arm.S */
+void MergeNEON(void *restrict out, const void *in1, const void *in2, size_t n)
+    asm("merge8_arm_neon"); /* label must equal the .global symbol in merge_arm.S */
+void MergeARMv6(void *restrict out, const void *in1, const void *in2, size_t n)
+    asm("merge8_armv6"); /* label must equal the .global symbol in merge_arm.S */
 #endif
 
 /*****************************************************************************
diff --git a/modules/video_filter/deinterlace/merge_arm.S b/modules/video_filter/deinterlace/merge_arm.S
new file mode 100644
index 0000000..80c652b
--- /dev/null
+++ b/modules/video_filter/deinterlace/merge_arm.S
@@ -0,0 +1,94 @@
+ @*****************************************************************************
+ @ merge_arm.S : ARM NEON and ARMv6 SIMD 8-bit merge (mean) for deinterlacing
+ @*****************************************************************************
+ @ Copyright (C) 2009-2012 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @*****************************************************************************
+
+	.syntax	unified		@ unified (UAL) mnemonics, condition suffix last (e.g. bxlo)
+	.arm			@ ARM encoding, not Thumb
+	.arch	armv6		@ baseline architecture the assembler accepts for this file
+	.fpu	neon		@ additionally accept the Advanced SIMD (v*) instructions
+	.text
+
+#define	DEST	r0	/* 1st argument: output pointer */
+#define	SRC1	r1	/* 2nd argument: first input pointer */
+#define	SRC2	r2	/* 3rd argument: second input pointer */
+#define	SIZE	r3	/* 4th argument: byte count, decremented as data is consumed */
+
+	.align 2			@ 4-byte alignment for the ARM-mode entry point
+	.global merge8_arm_neon		@ C prototype: (void *out, const void *in1, const void *in2, size_t n)
+	.type	merge8_arm_neon, %function
+	@ NOTE: VLC keeps offsets and pitches multiples of 16 bytes; the :128 hints below rely on it.
+merge8_arm_neon:			@ out[i] = (in1[i] + in2[i]) >> 1, in whole 16-byte groups
+	cmp		SIZE,	#64
+	blo		2f		@ fewer than 64 bytes: skip the main loop
+1:					@ main loop: 64 bytes per pass
+	pld		[SRC1, #64]	@ prefetch 64 bytes ahead of the first input
+	vld1.u8		{q0-q1},	[SRC1,:128]!	@ only q0-q3/q8-q11 are used, so callee-saved q4-q7 need no saving
+	pld		[SRC2, #64]	@ prefetch 64 bytes ahead of the second input
+	vld1.u8		{q8-q9},	[SRC2,:128]!
+	vhadd.u8	q0,	q0,	q8	@ unsigned halving add per byte: (a + b) >> 1
+	sub		SIZE,	SIZE,	#64	@ integer work is interleaved between the NEON instructions
+	vld1.u8		{q2-q3},	[SRC1,:128]!
+	vhadd.u8	q1,	q1,	q9
+	vld1.u8		{q10-q11},	[SRC2,:128]!
+	vhadd.u8	q2,	q2,	q10
+	cmp		SIZE,	#64	@ the NEON instructions below leave these flags intact for bhs
+	vhadd.u8	q3,	q3,	q11
+	vst1.u8		{q0-q1},	[DEST,:128]!
+	vst1.u8		{q2-q3},	[DEST,:128]!
+	bhs		1b		@ repeat while at least 64 bytes remain
+2:					@ here fewer than 64 bytes remain
+	cmp		SIZE,	#32
+	blo		3f		@ fewer than 32 bytes: skip the 32-byte step
+	vld1.u8		{q0-q1},	[SRC1,:128]!
+	sub		SIZE,	SIZE,	#32
+	vld1.u8		{q8-q9},	[SRC2,:128]!
+	vhadd.u8	q0,	q0,	q8
+	vhadd.u8	q1,	q1,	q9
+	vst1.u8		{q0-q1},	[DEST,:128]!
+3:					@ here fewer than 32 bytes remain
+	cmp		SIZE,	#16
+	bxlo		lr		@ a tail shorter than 16 bytes is left unprocessed
+	vld1.u8		{q0},		[SRC1,:128]!
+	sub		SIZE,	SIZE,	#16	@ the updated count is not read again before returning
+	vld1.u8		{q8},		[SRC2,:128]!
+	vhadd.u8	q0,	q0,	q8
+	vst1.u8		{q0},		[DEST,:128]!
+	bx		lr
+
+	.align 2			@ 4-byte alignment for the ARM-mode entry point
+	.global merge8_armv6		@ C prototype: (void *out, const void *in1, const void *in2, size_t n)
+	.type	merge8_armv6, %function
+merge8_armv6:				@ out[i] = (in1[i] + in2[i]) >> 1 for every whole 16-byte group of n
+	push		{r4-r9,lr}	@ r4-r9 are callee-saved; lr is reused as a scratch register
+	b		2f		@ test the count first: n below 16 must not touch memory
+1:	pld		[SRC1, #64]	@ loop body: one 16-byte group (four words per input) per pass
+	ldm		SRC1!,	{r4-r5}
+	pld		[SRC2, #64]
+	ldm		SRC2!,	{r8-r9}
+	uhadd8		r4,	r4,	r8	@ four unsigned byte lanes per register, halving add
+	ldm		SRC1!,	{r6-r7}
+	uhadd8		r5,	r5,	r9
+	ldm		SRC2!,	{ip,lr}
+	uhadd8		r6,	r6,	ip
+	stm		DEST!,	{r4-r5}
+	uhadd8		r7,	r7,	lr
+	stm		DEST!,	{r6-r7}
+2:	subs		SIZE,	SIZE,	#16	@ carry set (HS) means a whole group was still available
+	bhs		1b		@ borrow ends the loop, so 0 or a non-multiple of 16 cannot run away
+	pop		{r4-r9,pc}	@ restore and return; a tail shorter than 16 bytes is skipped, like the NEON routine
-- 
1.7.10.4