[vlc-devel] [PATCH] arm_neon: Add an optimized routine for NV12/21 to I420

Martin Storsjö martin at martin.st
Mon Sep 30 10:50:16 CEST 2013


This avoids falling back to swscale for the NV12/NV21 to I420 conversion
when a hardware decoder returns NV12/NV21 and the Android vout runs in
YUV mode.
---
 modules/arm_neon/Makefile.am   |    1 +
 modules/arm_neon/chroma_neon.h |    5 +++
 modules/arm_neon/chroma_yuv.c  |   43 ++++++++++++++++++++++
 modules/arm_neon/nv12_i420.S   |   79 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+)
 create mode 100644 modules/arm_neon/nv12_i420.S

diff --git a/modules/arm_neon/Makefile.am b/modules/arm_neon/Makefile.am
index 212605f..8978b75 100644
--- a/modules/arm_neon/Makefile.am
+++ b/modules/arm_neon/Makefile.am
@@ -10,6 +10,7 @@ libchroma_yuv_neon_plugin_la_SOURCES = \
 	arm_neon/i420_yuyv.S \
 	arm_neon/i422_yuyv.S \
 	arm_neon/yuyv_i422.S \
+	arm_neon/nv12_i420.S \
 	arm_neon/chroma_yuv.c arm_neon/chroma_neon.h
 libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
 libchroma_yuv_neon_plugin_LIBTOOLFLAGS = --tag=CC
diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
index 865315a..877d011 100644
--- a/modules/arm_neon/chroma_neon.h
+++ b/modules/arm_neon/chroma_neon.h
@@ -67,6 +67,11 @@ void uyvy_i422_neon (struct yuv_planes *const out,
                      const struct yuv_pack *const in,
                      int width, int height) asm("uyvy_i422_neon");
 
+/* NV12 to I420 conversion; the third plane pointer of "in" is not used. */
+void nv12_i420_neon (struct yuv_planes *const out,
+                     const struct yuv_planes *const in,
+                     int width, int height) asm("nv12_i420_neon");
+
 /* I420 to RGBA conversion. */
 void i420_rgb_neon (struct yuv_pack *const out,
                     const struct yuv_planes *const in,
diff --git a/modules/arm_neon/chroma_yuv.c b/modules/arm_neon/chroma_yuv.c
index b54732e..ff933a8 100644
--- a/modules/arm_neon/chroma_yuv.c
+++ b/modules/arm_neon/chroma_yuv.c
@@ -83,6 +83,26 @@ static void I420_VYUY (filter_t *filter, picture_t *src, picture_t *dst)
 VIDEO_FILTER_WRAPPER (I420_VYUY)
 
 
+/* Semiplanar NV12/21 to planar I420 */
+static void NV12_I420 (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    DEFINE_PLANES(in, src);     /* source: Y plane + interleaved chroma */
+    DEFINE_PLANES(out, dst);    /* destination: separate Y, U and V planes */
+    const int rows = filter->fmt_in.video.i_height;
+    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width, rows);
+}
+VIDEO_FILTER_WRAPPER (NV12_I420)
+
+static void NV21_I420 (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    DEFINE_PLANES(in, src);        /* source: Y plane + interleaved chroma */
+    DEFINE_PLANES_SWAP(out, dst);  /* presumably swaps U/V: NV21 is V-first */
+    const int rows = filter->fmt_in.video.i_height;
+    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width, rows);
+}
+VIDEO_FILTER_WRAPPER (NV21_I420)
+
+
 /* Planar YUV422 to packed YUV422 */
 static void I422_YUYV (filter_t *filter, picture_t *src, picture_t *dst)
 {
@@ -231,6 +251,29 @@ static int Open (vlc_object_t *obj)
             }
             break;
 
+        /* Semiplanar to planar */
+        case VLC_CODEC_NV12:
+            switch (filter->fmt_out.video.i_chroma)
+            {
+                case VLC_CODEC_I420:
+                    filter->pf_video_filter = NV12_I420_Filter;
+                    break;
+                default:
+                    return VLC_EGENERIC;
+            }
+            break;
+
+        case VLC_CODEC_NV21:
+            switch (filter->fmt_out.video.i_chroma)
+            {
+                case VLC_CODEC_I420:
+                    filter->pf_video_filter = NV21_I420_Filter;
+                    break;
+                default:
+                    return VLC_EGENERIC;
+            }
+            break;
+
         /* Packed to planar */
         case VLC_CODEC_YUYV:
             switch (filter->fmt_out.video.i_chroma)
diff --git a/modules/arm_neon/nv12_i420.S b/modules/arm_neon/nv12_i420.S
new file mode 100644
index 0000000..00bba8a
--- /dev/null
+++ b/modules/arm_neon/nv12_i420.S
@@ -0,0 +1,79 @@
+ @*****************************************************************************
+ @ nv12_i420.S : ARM NEONv1 NV12 to I420 chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009-2011 Rémi Denis-Courmont
+ @ Copyright (C) 2013 Martin Storsjö
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.syntax unified
+	.fpu neon
+	.text
+
+#define IY	r1	/* input luma pointer; replaces the in argument held in r1 */
+#define WIDTH	r2	/* width argument, in pixels */
+#define HEIGHT	r3	/* height argument; rows still to convert */
+#define UV	r4	/* input interleaved chroma pointer */
+#define OY	r5	/* output luma pointer */
+#define U	r6	/* output pointer fed from even chroma bytes */
+#define V	r7	/* output pointer fed from odd chroma bytes */
+#define OPITCH	r8	/* output pitch, fourth word loaded from out */
+#define IPAD	r10	/* input pitch minus width */
+#define OPAD	r11	/* output pitch minus width */
+#define COUNT	ip	/* columns left in the current row */
+#define IPITCH	lr	/* input pitch; lr is reusable once pushed */
+
+	.align 2
+	.global nv12_i420_neon
+	.type	nv12_i420_neon, %function
+nv12_i420_neon:	@ r0 = out planes, r1 = in planes, r2 = width, r3 = height
+	push		{r4-r8,r10-r11,lr}	@ save the registers clobbered below
+	ldmia		r0,	{OY, U, V, OPITCH}	@ four consecutive words of out
+	ldmia		r1,	{IY, UV, IPAD, IPITCH} @ third plane is unused; IPAD is recomputed below
+	cmp		HEIGHT,	#0	@ flags for the first pass through 1: (the two sub below keep them)
+	sub		IPAD,	IPITCH,	WIDTH	@ input pitch minus width
+	sub		OPAD,	OPITCH,	WIDTH	@ output pitch minus width
+1:	@ once per pair of rows; flags come from cmp above or subs HEIGHT below
+	movsgt		COUNT,	WIDTH	@ rows remain: reload column counter, flags now reflect WIDTH
+	pople		{r4-r8,r10-r11,pc}	@ no rows left, or width <= 0: restore and return
+2:	@ first row of the pair: luma and chroma, 16 pixels per iteration
+	pld		[IY, #16]	@ preload hint for the next luma bytes
+	vld1.u8		{q0},		[IY,:128]!	@ 16 luma bytes; :128 needs a 16-byte aligned address
+	pld		[UV, #16]	@ preload hint for the next chroma bytes
+	vld2.u8		{d2, d3},	[UV,:128]!	@ 16 chroma bytes split: even bytes to d2, odd bytes to d3
+	subs		COUNT,	COUNT,	#16	@ 16 columns done
+	vst1.u8		{q0},		[OY,:128]!	@ luma is copied unchanged
+	vst1.u8		{d2},		[U,:64]!	@ 8 bytes taken from even chroma positions
+	vst1.u8		{d3},		[V,:64]!	@ 8 bytes taken from odd chroma positions
+	bgt		2b	@ loop while columns remain
+
+	mov		COUNT,	WIDTH	@ restart the column counter for the second row
+	add		IY,	IY,	IPAD	@ skip input padding; NOTE(review): exact only if width is a multiple of 16
+	add		OY,	OY,	OPAD	@ skip output padding (same caveat)
+3:	@ second row of the pair: luma only, no chroma is read for it
+	pld		[IY, #16]	@ preload hint for the next luma bytes
+	vld1.u8		{q0},		[IY,:128]!	@ 16 luma bytes
+	subs		COUNT,	COUNT,	#16	@ 16 columns done
+	vst1.u8		{q0},		[OY,:128]!	@ luma is copied unchanged
+	bgt		3b	@ loop while columns remain
+
+	subs		HEIGHT,	#2	@ two rows consumed; sets the flags tested at 1: (adds below keep them)
+	add		IY,	IY,	IPAD	@ next input luma row
+	add		UV,	UV,	IPAD	@ next input chroma row, stepped with the luma padding
+	add		OY,	OY,	OPAD	@ next output luma row
+	add		U,	U,	OPAD,	lsr #1	@ half the luma padding; assumes chroma pitch is OPITCH / 2 - TODO confirm
+	add		V,	V,	OPAD,	lsr #1	@ same step for the other chroma plane
+	b		1b	@ back to the row-pair test
-- 
1.7.9.4




More information about the vlc-devel mailing list