[vlc-devel] [PATCH] arm_neon: Add an optimized routine for NV12/21 to I420/YV12
Martin Storsjö
martin at martin.st
Tue Oct 1 12:31:29 CEST 2013
This avoids hitting swscale for this conversion, for hw decoders
that return NV12/21 in combination with the android vout in YUV
mode.
---
Changes since the previous version: the luma plane is now copied with
memcpy, and YV12 output is supported as well, as suggested by Rémi.
---
modules/arm_neon/Makefile.am | 1 +
modules/arm_neon/chroma_neon.h | 5 +++
modules/arm_neon/chroma_yuv.c | 85 ++++++++++++++++++++++++++++++++++++++++
modules/arm_neon/nv12_i420.S | 62 +++++++++++++++++++++++++++++
4 files changed, 153 insertions(+)
create mode 100644 modules/arm_neon/nv12_i420.S
diff --git a/modules/arm_neon/Makefile.am b/modules/arm_neon/Makefile.am
index 212605f..8978b75 100644
--- a/modules/arm_neon/Makefile.am
+++ b/modules/arm_neon/Makefile.am
@@ -10,6 +10,7 @@ libchroma_yuv_neon_plugin_la_SOURCES = \
arm_neon/i420_yuyv.S \
arm_neon/i422_yuyv.S \
arm_neon/yuyv_i422.S \
+ arm_neon/nv12_i420.S \
arm_neon/chroma_yuv.c arm_neon/chroma_neon.h
libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
libchroma_yuv_neon_plugin_LIBTOOLFLAGS = --tag=CC
diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
index 865315a..877d011 100644
--- a/modules/arm_neon/chroma_neon.h
+++ b/modules/arm_neon/chroma_neon.h
@@ -67,6 +67,11 @@ void uyvy_i422_neon (struct yuv_planes *const out,
const struct yuv_pack *const in,
int width, int height) asm("uyvy_i422_neon");
+/* NV12 to I420 conversion. */
+void nv12_i420_neon (struct yuv_planes *const out,
+ const struct yuv_planes *const in,
+ int width, int height) asm("nv12_i420_neon");
+
/* I420 to RGBA conversion. */
void i420_rgb_neon (struct yuv_pack *const out,
const struct yuv_planes *const in,
diff --git a/modules/arm_neon/chroma_yuv.c b/modules/arm_neon/chroma_yuv.c
index b54732e..aa72a0e 100644
--- a/modules/arm_neon/chroma_yuv.c
+++ b/modules/arm_neon/chroma_yuv.c
@@ -83,6 +83,62 @@ static void I420_VYUY (filter_t *filter, picture_t *src, picture_t *dst)
VIDEO_FILTER_WRAPPER (I420_VYUY)
+/* Semiplanar NV12/21 to planar I420/YV12 */
+static void copy_y_plane(filter_t *filter, picture_t *src, picture_t *dst) /* Copy the Y plane of src into dst unchanged. */
+{
+ uint8_t *src_y = src->Y_PIXELS;
+ uint8_t *dst_y = dst->Y_PIXELS;
+ if (src->Y_PITCH == dst->Y_PITCH) { /* equal pitches: one memcpy of pitch * i_height bytes covers every row */
+ memcpy(dst_y, src_y, dst->Y_PITCH * filter->fmt_in.video.i_height);
+ } else { /* differing pitches: copy i_width bytes per row, stepping each side by its own pitch */
+ for (unsigned y = 0; y < filter->fmt_in.video.i_height; y++) {
+ memcpy(dst_y + dst->Y_PITCH * y, src_y + src->Y_PITCH * y,
+ filter->fmt_in.video.i_width);
+ }
+ }
+}
+
+static void NV12_I420 (filter_t *filter, picture_t *src, picture_t *dst) /* Convert an NV12 picture (src) into an I420 picture (dst). */
+{
+ DEFINE_PLANES(out, dst);
+ DEFINE_PLANES(in, src);
+ copy_y_plane (filter, src, dst); /* luma is copied as-is; nv12_i420_neon leaves the first plane alone */
+ nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
+ filter->fmt_in.video.i_height); /* de-interleaves the chroma plane of in into the two chroma planes of out */
+}
+VIDEO_FILTER_WRAPPER (NV12_I420) /* presumably provides the NV12_I420_Filter entry point that Open() installs - confirm against the macro */
+
+static void NV12_YV12 (filter_t *filter, picture_t *src, picture_t *dst) /* Convert an NV12 picture (src) into a YV12 picture (dst). */
+{
+ DEFINE_PLANES_SWAP(out, dst); /* presumably exchanges the U and V plane pointers of dst to match YV12 plane order - confirm against the macro */
+ DEFINE_PLANES(in, src);
+ copy_y_plane (filter, src, dst); /* luma is copied as-is; nv12_i420_neon leaves the first plane alone */
+ nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
+ filter->fmt_in.video.i_height); /* de-interleaves the chroma plane of in into the two chroma planes of out */
+}
+VIDEO_FILTER_WRAPPER (NV12_YV12) /* presumably provides the NV12_YV12_Filter entry point that Open() installs - confirm against the macro */
+
+static void NV21_I420 (filter_t *filter, picture_t *src, picture_t *dst) /* Convert an NV21 picture (src) into an I420 picture (dst). */
+{
+ DEFINE_PLANES_SWAP(out, dst); /* presumably swapped because NV21 interleaves chroma in the opposite order to NV12 - confirm against the macro */
+ DEFINE_PLANES(in, src);
+ copy_y_plane (filter, src, dst); /* luma is copied as-is; nv12_i420_neon leaves the first plane alone */
+ nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
+ filter->fmt_in.video.i_height); /* de-interleaves the chroma plane of in into the two chroma planes of out */
+}
+VIDEO_FILTER_WRAPPER (NV21_I420) /* presumably provides the NV21_I420_Filter entry point that Open() installs - confirm against the macro */
+
+static void NV21_YV12 (filter_t *filter, picture_t *src, picture_t *dst) /* Convert an NV21 picture (src) into a YV12 picture (dst). */
+{
+ DEFINE_PLANES(out, dst); /* no swap here, unlike NV21_I420; presumably NV21 chroma order already matches YV12 plane order - confirm */
+ DEFINE_PLANES(in, src);
+ copy_y_plane (filter, src, dst); /* luma is copied as-is; nv12_i420_neon leaves the first plane alone */
+ nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
+ filter->fmt_in.video.i_height); /* de-interleaves the chroma plane of in into the two chroma planes of out */
+}
+VIDEO_FILTER_WRAPPER (NV21_YV12) /* presumably provides the NV21_YV12_Filter entry point that Open() installs - confirm against the macro */
+
+
/* Planar YUV422 to packed YUV422 */
static void I422_YUYV (filter_t *filter, picture_t *src, picture_t *dst)
{
@@ -231,6 +287,35 @@ static int Open (vlc_object_t *obj)
}
break;
+ /* Semiplanar to planar */
+ case VLC_CODEC_NV12:
+ switch (filter->fmt_out.video.i_chroma)
+ {
+ case VLC_CODEC_I420:
+ filter->pf_video_filter = NV12_I420_Filter;
+ break;
+ case VLC_CODEC_YV12:
+ filter->pf_video_filter = NV12_YV12_Filter;
+ break;
+ default:
+ return VLC_EGENERIC;
+ }
+ break;
+
+ case VLC_CODEC_NV21:
+ switch (filter->fmt_out.video.i_chroma)
+ {
+ case VLC_CODEC_I420:
+ filter->pf_video_filter = NV21_I420_Filter;
+ break;
+ case VLC_CODEC_YV12:
+ filter->pf_video_filter = NV21_YV12_Filter;
+ break;
+ default:
+ return VLC_EGENERIC;
+ }
+ break;
+
/* Packed to planar */
case VLC_CODEC_YUYV:
switch (filter->fmt_out.video.i_chroma)
diff --git a/modules/arm_neon/nv12_i420.S b/modules/arm_neon/nv12_i420.S
new file mode 100644
index 0000000..070594f
--- /dev/null
+++ b/modules/arm_neon/nv12_i420.S
@@ -0,0 +1,62 @@
+ @*****************************************************************************
+ @ nv12_i420.S : ARM NEONv1 NV12 to I420 chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009-2011 Rémi Denis-Courmont
+ @ Copyright (C) 2013 Martin Storsjö
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+ .syntax unified
+ .fpu neon
+ .text
+
+#define WIDTH r2
+#define HEIGHT r3
+#define UV r4
+#define U r5
+#define V r6
+#define OPITCH r7
+#define IPAD r8
+#define OPAD r11
+#define COUNT ip
+#define IPITCH lr
+
+ .align 2
+ .global nv12_i420_neon
+ .type nv12_i420_neon, %function
+nv12_i420_neon: @ per the C prototype: r0 = out planes, r1 = in planes, r2 = width, r3 = height
+ push {r4-r8,r11,lr}
+ ldmia r0, {r0, U, V, OPITCH} @ first plane is unused
+ ldmia r1, {r1, UV, IPAD, IPITCH} @ first and third planes are unused
+ cmp HEIGHT, #0 @ sets the flags tested by the conditional instructions at label 1
+ sub IPAD, IPITCH, WIDTH @ input bytes left in a row after WIDTH bytes
+ sub OPAD, OPITCH, WIDTH @ applied halved (lsr #1) to U and V at the end of each row
+1:
+ movsgt COUNT, WIDTH @ GT: start a row, COUNT = bytes of interleaved chroma to process
+ pople {r4-r8,r11,pc} @ LE: restore the saved registers and return
+2:
+ pld [UV, #64]
+ vld2.u8 {d0, d1}, [UV,:128]! @ load 16 bytes de-interleaved: even bytes to d0, odd bytes to d1
+ subs COUNT, COUNT, #16
+ vst1.u8 {d0}, [U,:64]! @ store 8 bytes to the U pointer
+ vst1.u8 {d1}, [V,:64]! @ store 8 bytes to the V pointer
+ bgt 2b @ NOTE(review): 16 bytes per pass; if WIDTH is not a multiple of 16 the pointers overshoot WIDTH before the pads are added - confirm callers guarantee it
+
+ subs HEIGHT, #2 @ each processed chroma row accounts for two picture rows
+ add UV, UV, IPAD
+ add U, U, OPAD, lsr #1
+ add V, V, OPAD, lsr #1
+ b 1b
--
1.7.9.4
More information about the vlc-devel
mailing list