[vlc-devel] [PATCH] arm_neon: Add an optimized routine for NV12/21 to I420
Rémi Denis-Courmont
remi at remlab.net
Mon Sep 30 16:41:07 CEST 2013
Le lundi 30 septembre 2013 11:50:16 Martin Storsjö a écrit :
> This avoids hitting swscale for this conversion, for hw decoders
> that return NV12/21 in combination with the android vout in YUV
> mode.
> ---
> modules/arm_neon/Makefile.am | 1 +
> modules/arm_neon/chroma_neon.h | 5 +++
> modules/arm_neon/chroma_yuv.c | 43 ++++++++++++++++++++++
> modules/arm_neon/nv12_i420.S | 79 ++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 128 insertions(+)
> create mode 100644 modules/arm_neon/nv12_i420.S
>
> diff --git a/modules/arm_neon/Makefile.am b/modules/arm_neon/Makefile.am
> index 212605f..8978b75 100644
> --- a/modules/arm_neon/Makefile.am
> +++ b/modules/arm_neon/Makefile.am
> @@ -10,6 +10,7 @@ libchroma_yuv_neon_plugin_la_SOURCES = \
> arm_neon/i420_yuyv.S \
> arm_neon/i422_yuyv.S \
> arm_neon/yuyv_i422.S \
> + arm_neon/nv12_i420.S \
> arm_neon/chroma_yuv.c arm_neon/chroma_neon.h
> libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
> libchroma_yuv_neon_plugin_LIBTOOLFLAGS = --tag=CC
> diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
> index 865315a..877d011 100644
> --- a/modules/arm_neon/chroma_neon.h
> +++ b/modules/arm_neon/chroma_neon.h
> @@ -67,6 +67,11 @@ void uyvy_i422_neon (struct yuv_planes *const out,
> const struct yuv_pack *const in,
> int width, int height) asm("uyvy_i422_neon");
>
> +/* NV12 to I420 conversion. */
> +void nv12_i420_neon (struct yuv_planes *const out,
> + const struct yuv_planes *const in,
> + int width, int height) asm("nv12_i420_neon");
> +
> /* I420 to RGBA conversion. */
> void i420_rgb_neon (struct yuv_pack *const out,
> const struct yuv_planes *const in,
> diff --git a/modules/arm_neon/chroma_yuv.c b/modules/arm_neon/chroma_yuv.c
> index b54732e..ff933a8 100644
> --- a/modules/arm_neon/chroma_yuv.c
> +++ b/modules/arm_neon/chroma_yuv.c
> @@ -83,6 +83,26 @@ static void I420_VYUY (filter_t *filter, picture_t *src,
> picture_t *dst) VIDEO_FILTER_WRAPPER (I420_VYUY)
>
>
> +/* Semiplanar NV12/21 to planar I420 */
> +static void NV12_I420 (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> + DEFINE_PLANES(out, dst);
> + DEFINE_PLANES(in, src);
> + nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
> + filter->fmt_in.video.i_height);
> +}
> +VIDEO_FILTER_WRAPPER (NV12_I420)
Regarding the luminance plane, in my experience, memcpy() optimizations beat
the crap out of a simplistic NEON load/store loop. memcpy() would basically
halve the complexity of the assembler code and improve data locality.
> +
> +static void NV21_I420 (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> + DEFINE_PLANES_SWAP(out, dst);
> + DEFINE_PLANES(in, src);
> + nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
> + filter->fmt_in.video.i_height);
> +}
> +VIDEO_FILTER_WRAPPER (NV21_I420)
> +
> +
> /* Planar YUV422 to packed YUV422 */
> static void I422_YUYV (filter_t *filter, picture_t *src, picture_t *dst)
> {
> @@ -231,6 +251,29 @@ static int Open (vlc_object_t *obj)
> }
> break;
>
> + /* Semiplanar to planar */
> + case VLC_CODEC_NV12:
> + switch (filter->fmt_out.video.i_chroma)
> + {
> + case VLC_CODEC_I420:
> + filter->pf_video_filter = NV12_I420_Filter;
> + break;
YV12 should be handled too.
> + default:
> + return VLC_EGENERIC;
> + }
> + break;
> +
> + case VLC_CODEC_NV21:
> + switch (filter->fmt_out.video.i_chroma)
> + {
> + case VLC_CODEC_I420:
> + filter->pf_video_filter = NV21_I420_Filter;
> + break;
Ditto.
> + default:
> + return VLC_EGENERIC;
> + }
> + break;
> +
> /* Packed to planar */
> case VLC_CODEC_YUYV:
> switch (filter->fmt_out.video.i_chroma)
> diff --git a/modules/arm_neon/nv12_i420.S b/modules/arm_neon/nv12_i420.S
> new file mode 100644
> index 0000000..00bba8a
> --- /dev/null
> +++ b/modules/arm_neon/nv12_i420.S
> @@ -0,0 +1,79 @@
> + @*****************************************************************************
> + @ nv12_i420.S : ARM NEONv1 NV12 to I420 chroma conversion
> + @*****************************************************************************
> + @ Copyright (C) 2009-2011 Rémi Denis-Courmont
> + @ Copyright (C) 2013 Martin Storsjö
> + @
> + @ This program is free software; you can redistribute it and/or modify
> + @ it under the terms of the GNU Lesser General Public License as published by
> + @ the Free Software Foundation; either version 2.1 of the License, or
> + @ (at your option) any later version.
> + @
> + @ This program is distributed in the hope that it will be useful,
> + @ but WITHOUT ANY WARRANTY; without even the implied warranty of
> + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + @ GNU Lesser General Public License for more details.
> + @
> + @ You should have received a copy of the GNU Lesser General Public License
> + @ along with this program; if not, write to the Free Software Foundation,
> + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + @****************************************************************************/
> + .syntax unified
> + .fpu neon
> + .text
> +
> +#define IY r1
> +#define WIDTH r2
> +#define HEIGHT r3
> +#define UV r4
> +#define OY r5
> +#define U r6
> +#define V r7
> +#define OPITCH r8
> +#define IPAD r10
> +#define OPAD r11
> +#define COUNT ip
> +#define IPITCH lr
> +
> + .align 2
> + .global nv12_i420_neon
> + .type nv12_i420_neon, %function
> +nv12_i420_neon:
> + push {r4-r8,r10-r11,lr}
> + ldmia r0, {OY, U, V, OPITCH}
> + ldmia r1, {IY, UV, IPAD, IPITCH} @ third plane is unused
> + cmp HEIGHT, #0
> + sub IPAD, IPITCH, WIDTH
> + sub OPAD, OPITCH, WIDTH
> +1:
> + movsgt COUNT, WIDTH
> + pople {r4-r8,r10-r11,pc}
> +2:
> + pld [IY, #16]
> + vld1.u8 {q0}, [IY,:128]!
> + pld [UV, #16]
> + vld2.u8 {d2, d3}, [UV,:128]!
> + subs COUNT, COUNT, #16
> + vst1.u8 {q0}, [OY,:128]!
> + vst1.u8 {d2}, [U,:64]!
> + vst1.u8 {d3}, [V,:64]!
> + bgt 2b
> +
> + mov COUNT, WIDTH
> + add IY, IY, IPAD
> + add OY, OY, OPAD
> +3:
> + pld [IY, #16]
> + vld1.u8 {q0}, [IY,:128]!
> + subs COUNT, COUNT, #16
> + vst1.u8 {q0}, [OY,:128]!
> + bgt 3b
> +
> + subs HEIGHT, #2
> + add IY, IY, IPAD
> + add UV, UV, IPAD
> + add OY, OY, OPAD
> + add U, U, OPAD, lsr #1
> + add V, V, OPAD, lsr #1
> + b 1b
--
Rémi Denis-Courmont
http://www.remlab.net/
More information about the vlc-devel
mailing list