[vlc-devel] [PATCH] arm_neon: Add an optimized routine for NV12/21 to I420/YV12

Tue Oct 1 16:49:26 CEST 2013

Le mardi 1 octobre 2013 13:31:29 Martin Storsjö a écrit :
> This avoids hitting swscale for this conversion, for hw decoders
> that return NV12/21 in combination with the android vout in YUV
> mode.
> ---
> Copying the luma plane using memcpy, added support for YV12 as well,
> as suggested by Rémi.
> ---
>  modules/arm_neon/Makefile.am   |    1 +
>  modules/arm_neon/chroma_neon.h |    5 +++
>  modules/arm_neon/chroma_yuv.c  |   85 ++++++++++++++++++++++++++++++++++++++++
>  modules/arm_neon/nv12_i420.S   |   62 +++++++++++++++++++++++++++++
>  4 files changed, 153 insertions(+)
>  create mode 100644 modules/arm_neon/nv12_i420.S
> 
> diff --git a/modules/arm_neon/Makefile.am b/modules/arm_neon/Makefile.am
> index 212605f..8978b75 100644
> --- a/modules/arm_neon/Makefile.am
> +++ b/modules/arm_neon/Makefile.am
> @@ -10,6 +10,7 @@ libchroma_yuv_neon_plugin_la_SOURCES = \
>  	arm_neon/i420_yuyv.S \
>  	arm_neon/i422_yuyv.S \
>  	arm_neon/yuyv_i422.S \
> +	arm_neon/nv12_i420.S \
>  	arm_neon/chroma_yuv.c arm_neon/chroma_neon.h
>  libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
>  libchroma_yuv_neon_plugin_LIBTOOLFLAGS = --tag=CC
> diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
> index 865315a..877d011 100644
> --- a/modules/arm_neon/chroma_neon.h
> +++ b/modules/arm_neon/chroma_neon.h
> @@ -67,6 +67,11 @@ void uyvy_i422_neon (struct yuv_planes *const out,
>                       const struct yuv_pack *const in,
>                       int width, int height) asm("uyvy_i422_neon");
> 
> +/* NV12 to I420 conversion. */
> +void nv12_i420_neon (struct yuv_planes *const out,
> +                     const struct yuv_planes *const in,
> +                     int width, int height) asm("nv12_i420_neon");
> +
>  /* I420 to RGBA conversion. */
>  void i420_rgb_neon (struct yuv_pack *const out,
>                      const struct yuv_planes *const in,
> diff --git a/modules/arm_neon/chroma_yuv.c b/modules/arm_neon/chroma_yuv.c
> index b54732e..aa72a0e 100644
> --- a/modules/arm_neon/chroma_yuv.c
> +++ b/modules/arm_neon/chroma_yuv.c
> @@ -83,6 +83,62 @@ static void I420_VYUY (filter_t *filter, picture_t *src, picture_t *dst)
>  VIDEO_FILTER_WRAPPER (I420_VYUY)
> 
> 
> +/* Semiplanar NV12/21 to planar I420/YV12 */
> +static void copy_y_plane(filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    uint8_t *src_y = src->Y_PIXELS;
> +    uint8_t *dst_y = dst->Y_PIXELS;
> +    if (src->Y_PITCH == dst->Y_PITCH) {
> +        memcpy(dst_y, src_y, dst->Y_PITCH * filter->fmt_in.video.i_height);
> +    } else {
> +        for (unsigned y = 0; y < filter->fmt_in.video.i_height; y++) {
> +            memcpy(dst_y + dst->Y_PITCH * y, src_y + src->Y_PITCH * y,
> +                   filter->fmt_in.video.i_width);
> +        }
> +    }
> +}
> +
> +static void NV12_I420 (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    DEFINE_PLANES(out, dst);
> +    DEFINE_PLANES(in, src);
> +    copy_y_plane (filter, src, dst);
> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
> +                    filter->fmt_in.video.i_height);
> +}
> +VIDEO_FILTER_WRAPPER (NV12_I420)
> +
> +static void NV12_YV12 (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    DEFINE_PLANES_SWAP(out, dst);
> +    DEFINE_PLANES(in, src);
> +    copy_y_plane (filter, src, dst);
> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
> +                    filter->fmt_in.video.i_height);
> +}
> +VIDEO_FILTER_WRAPPER (NV12_YV12)
> +
> +static void NV21_I420 (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    DEFINE_PLANES_SWAP(out, dst);
> +    DEFINE_PLANES(in, src);
> +    copy_y_plane (filter, src, dst);
> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
> +                    filter->fmt_in.video.i_height);
> +}
> +VIDEO_FILTER_WRAPPER (NV21_I420)
> +
> +static void NV21_YV12 (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    DEFINE_PLANES(out, dst);
> +    DEFINE_PLANES(in, src);
> +    copy_y_plane (filter, src, dst);
> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
> +                    filter->fmt_in.video.i_height);
> +}
> +VIDEO_FILTER_WRAPPER (NV21_YV12)

Isn't this duplicate code? NV12_I420 and NV21_YV12 have identical bodies, and so do NV12_YV12 and NV21_I420.

> +
> +
>  /* Planar YUV422 to packed YUV422 */
>  static void I422_YUYV (filter_t *filter, picture_t *src, picture_t *dst)
>  {
> @@ -231,6 +287,35 @@ static int Open (vlc_object_t *obj)
>              }
>              break;
> 
> +        /* Semiplanar to planar */
> +        case VLC_CODEC_NV12:
> +            switch (filter->fmt_out.video.i_chroma)
> +            {
> +                case VLC_CODEC_I420:
> +                    filter->pf_video_filter = NV12_I420_Filter;
> +                    break;
> +                case VLC_CODEC_YV12:
> +                    filter->pf_video_filter = NV12_YV12_Filter;
> +                    break;
> +                default:
> +                    return VLC_EGENERIC;
> +            }
> +            break;
> +
> +        case VLC_CODEC_NV21:
> +            switch (filter->fmt_out.video.i_chroma)
> +            {
> +                case VLC_CODEC_I420:
> +                    filter->pf_video_filter = NV21_I420_Filter;
> +                    break;
> +                case VLC_CODEC_YV12:
> +                    filter->pf_video_filter = NV21_YV12_Filter;
> +                    break;
> +                default:
> +                    return VLC_EGENERIC;
> +            }
> +            break;
> +
>          /* Packed to planar */
>          case VLC_CODEC_YUYV:
>              switch (filter->fmt_out.video.i_chroma)
> diff --git a/modules/arm_neon/nv12_i420.S b/modules/arm_neon/nv12_i420.S
> new file mode 100644
> index 0000000..070594f
> --- /dev/null
> +++ b/modules/arm_neon/nv12_i420.S
> @@ -0,0 +1,62 @@
> + @*****************************************************************************
> + @ nv12_i420.S : ARM NEONv1 NV12 to I420 chroma conversion
> + @*****************************************************************************
> + @ Copyright (C) 2009-2011 Rémi Denis-Courmont
> + @ Copyright (C) 2013 Martin Storsjö
> + @
> + @ This program is free software; you can redistribute it and/or modify
> + @ it under the terms of the GNU Lesser General Public License as published by
> + @ the Free Software Foundation; either version 2.1 of the License, or
> + @ (at your option) any later version.
> + @
> + @ This program is distributed in the hope that it will be useful,
> + @ but WITHOUT ANY WARRANTY; without even the implied warranty of
> + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + @ GNU Lesser General Public License for more details.
> + @
> + @ You should have received a copy of the GNU Lesser General Public License
> + @ along with this program; if not, write to the Free Software Foundation,
> + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + @****************************************************************************/
> +
> +	.syntax unified
> +	.fpu neon
> +	.text
> +
> +#define WIDTH	r2
> +#define HEIGHT	r3
> +#define UV	r4
> +#define U	r5
> +#define V	r6
> +#define OPITCH	r7
> +#define IPAD	r8
> +#define OPAD	r11
> +#define COUNT	ip
> +#define IPITCH	lr
> +
> +	.align 2
> +	.global nv12_i420_neon
> +	.type	nv12_i420_neon, %function
> +nv12_i420_neon:

Poor choice of function name, IMHO.

> +	push		{r4-r8,r11,lr}
> +	ldmia		r0,	{r0, U, V, OPITCH} @ first plane is unused
> +	ldmia		r1,	{r1, UV, IPAD, IPITCH} @ first and third planes are unused

You could clobber two fewer registers.

> +	cmp		HEIGHT,	#0
> +	sub		IPAD,	IPITCH,	WIDTH
> +	sub		OPAD,	OPITCH,	WIDTH
> +1:
> +	movsgt		COUNT,	WIDTH
> +	pople		{r4-r8,r11,pc}
> +2:
> +	pld		[UV, #64]
> +	vld2.u8		{d0, d1},	[UV,:128]!
> +	subs		COUNT,	COUNT,	#16
> +	vst1.u8		{d0},		[U,:64]!
> +	vst1.u8		{d1},		[V,:64]!
> +	bgt		2b
> +
> +	subs		HEIGHT,	#2
> +	add		UV,	UV,	IPAD
> +	add		U,	U,	OPAD,	lsr #1
> +	add		V,	V,	OPAD,	lsr #1
> +	b		1b
-- 
Rémi Denis-Courmont
http://www.remlab.net/