[vlc-devel] [PATCH] arm_neon: Add an optimized routine for NV12/21 to I420/YV12

Tue Oct 1 18:13:53 CEST 2013

On Tue, 1 Oct 2013, Rémi Denis-Courmont wrote:

> Le mardi 1 octobre 2013 13:31:29 Martin Storsjö a écrit :
>> This avoids hitting swscale for this conversion, for hw decoders
>> that return NV12/21 in combination with the android vout in YUV
>> mode.
>> ---
>> Copying the luma plane using memcpy, added support for YV12 as well,
>> as suggested by Rémi.
>> ---
>>  modules/arm_neon/Makefile.am   |    1 +
>>  modules/arm_neon/chroma_neon.h |    5 +++
>>  modules/arm_neon/chroma_yuv.c  |   85
>> ++++++++++++++++++++++++++++++++++++++++ modules/arm_neon/nv12_i420.S   | 
>> 62 +++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+)
>>  create mode 100644 modules/arm_neon/nv12_i420.S
>> 
>> diff --git a/modules/arm_neon/Makefile.am b/modules/arm_neon/Makefile.am
>> index 212605f..8978b75 100644
>> --- a/modules/arm_neon/Makefile.am
>> +++ b/modules/arm_neon/Makefile.am
>> @@ -10,6 +10,7 @@ libchroma_yuv_neon_plugin_la_SOURCES = \
>>  	arm_neon/i420_yuyv.S \
>>  	arm_neon/i422_yuyv.S \
>>  	arm_neon/yuyv_i422.S \
>> +	arm_neon/nv12_i420.S \
>>  	arm_neon/chroma_yuv.c arm_neon/chroma_neon.h
>>  libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
>>  libchroma_yuv_neon_plugin_LIBTOOLFLAGS = --tag=CC
>> diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
>> index 865315a..877d011 100644
>> --- a/modules/arm_neon/chroma_neon.h
>> +++ b/modules/arm_neon/chroma_neon.h
>> @@ -67,6 +67,11 @@ void uyvy_i422_neon (struct yuv_planes *const out,
>>                       const struct yuv_pack *const in,
>>                       int width, int height) asm("uyvy_i422_neon");
>> 
>> +/* NV12 to I420 conversion. */
>> +void nv12_i420_neon (struct yuv_planes *const out,
>> +                     const struct yuv_planes *const in,
>> +                     int width, int height) asm("nv12_i420_neon");
>> +
>>  /* I420 to RGBA conversion. */
>>  void i420_rgb_neon (struct yuv_pack *const out,
>>                      const struct yuv_planes *const in,
>> diff --git a/modules/arm_neon/chroma_yuv.c b/modules/arm_neon/chroma_yuv.c
>> index b54732e..aa72a0e 100644
>> --- a/modules/arm_neon/chroma_yuv.c
>> +++ b/modules/arm_neon/chroma_yuv.c
>> @@ -83,6 +83,62 @@ static void I420_VYUY (filter_t *filter, picture_t *src,
>> picture_t *dst) VIDEO_FILTER_WRAPPER (I420_VYUY)
>> 
>> 
>> +/* Semiplanar NV12/21 to planar I420/YV12 */
>> +static void copy_y_plane(filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> +    uint8_t *src_y = src->Y_PIXELS;
>> +    uint8_t *dst_y = dst->Y_PIXELS;
>> +    if (src->Y_PITCH == dst->Y_PITCH) {
>> +        memcpy(dst_y, src_y, dst->Y_PITCH * filter->fmt_in.video.i_height);
>> +    } else {
>> +        for (unsigned y = 0; y < filter->fmt_in.video.i_height; y++) {
>> +            memcpy(dst_y + dst->Y_PITCH * y, src_y + src->Y_PITCH * y,
>> +                   filter->fmt_in.video.i_width);
>> +        }
>> +    }
>> +}
>> +
>> +static void NV12_I420 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> +    DEFINE_PLANES(out, dst);
>> +    DEFINE_PLANES(in, src);
>> +    copy_y_plane (filter, src, dst);
>> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> +                    filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV12_I420)
>> +
>> +static void NV12_YV12 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> +    DEFINE_PLANES_SWAP(out, dst);
>> +    DEFINE_PLANES(in, src);
>> +    copy_y_plane (filter, src, dst);
>> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> +                    filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV12_YV12)
>> +
>> +static void NV21_I420 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> +    DEFINE_PLANES_SWAP(out, dst);
>> +    DEFINE_PLANES(in, src);
>> +    copy_y_plane (filter, src, dst);
>> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> +                    filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV21_I420)
>> +
>> +static void NV21_YV12 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> +    DEFINE_PLANES(out, dst);
>> +    DEFINE_PLANES(in, src);
>> +    copy_y_plane (filter, src, dst);
>> +    nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> +                    filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV21_YV12)
>
> Isn't this duplicate code?

Hmm, yeah. Would you prefer keeping NV12_I420 and NV12_YV12 as names and 
have the NV21 ones use those but flipped, or name them e.g. 
semiplanar_planar and semiplanar_planar_swapped or something similar?

>> +	.align 2
>> +	.global nv12_i420_neon
>> +	.type	nv12_i420_neon, %function
>> +nv12_i420_neon:
>
> Poor choice of function name, IMHO.

Would it be better with "semiplanar_to_planar" or so?

>> +	push		{r4-r8,r11,lr}
>> +	ldmia		r0,	{r0, U, V, OPITCH} @ first plane is unused
>> +	ldmia		r1,	{r1, UV, IPAD, IPITCH} @ first and third planes are 
> unused
>
> You could clobber two fewer registers.

Yeah, I'll resend with these fixed. I guess it's easiest to load only the 
struct members that I need, individually, instead of using ldmia with a 
bunch of unused registers.

// Martin