[vlc-devel] [PATCH] arm_neon: Add an optimized routine for NV12/21 to I420/YV12
Martin Storsjö
martin at martin.st
Tue Oct 1 18:13:53 CEST 2013
On Tue, 1 Oct 2013, Rémi Denis-Courmont wrote:
> Le mardi 1 octobre 2013 13:31:29 Martin Storsjö a écrit :
>> This avoids hitting swscale for this conversion, for hw decoders
>> that return NV12/21 in combination with the android vout in YUV
>> mode.
>> ---
>> Copying the luma plane using memcpy, added support for YV12 as well,
>> as suggested by Rémi.
>> ---
>> modules/arm_neon/Makefile.am | 1 +
>> modules/arm_neon/chroma_neon.h | 5 +++
>> modules/arm_neon/chroma_yuv.c | 85
>> ++++++++++++++++++++++++++++++++++++++++ modules/arm_neon/nv12_i420.S |
>> 62 +++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+)
>> create mode 100644 modules/arm_neon/nv12_i420.S
>>
>> diff --git a/modules/arm_neon/Makefile.am b/modules/arm_neon/Makefile.am
>> index 212605f..8978b75 100644
>> --- a/modules/arm_neon/Makefile.am
>> +++ b/modules/arm_neon/Makefile.am
>> @@ -10,6 +10,7 @@ libchroma_yuv_neon_plugin_la_SOURCES = \
>> arm_neon/i420_yuyv.S \
>> arm_neon/i422_yuyv.S \
>> arm_neon/yuyv_i422.S \
>> + arm_neon/nv12_i420.S \
>> arm_neon/chroma_yuv.c arm_neon/chroma_neon.h
>> libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
>> libchroma_yuv_neon_plugin_LIBTOOLFLAGS = --tag=CC
>> diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
>> index 865315a..877d011 100644
>> --- a/modules/arm_neon/chroma_neon.h
>> +++ b/modules/arm_neon/chroma_neon.h
>> @@ -67,6 +67,11 @@ void uyvy_i422_neon (struct yuv_planes *const out,
>> const struct yuv_pack *const in,
>> int width, int height) asm("uyvy_i422_neon");
>>
>> +/* NV12 to I420 conversion. */
>> +void nv12_i420_neon (struct yuv_planes *const out,
>> + const struct yuv_planes *const in,
>> + int width, int height) asm("nv12_i420_neon");
>> +
>> /* I420 to RGBA conversion. */
>> void i420_rgb_neon (struct yuv_pack *const out,
>> const struct yuv_planes *const in,
>> diff --git a/modules/arm_neon/chroma_yuv.c b/modules/arm_neon/chroma_yuv.c
>> index b54732e..aa72a0e 100644
>> --- a/modules/arm_neon/chroma_yuv.c
>> +++ b/modules/arm_neon/chroma_yuv.c
>> @@ -83,6 +83,62 @@ static void I420_VYUY (filter_t *filter, picture_t *src,
>> picture_t *dst) VIDEO_FILTER_WRAPPER (I420_VYUY)
>>
>>
>> +/* Semiplanar NV12/21 to planar I420/YV12 */
>> +static void copy_y_plane(filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> + uint8_t *src_y = src->Y_PIXELS;
>> + uint8_t *dst_y = dst->Y_PIXELS;
>> + if (src->Y_PITCH == dst->Y_PITCH) {
>> + memcpy(dst_y, src_y, dst->Y_PITCH * filter->fmt_in.video.i_height);
>> + } else {
>> + for (unsigned y = 0; y < filter->fmt_in.video.i_height; y++) {
>> + memcpy(dst_y + dst->Y_PITCH * y, src_y + src->Y_PITCH * y,
>> + filter->fmt_in.video.i_width);
>> + }
>> + }
>> +}
>> +
>> +static void NV12_I420 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> + DEFINE_PLANES(out, dst);
>> + DEFINE_PLANES(in, src);
>> + copy_y_plane (filter, src, dst);
>> + nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> + filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV12_I420)
>> +
>> +static void NV12_YV12 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> + DEFINE_PLANES_SWAP(out, dst);
>> + DEFINE_PLANES(in, src);
>> + copy_y_plane (filter, src, dst);
>> + nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> + filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV12_YV12)
>> +
>> +static void NV21_I420 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> + DEFINE_PLANES_SWAP(out, dst);
>> + DEFINE_PLANES(in, src);
>> + copy_y_plane (filter, src, dst);
>> + nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> + filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV21_I420)
>> +
>> +static void NV21_YV12 (filter_t *filter, picture_t *src, picture_t *dst)
>> +{
>> + DEFINE_PLANES(out, dst);
>> + DEFINE_PLANES(in, src);
>> + copy_y_plane (filter, src, dst);
>> + nv12_i420_neon (&out, &in, filter->fmt_in.video.i_width,
>> + filter->fmt_in.video.i_height);
>> +}
>> +VIDEO_FILTER_WRAPPER (NV21_YV12)
>
> Isn't this duplicate code?
Hmm, yeah. Would you prefer keeping NV12_I420 and NV12_YV12 as names and
have the NV21 ones use those but flipped, or name them e.g.
semiplanar_planar and semiplanar_planar_swapped or something similar?
>> + .align 2
>> + .global nv12_i420_neon
>> + .type nv12_i420_neon, %function
>> +nv12_i420_neon:
>
> Poor choice of function name, IMHO.
Would it be better with "semiplanar_to_planar" or so?
>> + push {r4-r8,r11,lr}
>> + ldmia r0, {r0, U, V, OPITCH} @ first plane is unused
>> + ldmia r1, {r1, UV, IPAD, IPITCH} @ first and third planes are
> unused
>
> You could clobber two fewer registers.
Yeah, I'll resend with these fixed. I guess it's easiest to load only the
struct members that I need, individually, instead of using ldmia with a
bunch of unused registers.
// Martin
More information about the vlc-devel
mailing list