[vlc-devel] [PATCH] YUV->RGB convertor in NEON

Sun Jan 22 18:27:56 CET 2012

   Hello,

This patch was already reviewed and rejected months ago. Why is it even being 
proposed again? And why the heck did it get merged?

I AM REALLY FED UP WITH YOUR RECKLESS ATTITUDE.

On Thursday 19 January 2012, Sébastien Toque wrote:
> Signed-off-by: Rafaël Carré <funman at videolan.org>
> ---
>  modules/arm_neon/Modules.am    |   10 ++
>  modules/arm_neon/chroma_neon.h |   12 +++
>  modules/arm_neon/i420_rgb.S    |  209 ++++++++++++++++++++++++++++++++++++++++
>  modules/arm_neon/nv12_rgb.S    |  206 +++++++++++++++++++++++++++++++++++++++
>  modules/arm_neon/nv21_rgb.S    |  206 +++++++++++++++++++++++++++++++++++++++
>  modules/arm_neon/yuv_rgb.c     |  179 ++++++++++++++++++++++++++++++++++
>  6 files changed, 822 insertions(+), 0 deletions(-)
>  create mode 100644 modules/arm_neon/i420_rgb.S
>  create mode 100644 modules/arm_neon/nv12_rgb.S
>  create mode 100644 modules/arm_neon/nv21_rgb.S
>  create mode 100644 modules/arm_neon/yuv_rgb.c
> 
> diff --git a/modules/arm_neon/Modules.am b/modules/arm_neon/Modules.am
> index 83576eb..9dfd4ab 100644
> --- a/modules/arm_neon/Modules.am
> +++ b/modules/arm_neon/Modules.am
> @@ -18,7 +18,17 @@ libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
>  libchroma_yuv_neon_plugin_la_LIBADD = $(AM_LIBADD)
>  libchroma_yuv_neon_plugin_la_DEPENDENCIES =
> 
> +libyuv_rgb_neon_plugin_la_SOURCES = \
> +	i420_rgb.S \
> +	nv21_rgb.S \
> +	nv12_rgb.S \
> +	yuv_rgb.c
> +libyuv_rgb_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
> +libyuv_rgb_neon_plugin_la_LIBADD = $(AM_LIBADD)
> +libyuv_rgb_neon_plugin_la_DEPENDENCIES =
> +
>  libvlc_LTLIBRARIES += \
>  	libaudio_format_neon_plugin.la \
>  	libchroma_yuv_neon_plugin.la \
> +	libyuv_rgb_neon_plugin.la \
>  	$(NULL)
> diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
> index 204c5f1..3e867e3 100644
> --- a/modules/arm_neon/chroma_neon.h
> +++ b/modules/arm_neon/chroma_neon.h
> @@ -64,3 +64,15 @@ void yuyv_i422_neon (struct yuv_planes *const out,
>  /* UYVY to I422 conversion. */
>  void uyvy_i422_neon (struct yuv_planes *const out,
>                       const struct yuv_pack *const in, int width, int
> height); +
> +/* I420 to RGBA conversion. */
> +void i420_rgb_neon (struct yuv_pack *const out,
> +                    const struct yuv_planes *const in, int width, int
> height); +
> +/* NV21 to RGBA conversion. */
> +void nv21_rgb_neon (struct yuv_pack *const out,
> +                    const struct yuv_planes *const in, int width, int
> height); +
> +/* NV12 to RGBA conversion. */
> +void nv12_rgb_neon (struct yuv_pack *const out,
> +                    const struct yuv_planes *const in, int width, int
> height); diff --git a/modules/arm_neon/i420_rgb.S
> b/modules/arm_neon/i420_rgb.S new file mode 100644
> index 0000000..cc0caf3
> --- /dev/null
> +++ b/modules/arm_neon/i420_rgb.S
> @@ -0,0 +1,209 @@
> + @*****************************************************************************
> + @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
> + @*****************************************************************************
> + @ Copyright (C) 2011 Sébastien Toque
> + @                    Rémi Denis-Courmont
> + @
> + @ This program is free software; you can redistribute it and/or modify
> + @ it under the terms of the GNU General Public License as published by
> + @ the Free Software Foundation; either version 2 of the License, or
> + @ (at your option) any later version.
> + @
> + @ This program is distributed in the hope that it will be useful,
> + @ but WITHOUT ANY WARRANTY; without even the implied warranty of
> + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + @ GNU General Public License for more details.
> + @
> + @ You should have received a copy of the GNU General Public License
> + @ along with this program; if not, write to the Free Software Foundation,
> + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + @****************************************************************************/
> +
> +	.fpu neon
> +	.text
> +
> +/* ARM */
> +#define O1	r0
> +#define O2	r1
> +#define WIDTH	r2
> +#define HEIGHT	r3
> +#define Y1	r4
> +#define Y2	r5
> +#define U	r6
> +#define V	r7
> +#define YPITCH	r8
> +#define OPAD	r10
> +#define YPAD	r11
> +#define COUNT	ip
> +#define OPITCH	lr
> +
> +/* NEON */
> +#define coefY	D0
> +#define coefRV	D1
> +#define coefGU	D2
> +#define coefGV	D3
> +#define coefBU	D4
> +#define Rc	Q3
> +#define Gc	Q4
> +#define Bc	Q5
> +
> +#define u	D24
> +#define v	D25
> +#define y1	D28
> +#define y2	D29
> +
> +#define chro_r	Q6
> +#define chro_g	Q7
> +#define chro_b	Q8
> +#define red		Q9
> +#define green	Q10
> +#define blue	Q11
> +#define lumi	Q15
> +
> +#define red1	D24
> +#define green1	D25
> +#define blue1	D26
> +#define alpha1	D27
> +#define red2	D28
> +#define green2	D29
> +#define blue2	D30
> +#define alpha2	D31
> +
> +coefficients:
> +    .short  -15872
> +    .short    4992
> +    .short  -18432
> +
> +	.align
> +	.global i420_rgb_neon
> +	.type	i420_rgb_neon, %function
> +i420_rgb_neon:
> +	push		{r4-r8,r10-r11,lr}
> +	vpush		{q4-q7}
> +
> +	/* load arguments */
> +	ldmia		r0,	{O1, OPITCH}
> +	ldmia		r1,	{Y1, U, V, YPITCH}
> +
> +	/* round the width to be a multiple of 16 */
> +	ands		OPAD, WIDTH, #15
> +	sub			WIDTH, WIDTH, OPAD
> +	addne		WIDTH, WIDTH, #16
> +
> +	/* init constants (scale value by 64) */
> +	vmov.u8		coefY, #74
> +	vmov.u8		coefRV, #115
> +	vmov.u8		coefGU, #14
> +	vmov.u8		coefGV, #34
> +	vmov.u8		coefBU, #135
> +	adr			OPAD, coefficients
> +	vld1.s16	{d6[], d7[]}, [OPAD]!
> +	vld1.s16	{d8[], d9[]}, [OPAD]!
> +	vld1.s16	{d10[], d11[]}, [OPAD]!
> +	vmov.u8		alpha1, #255
> +
> +	/* init padding */
> +	cmp			HEIGHT,	#0
> +	sub			OPAD,	OPITCH,	WIDTH, lsl #2
> +	sub			YPAD,	YPITCH,	WIDTH
> +
> +loop_row:
> +	movgts	COUNT,	WIDTH
> +	add		O2,	O1,	OPITCH
> +	add		Y2,	Y1,	YPITCH
> +	/* exit if all rows have been processed */
> +	vpople	{q4-q7}
> +	pople	{r4-r8,r10-r11,pc}
> +
> +loop_col:
> +
> +	/* Common U & V */
> +
> +	vld1.u8	{u}, [U,:64]!
> +	vld1.u8	{v}, [V,:64]!
> +
> +	vmull.u8	chro_r, v, coefRV
> +	vmull.u8	chro_g, u, coefGU
> +	vmlal.u8	chro_g, v, coefGV
> +	vmull.u8	chro_b, u, coefBU
> +
> +	vadd.s16	chro_r, Rc, chro_r
> +	vsub.s16	chro_g, Gc, chro_g
> +	vadd.s16	chro_b, Bc, chro_b
> +
> +	PLD	[U]
> +	PLD	[V]
> +
> +	/* Y Top Row */
> +	vld2.u8	{y1,y2}, [Y1,:128]!
> +
> +	/* y1 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y1, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red1, red, #6
> +	vqrshrun.s16	green1, green, #6
> +	vqrshrun.s16	blue1, blue, #6
> +
> +	/* y2 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y2, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red2, red, #6
> +	vqrshrun.s16	green2, green, #6
> +	vqrshrun.s16	blue2, blue, #6
> +
> +	PLD	[Y1]
> +
> +	vmov.u8	alpha2, #255
> +	vzip.u8	red1, red2
> +	vzip.u8	green1, green2
> +	vzip.u8	blue1, blue2
> +
> +	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
> +	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
> +
> +	/* Y Bottom Row */
> +	vld2.u8	{y1,y2}, [Y2,:128]!
> +
> +	/* y1 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y1, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red1, red, #6
> +	vqrshrun.s16	green1, green, #6
> +	vqrshrun.s16	blue1, blue, #6
> +
> +	/* y2 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y2, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red2, red, #6
> +	vqrshrun.s16	green2, green, #6
> +	vqrshrun.s16	blue2, blue, #6
> +
> +	PLD	[Y2]
> +
> +	vmov.u8	alpha2, #255
> +	vzip.u8	red1, red2
> +	vzip.u8	green1, green2
> +	vzip.u8	blue1, blue2
> +
> +	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
> +	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
> +
> +	/* next columns (x16) */
> +	subs	COUNT,	COUNT,	#16
> +	bgt		loop_col
> +
> +	/* next rows (x2) */
> +	subs	HEIGHT,	#2
> +	add		O1,	O2,	OPAD
> +	add		Y1,	Y2,	YPAD
> +	add		U,	U,	YPAD,	lsr #1
> +	add		V,	V,	YPAD,	lsr #1
> +	b		loop_row
> diff --git a/modules/arm_neon/nv12_rgb.S b/modules/arm_neon/nv12_rgb.S
> new file mode 100644
> index 0000000..f4bb510
> --- /dev/null
> +++ b/modules/arm_neon/nv12_rgb.S
> @@ -0,0 +1,206 @@
> + @*****************************************************************************
> + @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
> + @*****************************************************************************
> + @ Copyright (C) 2011 Sébastien Toque
> + @                    Rémi Denis-Courmont
> + @
> + @ This program is free software; you can redistribute it and/or modify
> + @ it under the terms of the GNU General Public License as published by
> + @ the Free Software Foundation; either version 2 of the License, or
> + @ (at your option) any later version.
> + @
> + @ This program is distributed in the hope that it will be useful,
> + @ but WITHOUT ANY WARRANTY; without even the implied warranty of
> + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + @ GNU General Public License for more details.
> + @
> + @ You should have received a copy of the GNU General Public License
> + @ along with this program; if not, write to the Free Software Foundation,
> + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + @****************************************************************************/
> +
> +	.fpu neon
> +	.text
> +
> +/* ARM */
> +#define O1	r0
> +#define O2	r1
> +#define WIDTH	r2
> +#define HEIGHT	r3
> +#define Y1	r4
> +#define Y2	r5
> +#define U	r6
> +#define V	r7
> +#define YPITCH	r8
> +#define OPAD	r10
> +#define YPAD	r11
> +#define COUNT	ip
> +#define OPITCH	lr
> +
> +/* NEON */
> +#define coefY	D0
> +#define coefRV	D1
> +#define coefGU	D2
> +#define coefGV	D3
> +#define coefBU	D4
> +#define Rc	Q3
> +#define Gc	Q4
> +#define Bc	Q5
> +
> +#define u	D24
> +#define v	D25
> +#define y1	D28
> +#define y2	D29
> +
> +#define chro_r	Q6
> +#define chro_g	Q7
> +#define chro_b	Q8
> +#define red		Q9
> +#define green	Q10
> +#define blue	Q11
> +#define lumi	Q15
> +
> +#define red1	D24
> +#define green1	D25
> +#define blue1	D26
> +#define alpha1	D27
> +#define red2	D28
> +#define green2	D29
> +#define blue2	D30
> +#define alpha2	D31
> +
> +coefficients:
> +    .short  -15872
> +    .short    4992
> +    .short  -18432
> +
> +	.align
> +	.global nv12_rgb_neon
> +	.type	nv12_rgb_neon, %function
> +nv12_rgb_neon:
> +	push		{r4-r8,r10-r11,lr}
> +	vpush		{q4-q7}
> +
> +	/* load arguments */
> +	ldmia		r0,	{O1, OPITCH}
> +	ldmia		r1,	{Y1, U, V, YPITCH}
> +
> +	/* round the width to be a multiple of 16 */
> +	ands		OPAD, WIDTH, #15
> +	sub			WIDTH, WIDTH, OPAD
> +	addne		WIDTH, WIDTH, #16
> +
> +	/* init constants (scale value by 64) */
> +	vmov.u8		coefY, #74
> +	vmov.u8		coefRV, #115
> +	vmov.u8		coefGU, #14
> +	vmov.u8		coefGV, #34
> +	vmov.u8		coefBU, #135
> +	adr			OPAD, coefficients
> +	vld1.s16	{d6[], d7[]}, [OPAD]!
> +	vld1.s16	{d8[], d9[]}, [OPAD]!
> +	vld1.s16	{d10[], d11[]}, [OPAD]!
> +	vmov.u8		alpha1, #255
> +
> +	/* init padding */
> +	cmp			HEIGHT,	#0
> +	sub			OPAD,	OPITCH,	WIDTH, lsl #2
> +	sub			YPAD,	YPITCH,	WIDTH
> +
> +loop_row:
> +	movgts	COUNT,	WIDTH
> +	add		O2,	O1,	OPITCH
> +	add		Y2,	Y1,	YPITCH
> +	/* exit if all rows have been processed */
> +	vpople	{q4-q7}
> +	pople	{r4-r8,r10-r11,pc}
> +
> +loop_col:
> +
> +	/* Common U & V */
> +
> +	vld2.u8	{u,v}, [U,:128]!
> +
> +	vmull.u8	chro_r, v, coefRV
> +	vmull.u8	chro_g, u, coefGU
> +	vmlal.u8	chro_g, v, coefGV
> +	vmull.u8	chro_b, u, coefBU
> +
> +	vadd.s16	chro_r, Rc, chro_r
> +	vsub.s16	chro_g, Gc, chro_g
> +	vadd.s16	chro_b, Bc, chro_b
> +
> +	PLD	[U]
> +
> +	/* Y Top Row */
> +	vld2.u8	{y1,y2}, [Y1,:128]!
> +
> +	/* y1 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y1, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red1, red, #6
> +	vqrshrun.s16	green1, green, #6
> +	vqrshrun.s16	blue1, blue, #6
> +
> +	/* y2 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y2, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red2, red, #6
> +	vqrshrun.s16	green2, green, #6
> +	vqrshrun.s16	blue2, blue, #6
> +
> +	PLD	[Y1]
> +
> +	vmov.u8	alpha2, #255
> +	vzip.u8	red1, red2
> +	vzip.u8	green1, green2
> +	vzip.u8	blue1, blue2
> +
> +	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
> +	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
> +
> +	/* Y Bottom Row */
> +	vld2.u8	{y1,y2}, [Y2,:128]!
> +
> +	/* y1 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y1, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red1, red, #6
> +	vqrshrun.s16	green1, green, #6
> +	vqrshrun.s16	blue1, blue, #6
> +
> +	/* y2 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y2, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red2, red, #6
> +	vqrshrun.s16	green2, green, #6
> +	vqrshrun.s16	blue2, blue, #6
> +
> +	PLD	[Y2]
> +
> +	vmov.u8	alpha2, #255
> +	vzip.u8	red1, red2
> +	vzip.u8	green1, green2
> +	vzip.u8	blue1, blue2
> +
> +	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
> +	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
> +
> +	/* next columns (x16) */
> +	subs	COUNT,	COUNT,	#16
> +	bgt		loop_col
> +
> +	/* next rows (x2) */
> +	subs	HEIGHT,	#2
> +	add		O1,	O2,	OPAD
> +	add		Y1,	Y2,	YPAD
> +	add		U,	U,	YPAD
> +	b		loop_row
> diff --git a/modules/arm_neon/nv21_rgb.S b/modules/arm_neon/nv21_rgb.S
> new file mode 100644
> index 0000000..82a7099
> --- /dev/null
> +++ b/modules/arm_neon/nv21_rgb.S
> @@ -0,0 +1,206 @@
> + @*****************************************************************************
> + @ nv21_rgb.S : ARM NEONv1 NV21 to RGB chroma conversion
> + @*****************************************************************************
> + @ Copyright (C) 2011 Sébastien Toque
> + @                    Rémi Denis-Courmont
> + @
> + @ This program is free software; you can redistribute it and/or modify
> + @ it under the terms of the GNU General Public License as published by
> + @ the Free Software Foundation; either version 2 of the License, or
> + @ (at your option) any later version.
> + @
> + @ This program is distributed in the hope that it will be useful,
> + @ but WITHOUT ANY WARRANTY; without even the implied warranty of
> + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + @ GNU General Public License for more details.
> + @
> + @ You should have received a copy of the GNU General Public License
> + @ along with this program; if not, write to the Free Software Foundation,
> + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + @****************************************************************************/
> +
> +	.fpu neon
> +	.text
> +
> +/* ARM */
> +#define O1	r0
> +#define O2	r1
> +#define WIDTH	r2
> +#define HEIGHT	r3
> +#define Y1	r4
> +#define Y2	r5
> +#define U	r6
> +#define V	r7
> +#define YPITCH	r8
> +#define OPAD	r10
> +#define YPAD	r11
> +#define COUNT	ip
> +#define OPITCH	lr
> +
> +/* NEON */
> +#define coefY	D0
> +#define coefRV	D1
> +#define coefGU	D2
> +#define coefGV	D3
> +#define coefBU	D4
> +#define Rc	Q3
> +#define Gc	Q4
> +#define Bc	Q5
> +
> +#define u	D24
> +#define v	D25
> +#define y1	D28
> +#define y2	D29
> +
> +#define chro_r	Q6
> +#define chro_g	Q7
> +#define chro_b	Q8
> +#define red		Q9
> +#define green	Q10
> +#define blue	Q11
> +#define lumi	Q15
> +
> +#define red1	D24
> +#define green1	D25
> +#define blue1	D26
> +#define alpha1	D27
> +#define red2	D28
> +#define green2	D29
> +#define blue2	D30
> +#define alpha2	D31
> +
> +coefficients:
> +    .short  -15872
> +    .short    4992
> +    .short  -18432
> +
> +	.align
> +	.global nv21_rgb_neon
> +	.type	nv21_rgb_neon, %function
> +nv21_rgb_neon:
> +	push		{r4-r8,r10-r11,lr}
> +	vpush		{q4-q7}
> +
> +	/* load arguments */
> +	ldmia		r0,	{O1, OPITCH}
> +	ldmia		r1,	{Y1, U, V, YPITCH}
> +
> +	/* round the width to be a multiple of 16 */
> +	ands		OPAD, WIDTH, #15
> +	sub			WIDTH, WIDTH, OPAD
> +	addne		WIDTH, WIDTH, #16
> +
> +	/* init constants (scale value by 64) */
> +	vmov.u8		coefY, #74
> +	vmov.u8		coefRV, #115
> +	vmov.u8		coefGU, #14
> +	vmov.u8		coefGV, #34
> +	vmov.u8		coefBU, #135
> +	adr			OPAD, coefficients
> +	vld1.s16	{d6[], d7[]}, [OPAD]!
> +	vld1.s16	{d8[], d9[]}, [OPAD]!
> +	vld1.s16	{d10[], d11[]}, [OPAD]!
> +	vmov.u8		alpha1, #255
> +
> +	/* init padding */
> +	cmp			HEIGHT,	#0
> +	sub			OPAD,	OPITCH,	WIDTH, lsl #2
> +	sub			YPAD,	YPITCH,	WIDTH
> +
> +loop_row:
> +	movgts	COUNT,	WIDTH
> +	add		O2,	O1,	OPITCH
> +	add		Y2,	Y1,	YPITCH
> +	/* exit if all rows have been processed */
> +	vpople	{q4-q7}
> +	pople	{r4-r8,r10-r11,pc}
> +
> +loop_col:
> +
> +	/* Common U & V */
> +
> +	vld2.u8	{u,v}, [U,:128]!
> +
> +	vmull.u8	chro_r, u, coefRV
> +	vmull.u8	chro_g, v, coefGU
> +	vmlal.u8	chro_g, u, coefGV
> +	vmull.u8	chro_b, v, coefBU
> +
> +	vadd.s16	chro_r, Rc, chro_r
> +	vsub.s16	chro_g, Gc, chro_g
> +	vadd.s16	chro_b, Bc, chro_b
> +
> +	PLD	[U]
> +
> +	/* Y Top Row */
> +	vld2.u8	{y1,y2}, [Y1,:128]!
> +
> +	/* y1 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y1, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red1, red, #6
> +	vqrshrun.s16	green1, green, #6
> +	vqrshrun.s16	blue1, blue, #6
> +
> +	/* y2 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y2, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red2, red, #6
> +	vqrshrun.s16	green2, green, #6
> +	vqrshrun.s16	blue2, blue, #6
> +
> +	PLD	[Y1]
> +
> +	vmov.u8	alpha2, #255
> +	vzip.u8	red1, red2
> +	vzip.u8	green1, green2
> +	vzip.u8	blue1, blue2
> +
> +	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
> +	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
> +
> +	/* Y Bottom Row */
> +	vld2.u8	{y1,y2}, [Y2,:128]!
> +
> +	/* y1 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y1, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red1, red, #6
> +	vqrshrun.s16	green1, green, #6
> +	vqrshrun.s16	blue1, blue, #6
> +
> +	/* y2 : chrominance + luminance, then clamp (divide by 64) */
> +	vmull.u8	lumi, y2, coefY
> +	vqadd.s16	red, lumi, chro_r
> +	vqadd.s16	green, lumi, chro_g
> +	vqadd.s16	blue, lumi, chro_b
> +	vqrshrun.s16	red2, red, #6
> +	vqrshrun.s16	green2, green, #6
> +	vqrshrun.s16	blue2, blue, #6
> +
> +	PLD	[Y2]
> +
> +	vmov.u8	alpha2, #255
> +	vzip.u8	red1, red2
> +	vzip.u8	green1, green2
> +	vzip.u8	blue1, blue2
> +
> +	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
> +	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
> +
> +	/* next columns (x16) */
> +	subs	COUNT,	COUNT,	#16
> +	bgt		loop_col
> +
> +	/* next rows (x2) */
> +	subs	HEIGHT,	#2
> +	add		O1,	O2,	OPAD
> +	add		Y1,	Y2,	YPAD
> +	add		U,	U,	YPAD
> +	b		loop_row
> diff --git a/modules/arm_neon/yuv_rgb.c b/modules/arm_neon/yuv_rgb.c
> new file mode 100644
> index 0000000..ede49c9
> --- /dev/null
> +++ b/modules/arm_neon/yuv_rgb.c
> @@ -0,0 +1,179 @@
> +/*****************************************************************************
> + * yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
> + *****************************************************************************
> + * Copyright (C) 2011 Sébastien Toque
> + *                    Rémi Denis-Courmont
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
> + *****************************************************************************/
> +
> +#ifdef HAVE_CONFIG_H
> +# include "config.h"
> +#endif
> +
> +#include <vlc_common.h>
> +#include <vlc_plugin.h>
> +#include <vlc_filter.h>
> +#include <vlc_cpu.h>
> +#include "chroma_neon.h"
> +
> +static int Open (vlc_object_t *);
> +
> +vlc_module_begin ()
> +    set_description (N_("ARM NEON video chroma YUV->RGBA"))
> +    set_capability ("video filter2", 250)
> +    set_callbacks (Open, NULL)
> +vlc_module_end ()
> +
> +/*
> +static int CoefY[256];
> +static int CoefRV[256];
> +static int CoefGU[256];
> +static int CoefGV[256];
> +static int CoefBU[256];
> +
> +// C reference version of the converter
> +static void I420_RGBA_C (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    const uint8_t *const out = dst->p->p_pixels;
> +    const size_t width = src->p[Y_PLANE].i_visible_pitch;
> +    const size_t height = src->p[Y_PLANE].i_visible_lines;
> +
> +    const int ypitch = src->p[Y_PLANE].i_pitch;
> +    const int uvpitch = src->p[U_PLANE].i_pitch;
> +    const int dpitch = dst->p->i_pitch / dst->p->i_pixel_pitch;
> +
> +    for (size_t j = 0; j <  height; ++j)
> +    {
> +        const int y = j * ypitch;
> +        const int u = (j>>1) * uvpitch;
> +        const int d = j * dpitch;
> +
> +        for (size_t i = 0; i < width; ++i)
> +        {
> +            uint8_t Y = src->Y_PIXELS[y + i];
> +            uint8_t U = src->U_PIXELS[u + (i>>1)];
> +            uint8_t V = src->V_PIXELS[u + (i>>1)];
> +
> +            //coef = float * Precision + .5 (Precision=32768)
> +            int R = CoefY[Y] + CoefRV[V];
> +            int G = CoefY[Y] + CoefGU[U] + CoefGV[V];
> +            int B = CoefY[Y] + CoefBU[U];
> +
> +            //rgb = (rgb+Precision/2) / Precision (Precision=32768)
> +            R = R >> 15;
> +            G = G >> 15;
> +            B = B >> 15;
> +
> +            if (unlikely(R < 0)) R = 0;
> +            if (unlikely(G < 0)) G = 0;
> +            if (unlikely(B < 0)) B = 0;
> +            if (unlikely(R > 255)) R = 255;
> +            if (unlikely(G > 255)) G = 255;
> +            if (unlikely(B > 255)) B = 255;
> +
> +            ((uint32_t*)out)[d + i] = R | (G<<8) | (B<<16) | (0xff<<24);
> +        }
> +    }
> +}*/
> +
> +static void I420_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
> +    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS,
> src->Y_PITCH }; +    i420_rgb_neon (&out, &in,
> filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); +}
> +static void YV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
> +    struct yuv_planes in = { src->Y_PIXELS, src->V_PIXELS, src->U_PIXELS,
> src->Y_PITCH }; +    i420_rgb_neon (&out, &in,
> filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); +}
> +
> +static void NV21_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
> +    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS,
> src->Y_PITCH }; +    nv21_rgb_neon (&out, &in,
> filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); +}
> +
> +static void NV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
> +{
> +    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
> +    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS,
> src->Y_PITCH }; +    nv12_rgb_neon (&out, &in,
> filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); +}
> +
> +VIDEO_FILTER_WRAPPER (I420_RGBA)
> +VIDEO_FILTER_WRAPPER (YV12_RGBA)
> +VIDEO_FILTER_WRAPPER (NV21_RGBA)
> +VIDEO_FILTER_WRAPPER (NV12_RGBA)
> +
> +static int Open (vlc_object_t *obj)
> +{
> +    filter_t *filter = (filter_t *)obj;
> +
> +    if (((filter->fmt_in.video.i_width | filter->fmt_in.video.i_height) &
> 1) +     || (filter->fmt_in.video.i_width !=
> filter->fmt_out.video.i_width) +     || (filter->fmt_in.video.i_height !=
> filter->fmt_out.video.i_height)) +        return VLC_EGENERIC;
> +
> +    switch (filter->fmt_out.video.i_chroma)
> +    {
> +        case VLC_CODEC_RGB32:
> +            switch (filter->fmt_in.video.i_chroma)
> +            {
> +                case VLC_CODEC_I420:
> +                    filter->pf_video_filter = I420_RGBA_Filter;
> +                    break;
> +                case VLC_CODEC_YV12:
> +                    filter->pf_video_filter = YV12_RGBA_Filter;
> +                    break;
> +                case VLC_CODEC_NV21:
> +                    filter->pf_video_filter = NV21_RGBA_Filter;
> +                    break;
> +                case VLC_CODEC_NV12:
> +                    filter->pf_video_filter = NV12_RGBA_Filter;
> +                    break;
> +                default:
> +                    return VLC_EGENERIC;
> +            }
> +            break;
> +
> +        default:
> +            return VLC_EGENERIC;
> +    }
> +
> +    //precompute some values for the C version
> +    /*const int coefY  = (int)(1.164 * 32768 + 0.5);
> +    const int coefRV = (int)(1.793 * 32768 + 0.5);
> +    const int coefGU = (int)(0.213 * 32768 + 0.5);
> +    const int coefGV = (int)(0.533 * 32768 + 0.5);
> +    const int coefBU = (int)(2.113 * 32768 + 0.5);
> +    for (int i=0; i<256; ++i)
> +    {
> +        CoefY[i] = coefY * (i-16) + 16384;
> +        CoefRV[i] = coefRV*(i-128);
> +        CoefGU[i] = -coefGU*(i-128);
> +        CoefGV[i] = -coefGV*(i-128);
> +        CoefBU[i] = coefBU*(i-128);
> +    }*/
> +
> +    msg_Dbg(filter, "%4.4s(%dx%d) to %4.4s(%dx%d)",
> +            (char*)&filter->fmt_in.video.i_chroma,
> filter->fmt_in.video.i_width, filter->fmt_in.video.i_height, +           
> (char*)&filter->fmt_out.video.i_chroma, filter->fmt_out.video.i_width,
> filter->fmt_out.video.i_height); +
> +    return VLC_SUCCESS;
> +}
-- 
Rémi Denis-Courmont
http://www.remlab.info