[vlc-commits] Add i420->rv16 neon converter

Sun Mar 10 17:26:48 CET 2013

vlc | branch: master | Sébastien Toque <xilasz at gmail.com> | Tue Mar  5 21:20:39 2013 +0100| [21a9fec8a4177d63eb297d32bd5e080593afb68b] | committer: Jean-Baptiste Kempf

Add i420->rv16 neon converter

Signed-off-by: Jean-Baptiste Kempf <jb at videolan.org>

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=21a9fec8a4177d63eb297d32bd5e080593afb68b
---

 modules/arm_neon/Modules.am    |    1 +
 modules/arm_neon/chroma_neon.h |    5 +
 modules/arm_neon/i420_rv16.S   |  227 ++++++++++++++++++++++++++++++++++++++++
 modules/arm_neon/yuv_rgb.c     |   20 ++++
 4 files changed, 253 insertions(+)

diff --git a/modules/arm_neon/Modules.am b/modules/arm_neon/Modules.am
index 3106485..decb3b8 100644
--- a/modules/arm_neon/Modules.am
+++ b/modules/arm_neon/Modules.am
@@ -21,6 +21,7 @@ libvolume_neon_plugin_la_LIBADD = $(AM_LIBADD)
 
 libyuv_rgb_neon_plugin_la_SOURCES = \
 	i420_rgb.S \
+	i420_rv16.S \
 	nv21_rgb.S \
 	nv12_rgb.S \
 	yuv_rgb.c
diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
index 708d121..865315a 100644
--- a/modules/arm_neon/chroma_neon.h
+++ b/modules/arm_neon/chroma_neon.h
@@ -72,6 +72,11 @@ void i420_rgb_neon (struct yuv_pack *const out,
                     const struct yuv_planes *const in,
                     int width, int height) asm("i420_rgb_neon");
 
+/* I420 to RV16 conversion; the asm() label binds calls to the i420_rv16_neon symbol. */
+void i420_rv16_neon (struct yuv_pack *const out,
+                     const struct yuv_planes *const in,
+                     int width, int height) asm("i420_rv16_neon");
+
 /* NV21 to RGBA conversion. */
 void nv21_rgb_neon (struct yuv_pack *const out,
                     const struct yuv_planes *const in,
diff --git a/modules/arm_neon/i420_rv16.S b/modules/arm_neon/i420_rv16.S
new file mode 100644
index 0000000..cd6d269
--- /dev/null
+++ b/modules/arm_neon/i420_rv16.S
@@ -0,0 +1,227 @@
+ @*****************************************************************************
+ @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify it
+ @ under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.syntax unified
+	.fpu neon
+	.text
+
+/* ARM core registers (r0/r1 arrive as the out/in struct pointers and are reused as row pointers) */
+#define O1	r0	/* output pointer, top row */
+#define O2	r1	/* output pointer, bottom row */
+#define WIDTH	r2	/* width, rounded up to a multiple of 16 */
+#define HEIGHT	r3	/* rows left to process */
+#define Y1	r4	/* luma pointer, top row */
+#define Y2	r5	/* luma pointer, bottom row */
+#define U	r6
+#define V	r7
+#define YPITCH	r8	/* luma pitch, fourth word of the input struct */
+#define OPAD	r10	/* OPITCH - 2*WIDTH; also used as scratch during setup */
+#define YPAD	r11	/* YPITCH - WIDTH */
+#define COUNT	ip	/* columns left in the current row pair */
+#define OPITCH	lr	/* output pitch, second word of the output struct */
+
+/* NEON registers. Qn overlays D(2n) and D(2n+1), so several names below share storage. */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3	/* Rc, Gc, Bc: constant terms broadcast from the coefficients table */
+#define Gc	Q4
+#define Bc	Q5
+
+#define u	D24
+#define v	D25
+#define y1	D18	/* y1/y2 are the two halves of Q9 (red16_1) */
+#define y2	D19
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define lumi1	Q15	/* overlays D30/D31 (green2, blue2) */
+#define lumi2	Q10	/* same register as green16_1 */
+#define red16_1		Q9
+#define green16_1	Q10
+#define blue16_1	Q11
+#define red16_2		Q12
+#define green16_2	Q13
+#define blue16_2	Q14
+
+#define red1	D25	/* same register as v and out1h */
+#define green1	D26
+#define blue1	D27
+#define red2	D29	/* same register as out2h */
+#define green2	D30
+#define blue2	D31
+
+#define out1l	D24	/* same register as u */
+#define out1h	D25
+#define out2l	D28
+#define out2h	D29
+
+coefficients:	/* constant terms, read in order by the three vld1.s16 loads in i420_rv16_neon */
+    .short  -15872	/* broadcast into Rc */
+    .short    4992	/* broadcast into Gc */
+    .short  -18432	/* broadcast into Bc */
+
+	.align 2
+	.global i420_rv16_neon
+	.type	i420_rv16_neon, %function
+i420_rv16_neon:	/* r0 = out, r1 = in, r2 = width, r3 = height; each loop_col pass handles 2 rows x 16 columns */
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments: out -> {O1, OPITCH}, in -> {Y1, U, V, YPITCH} */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width up to a multiple of 16 (OPAD is only scratch here) */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64); Rc, Gc, Bc are broadcast from the coefficients table */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+
+	/* init padding; cmp sets the flags tested by the first movsgt in loop_row */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #1
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movsgt	COUNT,	WIDTH	/* runs only while rows remain; le afterwards: no rows left or zero width */
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	vld1.u8	{u}, [U,:64]!
+	vld1.u8	{v}, [V,:64]!
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	vmull.u8	Q14, v, coefRV
+	vmull.u8	Q11, u, coefGU
+	vmull.u8	Q13, u, coefBU
+	vmlal.u8	Q11, v, coefGV
+
+	vmull.u8	lumi2, y2, coefY
+	vmull.u8	lumi1, y1, coefY
+	vadd.s16	chro_r, Rc, Q14
+	vadd.s16	chro_b, Bc, Q13
+	vsub.s16	chro_g, Gc, Q11
+
+	pld	[U]
+	pld	[V]
+
+	/* chrominance + luminance */
+	vqadd.s16	red16_2, lumi2, chro_r
+	vqadd.s16	green16_2, lumi2, chro_g
+	vqadd.s16	blue16_2, lumi2, chro_b
+	vqadd.s16	red16_1, lumi1, chro_r
+	vqadd.s16	green16_1, lumi1, chro_g
+	vqadd.s16	blue16_1, lumi1, chro_b
+
+	/* clamp (divide by 64) */
+	vqrshrun.s16	green2, green16_2, #6
+	vqrshrun.s16	blue2, blue16_2, #6
+	vqrshrun.s16	red2, red16_2, #6
+	vqrshrun.s16	green1, green16_1, #6
+	vqrshrun.s16	red1, red16_1, #6
+	vqrshrun.s16	blue1, blue16_1, #6
+
+	pld	[Y1]
+
+	/* pack into RGB565 */
+	vshl.u8	out2l, green2, #3 // low 2a
+	vsri.u8	out2h, green2, #5 // high 2
+	vshl.u8	out1l, green1, #3 // low 1a
+	vsri.u8	out1h, green1, #5 // high 1
+	vsri.u8	out2l, blue2, #3 // low 2b
+	vsri.u8	out1l, blue1, #3 // low 1b
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* Top Row output */
+	vzip.u8	out1h, out2h
+	vmull.u8	lumi2, y2, coefY
+	vzip.u8	out1l, out2l
+	vmull.u8	lumi1, y1, coefY
+	vst2.u8	{out1l, out1h}, [O1,:128]!
+	vst2.u8	{out2l, out2h}, [O1,:128]!
+
+	/* chrominance + luminance */
+	vqadd.s16	green16_2, lumi2, chro_g
+	vqadd.s16	red16_2, lumi2, chro_r
+	vqadd.s16	blue16_2, lumi2, chro_b
+	vqadd.s16	red16_1, lumi1, chro_r
+	vqadd.s16	green16_1, lumi1, chro_g
+	vqadd.s16	blue16_1, lumi1, chro_b
+
+	/* clamp (divide by 64) */
+	vqrshrun.s16	green2, green16_2, #6
+	vqrshrun.s16	blue2, blue16_2, #6
+	vqrshrun.s16	red2, red16_2, #6
+	vqrshrun.s16	green1, green16_1, #6
+	vqrshrun.s16	red1, red16_1, #6
+	vqrshrun.s16	blue1, blue16_1, #6
+
+	pld	[Y2]	/* this half reads the bottom row through Y2 */
+
+	/* pack into RGB565 */
+	vshl.u8	out2l, green2, #3 // low 2a
+	vsri.u8	out2h, green2, #5 // high 2
+	vshl.u8	out1l, green1, #3 // low 1a
+	vsri.u8	out1h, green1, #5 // high 1
+	vsri.u8	out2l, blue2, #3 // low 2b
+	vsri.u8	out1l, blue1, #3 // low 1b
+
+	vzip.u8	out1h, out2h
+	vzip.u8	out1l, out2l
+	vst2.u8	{out1l, out1h}, [O2,:128]!
+	vst2.u8	{out2l, out2h}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2); the flags from subs are tested by movsgt/vpople/pople in loop_row */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD,	lsr #1	/* chroma rows skip half the luma padding */
+	add		V,	V,	YPAD,	lsr #1
+	b		loop_row
diff --git a/modules/arm_neon/yuv_rgb.c b/modules/arm_neon/yuv_rgb.c
index 0fb29a2..d28a27e 100644
--- a/modules/arm_neon/yuv_rgb.c
+++ b/modules/arm_neon/yuv_rgb.c
@@ -95,6 +95,14 @@ static void I420_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
     struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH };
     i420_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);
 }
+
+static void I420_RV16 (filter_t *filter, picture_t *src, picture_t *dst)
+{   /* Converts one I420 picture to RV16 by handing both buffers to the NEON routine i420_rv16_neon. */
+    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };  /* destination pixels and pitch */
+    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH };  /* Y, U, V pointers and the Y pitch */
+    i420_rv16_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);  /* size comes from the input format */
+}
+
 static void YV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
 {
     struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
@@ -117,6 +125,7 @@ static void NV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
 }
 
 VIDEO_FILTER_WRAPPER (I420_RGBA)
+VIDEO_FILTER_WRAPPER (I420_RV16)
 VIDEO_FILTER_WRAPPER (YV12_RGBA)
 VIDEO_FILTER_WRAPPER (NV21_RGBA)
 VIDEO_FILTER_WRAPPER (NV12_RGBA)
@@ -135,6 +144,17 @@ static int Open (vlc_object_t *obj)
 
     switch (filter->fmt_out.video.i_chroma)
     {
+        case VLC_CODEC_RGB16:
+            switch (filter->fmt_in.video.i_chroma)
+            {
+                case VLC_CODEC_I420:
+                    filter->pf_video_filter = I420_RV16_Filter;
+                    break;
+                default:
+                    return VLC_EGENERIC;
+            }
+            break;
+
         case VLC_CODEC_RGB32:
             if(        filter->fmt_out.video.i_rmask != 0x000000ff
                     || filter->fmt_out.video.i_gmask != 0x0000ff00