[vlc-devel] [PATCH] YUV->RGB convertor in NEON
Sébastien Toque
xilasz at gmail.com
Thu Jan 19 22:33:18 CET 2012
Signed-off-by: Rafaël Carré <funman at videolan.org>
---
modules/arm_neon/Modules.am | 10 ++
modules/arm_neon/chroma_neon.h | 12 +++
modules/arm_neon/i420_rgb.S | 209 ++++++++++++++++++++++++++++++++++++++++
modules/arm_neon/nv12_rgb.S | 206 +++++++++++++++++++++++++++++++++++++++
modules/arm_neon/nv21_rgb.S | 206 +++++++++++++++++++++++++++++++++++++++
modules/arm_neon/yuv_rgb.c | 179 ++++++++++++++++++++++++++++++++++
6 files changed, 822 insertions(+), 0 deletions(-)
create mode 100644 modules/arm_neon/i420_rgb.S
create mode 100644 modules/arm_neon/nv12_rgb.S
create mode 100644 modules/arm_neon/nv21_rgb.S
create mode 100644 modules/arm_neon/yuv_rgb.c
diff --git a/modules/arm_neon/Modules.am b/modules/arm_neon/Modules.am
index 83576eb..9dfd4ab 100644
--- a/modules/arm_neon/Modules.am
+++ b/modules/arm_neon/Modules.am
@@ -18,7 +18,17 @@ libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
libchroma_yuv_neon_plugin_la_LIBADD = $(AM_LIBADD)
libchroma_yuv_neon_plugin_la_DEPENDENCIES =
+libyuv_rgb_neon_plugin_la_SOURCES = \
+ i420_rgb.S \
+ nv21_rgb.S \
+ nv12_rgb.S \
+ yuv_rgb.c
+libyuv_rgb_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
+libyuv_rgb_neon_plugin_la_LIBADD = $(AM_LIBADD)
+libyuv_rgb_neon_plugin_la_DEPENDENCIES =
+
libvlc_LTLIBRARIES += \
libaudio_format_neon_plugin.la \
libchroma_yuv_neon_plugin.la \
+ libyuv_rgb_neon_plugin.la \
$(NULL)
diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h
index 204c5f1..3e867e3 100644
--- a/modules/arm_neon/chroma_neon.h
+++ b/modules/arm_neon/chroma_neon.h
@@ -64,3 +64,15 @@ void yuyv_i422_neon (struct yuv_planes *const out,
/* UYVY to I422 conversion. */
void uyvy_i422_neon (struct yuv_planes *const out,
const struct yuv_pack *const in, int width, int height);
+
+/* I420 to RGBA conversion.
+ * The assembly reads four words from `in` (Y, U and V pointers, then the Y
+ * pitch) and two from `out` (destination pointer, destination pitch), and
+ * writes 4 bytes per pixel in R, G, B, A order with A = 255. The width is
+ * rounded up to a multiple of 16 inside the routine. */
+void i420_rgb_neon (struct yuv_pack *const out,
+ const struct yuv_planes *const in, int width, int height);
+
+/* NV21 to RGBA conversion.
+ * Same calling convention as i420_rgb_neon, but only the second pointer of
+ * `in` is used for chroma: it is read as an interleaved plane whose even
+ * bytes are V and odd bytes are U. The third pointer is not read. */
+void nv21_rgb_neon (struct yuv_pack *const out,
+ const struct yuv_planes *const in, int width, int height);
+
+/* NV12 to RGBA conversion.
+ * Same calling convention as i420_rgb_neon, but only the second pointer of
+ * `in` is used for chroma: it is read as an interleaved plane whose even
+ * bytes are U and odd bytes are V. The third pointer is not read. */
+void nv12_rgb_neon (struct yuv_pack *const out,
+ const struct yuv_planes *const in, int width, int height);
diff --git a/modules/arm_neon/i420_rgb.S b/modules/arm_neon/i420_rgb.S
new file mode 100644
index 0000000..cc0caf3
--- /dev/null
+++ b/modules/arm_neon/i420_rgb.S
@@ -0,0 +1,209 @@
+ @*****************************************************************************
+ @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @ Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+ .fpu neon
+ .text
+
+/* ARM */
+/* Core-register roles. O1/Y1 address the top row and O2/Y2 the bottom row of
+ * the row pair being converted; O1 and O2 reuse the argument registers r0/r1.
+ * OPAD is also used as a scratch pointer while the constants are loaded. */
+#define O1 r0
+#define O2 r1
+#define WIDTH r2
+#define HEIGHT r3
+#define Y1 r4
+#define Y2 r5
+#define U r6
+#define V r7
+#define YPITCH r8
+#define OPAD r10
+#define YPAD r11
+#define COUNT ip
+#define OPITCH lr
+
+/* NEON */
+/* coef* hold 8-bit multipliers (one per byte lane); Rc/Gc/Bc hold the 16-bit
+ * additive constants read from the `coefficients` table below. */
+#define coefY D0
+#define coefRV D1
+#define coefGU D2
+#define coefGV D3
+#define coefBU D4
+#define Rc Q3
+#define Gc Q4
+#define Bc Q5
+
+/* Inputs. u/v share D24/D25 with red1/green1 and y1/y2 share D28/D29 with
+ * red2/green2, so each input must be consumed before the overlapping output
+ * is written. */
+#define u D24
+#define v D25
+#define y1 D28
+#define y2 D29
+
+/* 16-bit intermediates. lumi (Q15) is D30:D31, i.e. it overlaps blue2 and
+ * alpha2 defined below. */
+#define chro_r Q6
+#define chro_g Q7
+#define chro_b Q8
+#define red Q9
+#define green Q10
+#define blue Q11
+#define lumi Q15
+
+/* 8-bit outputs, laid out as two groups of four consecutive D registers so
+ * that each group can be stored with a single vst4. */
+#define red1 D24
+#define green1 D25
+#define blue1 D26
+#define alpha1 D27
+#define red2 D28
+#define green2 D29
+#define blue2 D30
+#define alpha2 D31
+
+/* 16-bit constants loaded, in this order, into Rc, Gc and Bc. With the 8-bit
+ * multipliers set up in the function (74, 115, 14, 34, 135) they equal:
+ *   Rc = -(74*16 + 115*128) + 32
+ *   Gc = -(74*16) + (14 + 34)*128 + 32
+ *   Bc = -(74*16 + 135*128) + 32
+ * i.e. the -16 luma and -128 chroma offsets pre-multiplied by the
+ * coefficients, plus 32 (presumably a rounding term - TODO confirm). */
+coefficients:
+ .short -15872
+ .short 4992
+ .short -18432
+
+ .align
+ .global i420_rgb_neon
+ .type i420_rgb_neon, %function
+/*
+ * i420_rgb_neon(out, in, width, height)
+ *   r0 = out -> { destination pointer, destination pitch }
+ *   r1 = in  -> { Y pointer, U pointer, V pointer, Y pitch }
+ *   r2 = width, r3 = height
+ *
+ * Writes 4 bytes per pixel in R, G, B, A order with A = 255. Each loop_row
+ * pass converts two rows and each loop_col pass converts 16 pixels of both
+ * rows from 8 U and 8 V samples, so every chroma sample covers a 2x2 block.
+ * The width is rounded up to a multiple of 16, so up to 15 extra pixels per
+ * row are read and written.
+ */
+i420_rgb_neon:
+ /* save the core registers used below and q4-q7 */
+ push {r4-r8,r10-r11,lr}
+ vpush {q4-q7}
+
+ /* load arguments: r0 is replaced by the destination pointer it pointed to */
+ ldmia r0, {O1, OPITCH}
+ ldmia r1, {Y1, U, V, YPITCH}
+
+ /* round the width up to a multiple of 16 (OPAD is scratch here; the add
+ * only happens when the remainder was non-zero) */
+ ands OPAD, WIDTH, #15
+ sub WIDTH, WIDTH, OPAD
+ addne WIDTH, WIDTH, #16
+
+ /* init constants (scale value by 64) */
+ vmov.u8 coefY, #74
+ vmov.u8 coefRV, #115
+ vmov.u8 coefGU, #14
+ vmov.u8 coefGV, #34
+ vmov.u8 coefBU, #135
+ /* replicate one 16-bit constant into every lane of Rc (d6/d7),
+ * Gc (d8/d9) and Bc (d10/d11); OPAD is a scratch pointer here */
+ adr OPAD, coefficients
+ vld1.s16 {d6[], d7[]}, [OPAD]!
+ vld1.s16 {d8[], d9[]}, [OPAD]!
+ vld1.s16 {d10[], d11[]}, [OPAD]!
+ /* alpha1 (D27) is set once: nothing in the loops writes it */
+ vmov.u8 alpha1, #255
+
+ /* init padding: bytes to skip at the end of each output / luma row.
+ * The cmp sets the flags tested by the first pass through loop_row;
+ * the two sub instructions leave the flags untouched. */
+ cmp HEIGHT, #0
+ sub OPAD, OPITCH, WIDTH, lsl #2
+ sub YPAD, YPITCH, WIDTH
+
+loop_row:
+ /* entered with flags from "cmp HEIGHT, #0" or "subs HEIGHT, #2":
+ * if rows remain, reload the column counter and set flags from WIDTH */
+ movgts COUNT, WIDTH
+ /* second-row pointers (these adds do not change the flags) */
+ add O2, O1, OPITCH
+ add Y2, Y1, YPITCH
+ /* exit if all rows have been processed (or WIDTH is not positive) */
+ vpople {q4-q7}
+ pople {r4-r8,r10-r11,pc}
+
+loop_col:
+
+ /* Common U & V */
+
+ /* 8 U and 8 V samples, shared by 16 pixels of both rows */
+ vld1.u8 {u}, [U,:64]!
+ vld1.u8 {v}, [V,:64]!
+
+ /* widen to 16 bits: 115*v, 14*u + 34*v, 135*u */
+ vmull.u8 chro_r, v, coefRV
+ vmull.u8 chro_g, u, coefGU
+ vmlal.u8 chro_g, v, coefGV
+ vmull.u8 chro_b, u, coefBU
+
+ /* add the constants; the green chroma term is subtracted */
+ vadd.s16 chro_r, Rc, chro_r
+ vsub.s16 chro_g, Gc, chro_g
+ vadd.s16 chro_b, Bc, chro_b
+
+ PLD [U]
+ PLD [V]
+
+ /* Y Top Row */
+ /* de-interleave 16 luma bytes: y1 = even pixels, y2 = odd pixels, so
+ * lane i of both lines up with chroma lane i */
+ vld2.u8 {y1,y2}, [Y1,:128]!
+
+ /* y1 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y1, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red1, red, #6
+ vqrshrun.s16 green1, green, #6
+ vqrshrun.s16 blue1, blue, #6
+
+ /* y2 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y2, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red2, red, #6
+ vqrshrun.s16 green2, green, #6
+ vqrshrun.s16 blue2, blue, #6
+
+ PLD [Y1]
+
+ /* alpha2 (D31) is the upper half of lumi, so it is reloaded each time */
+ vmov.u8 alpha2, #255
+ /* re-interleave even/odd pixels: *1 = pixels 0-7, *2 = pixels 8-15 */
+ vzip.u8 red1, red2
+ vzip.u8 green1, green2
+ vzip.u8 blue1, blue2
+
+ /* store 2 x 8 pixels as interleaved R, G, B, A bytes */
+ vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
+ vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
+
+ /* Y Bottom Row */
+ /* same conversion with the chroma terms computed above */
+ vld2.u8 {y1,y2}, [Y2,:128]!
+
+ /* y1 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y1, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red1, red, #6
+ vqrshrun.s16 green1, green, #6
+ vqrshrun.s16 blue1, blue, #6
+
+ /* y2 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y2, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red2, red, #6
+ vqrshrun.s16 green2, green, #6
+ vqrshrun.s16 blue2, blue, #6
+
+ PLD [Y2]
+
+ vmov.u8 alpha2, #255
+ vzip.u8 red1, red2
+ vzip.u8 green1, green2
+ vzip.u8 blue1, blue2
+
+ vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
+ vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
+
+ /* next columns (x16) */
+ subs COUNT, COUNT, #16
+ bgt loop_col
+
+ /* next rows (x2): the subs sets the flags tested at loop_row; the adds
+ * below do not change them. O2/Y2 now point at the end of the bottom
+ * row, so adding the padding gives the start of the next row pair.
+ * U and V advanced WIDTH/2 bytes in loop_col; adding YPAD/2 assumes the
+ * chroma pitch is half the luma pitch. */
+ subs HEIGHT, #2
+ add O1, O2, OPAD
+ add Y1, Y2, YPAD
+ add U, U, YPAD, lsr #1
+ add V, V, YPAD, lsr #1
+ b loop_row
diff --git a/modules/arm_neon/nv12_rgb.S b/modules/arm_neon/nv12_rgb.S
new file mode 100644
index 0000000..f4bb510
--- /dev/null
+++ b/modules/arm_neon/nv12_rgb.S
@@ -0,0 +1,206 @@
+ @*****************************************************************************
+ @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @ Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+ .fpu neon
+ .text
+
+/* ARM */
+/* Core-register roles. O1/Y1 address the top row and O2/Y2 the bottom row of
+ * the row pair being converted; O1 and O2 reuse the argument registers r0/r1.
+ * U addresses the interleaved chroma plane; V is loaded but not used by the
+ * code in this file. OPAD doubles as a scratch pointer during setup. */
+#define O1 r0
+#define O2 r1
+#define WIDTH r2
+#define HEIGHT r3
+#define Y1 r4
+#define Y2 r5
+#define U r6
+#define V r7
+#define YPITCH r8
+#define OPAD r10
+#define YPAD r11
+#define COUNT ip
+#define OPITCH lr
+
+/* NEON */
+/* coef* hold 8-bit multipliers (one per byte lane); Rc/Gc/Bc hold the 16-bit
+ * additive constants read from the `coefficients` table below. */
+#define coefY D0
+#define coefRV D1
+#define coefGU D2
+#define coefGV D3
+#define coefBU D4
+#define Rc Q3
+#define Gc Q4
+#define Bc Q5
+
+/* Inputs. u/v share D24/D25 with red1/green1 and y1/y2 share D28/D29 with
+ * red2/green2, so each input must be consumed before the overlapping output
+ * is written. */
+#define u D24
+#define v D25
+#define y1 D28
+#define y2 D29
+
+/* 16-bit intermediates. lumi (Q15) is D30:D31, i.e. it overlaps blue2 and
+ * alpha2 defined below. */
+#define chro_r Q6
+#define chro_g Q7
+#define chro_b Q8
+#define red Q9
+#define green Q10
+#define blue Q11
+#define lumi Q15
+
+/* 8-bit outputs, laid out as two groups of four consecutive D registers so
+ * that each group can be stored with a single vst4. */
+#define red1 D24
+#define green1 D25
+#define blue1 D26
+#define alpha1 D27
+#define red2 D28
+#define green2 D29
+#define blue2 D30
+#define alpha2 D31
+
+/* 16-bit constants loaded, in this order, into Rc, Gc and Bc. With the 8-bit
+ * multipliers set up in the function (74, 115, 14, 34, 135) they equal:
+ *   Rc = -(74*16 + 115*128) + 32
+ *   Gc = -(74*16) + (14 + 34)*128 + 32
+ *   Bc = -(74*16 + 135*128) + 32
+ * i.e. the -16 luma and -128 chroma offsets pre-multiplied by the
+ * coefficients, plus 32 (presumably a rounding term - TODO confirm). */
+coefficients:
+ .short -15872
+ .short 4992
+ .short -18432
+
+ .align
+ .global nv12_rgb_neon
+ .type nv12_rgb_neon, %function
+/*
+ * nv12_rgb_neon(out, in, width, height)
+ *   r0 = out -> { destination pointer, destination pitch }
+ *   r1 = in  -> { Y pointer, chroma pointer, (unused pointer), Y pitch }
+ *   r2 = width, r3 = height
+ *
+ * Writes 4 bytes per pixel in R, G, B, A order with A = 255. The chroma
+ * plane is read as interleaved bytes, even bytes as U and odd bytes as V.
+ * Each loop_row pass converts two rows and each loop_col pass converts 16
+ * pixels of both rows from 8 U/V pairs. The width is rounded up to a
+ * multiple of 16, so up to 15 extra pixels per row are read and written.
+ */
+nv12_rgb_neon:
+ /* save the core registers used below and q4-q7 */
+ push {r4-r8,r10-r11,lr}
+ vpush {q4-q7}
+
+ /* load arguments: r0 is replaced by the destination pointer it pointed
+ * to; V (r7) is loaded but never used afterwards */
+ ldmia r0, {O1, OPITCH}
+ ldmia r1, {Y1, U, V, YPITCH}
+
+ /* round the width up to a multiple of 16 (OPAD is scratch here; the add
+ * only happens when the remainder was non-zero) */
+ ands OPAD, WIDTH, #15
+ sub WIDTH, WIDTH, OPAD
+ addne WIDTH, WIDTH, #16
+
+ /* init constants (scale value by 64) */
+ vmov.u8 coefY, #74
+ vmov.u8 coefRV, #115
+ vmov.u8 coefGU, #14
+ vmov.u8 coefGV, #34
+ vmov.u8 coefBU, #135
+ /* replicate one 16-bit constant into every lane of Rc (d6/d7),
+ * Gc (d8/d9) and Bc (d10/d11); OPAD is a scratch pointer here */
+ adr OPAD, coefficients
+ vld1.s16 {d6[], d7[]}, [OPAD]!
+ vld1.s16 {d8[], d9[]}, [OPAD]!
+ vld1.s16 {d10[], d11[]}, [OPAD]!
+ /* alpha1 (D27) is set once: nothing in the loops writes it */
+ vmov.u8 alpha1, #255
+
+ /* init padding: bytes to skip at the end of each output / luma row.
+ * The cmp sets the flags tested by the first pass through loop_row;
+ * the two sub instructions leave the flags untouched. */
+ cmp HEIGHT, #0
+ sub OPAD, OPITCH, WIDTH, lsl #2
+ sub YPAD, YPITCH, WIDTH
+
+loop_row:
+ /* entered with flags from "cmp HEIGHT, #0" or "subs HEIGHT, #2":
+ * if rows remain, reload the column counter and set flags from WIDTH */
+ movgts COUNT, WIDTH
+ /* second-row pointers (these adds do not change the flags) */
+ add O2, O1, OPITCH
+ add Y2, Y1, YPITCH
+ /* exit if all rows have been processed (or WIDTH is not positive) */
+ vpople {q4-q7}
+ pople {r4-r8,r10-r11,pc}
+
+loop_col:
+
+ /* Common U & V */
+
+ /* de-interleave 16 chroma bytes: u = even bytes, v = odd bytes */
+ vld2.u8 {u,v}, [U,:128]!
+
+ /* widen to 16 bits: 115*v, 14*u + 34*v, 135*u */
+ vmull.u8 chro_r, v, coefRV
+ vmull.u8 chro_g, u, coefGU
+ vmlal.u8 chro_g, v, coefGV
+ vmull.u8 chro_b, u, coefBU
+
+ /* add the constants; the green chroma term is subtracted */
+ vadd.s16 chro_r, Rc, chro_r
+ vsub.s16 chro_g, Gc, chro_g
+ vadd.s16 chro_b, Bc, chro_b
+
+ PLD [U]
+
+ /* Y Top Row */
+ /* de-interleave 16 luma bytes: y1 = even pixels, y2 = odd pixels, so
+ * lane i of both lines up with chroma lane i */
+ vld2.u8 {y1,y2}, [Y1,:128]!
+
+ /* y1 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y1, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red1, red, #6
+ vqrshrun.s16 green1, green, #6
+ vqrshrun.s16 blue1, blue, #6
+
+ /* y2 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y2, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red2, red, #6
+ vqrshrun.s16 green2, green, #6
+ vqrshrun.s16 blue2, blue, #6
+
+ PLD [Y1]
+
+ /* alpha2 (D31) is the upper half of lumi, so it is reloaded each time */
+ vmov.u8 alpha2, #255
+ /* re-interleave even/odd pixels: *1 = pixels 0-7, *2 = pixels 8-15 */
+ vzip.u8 red1, red2
+ vzip.u8 green1, green2
+ vzip.u8 blue1, blue2
+
+ /* store 2 x 8 pixels as interleaved R, G, B, A bytes */
+ vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
+ vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
+
+ /* Y Bottom Row */
+ /* same conversion with the chroma terms computed above */
+ vld2.u8 {y1,y2}, [Y2,:128]!
+
+ /* y1 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y1, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red1, red, #6
+ vqrshrun.s16 green1, green, #6
+ vqrshrun.s16 blue1, blue, #6
+
+ /* y2 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y2, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red2, red, #6
+ vqrshrun.s16 green2, green, #6
+ vqrshrun.s16 blue2, blue, #6
+
+ PLD [Y2]
+
+ vmov.u8 alpha2, #255
+ vzip.u8 red1, red2
+ vzip.u8 green1, green2
+ vzip.u8 blue1, blue2
+
+ vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
+ vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
+
+ /* next columns (x16) */
+ subs COUNT, COUNT, #16
+ bgt loop_col
+
+ /* next rows (x2): the subs sets the flags tested at loop_row; the adds
+ * below do not change them. O2/Y2 now point at the end of the bottom
+ * row, so adding the padding gives the start of the next row pair.
+ * U advanced WIDTH bytes in loop_col; adding YPAD assumes the chroma
+ * plane has the same pitch as the luma plane. */
+ subs HEIGHT, #2
+ add O1, O2, OPAD
+ add Y1, Y2, YPAD
+ add U, U, YPAD
+ b loop_row
diff --git a/modules/arm_neon/nv21_rgb.S b/modules/arm_neon/nv21_rgb.S
new file mode 100644
index 0000000..82a7099
--- /dev/null
+++ b/modules/arm_neon/nv21_rgb.S
@@ -0,0 +1,206 @@
+ @*****************************************************************************
+ @ nv21_rgb.S : ARM NEONv1 NV21 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @ Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+ .fpu neon
+ .text
+
+/* ARM */
+/* Core-register roles. O1/Y1 address the top row and O2/Y2 the bottom row of
+ * the row pair being converted; O1 and O2 reuse the argument registers r0/r1.
+ * U addresses the interleaved chroma plane; V is loaded but not used by the
+ * code in this file. OPAD doubles as a scratch pointer during setup. */
+#define O1 r0
+#define O2 r1
+#define WIDTH r2
+#define HEIGHT r3
+#define Y1 r4
+#define Y2 r5
+#define U r6
+#define V r7
+#define YPITCH r8
+#define OPAD r10
+#define YPAD r11
+#define COUNT ip
+#define OPITCH lr
+
+/* NEON */
+/* coef* hold 8-bit multipliers (one per byte lane); Rc/Gc/Bc hold the 16-bit
+ * additive constants read from the `coefficients` table below. */
+#define coefY D0
+#define coefRV D1
+#define coefGU D2
+#define coefGV D3
+#define coefBU D4
+#define Rc Q3
+#define Gc Q4
+#define Bc Q5
+
+/* Inputs. u/v share D24/D25 with red1/green1 and y1/y2 share D28/D29 with
+ * red2/green2, so each input must be consumed before the overlapping output
+ * is written. In this file the register named u is multiplied by the V
+ * coefficients and the one named v by the U coefficients. */
+#define u D24
+#define v D25
+#define y1 D28
+#define y2 D29
+
+/* 16-bit intermediates. lumi (Q15) is D30:D31, i.e. it overlaps blue2 and
+ * alpha2 defined below. */
+#define chro_r Q6
+#define chro_g Q7
+#define chro_b Q8
+#define red Q9
+#define green Q10
+#define blue Q11
+#define lumi Q15
+
+/* 8-bit outputs, laid out as two groups of four consecutive D registers so
+ * that each group can be stored with a single vst4. */
+#define red1 D24
+#define green1 D25
+#define blue1 D26
+#define alpha1 D27
+#define red2 D28
+#define green2 D29
+#define blue2 D30
+#define alpha2 D31
+
+/* 16-bit constants loaded, in this order, into Rc, Gc and Bc. With the 8-bit
+ * multipliers set up in the function (74, 115, 14, 34, 135) they equal:
+ *   Rc = -(74*16 + 115*128) + 32
+ *   Gc = -(74*16) + (14 + 34)*128 + 32
+ *   Bc = -(74*16 + 135*128) + 32
+ * i.e. the -16 luma and -128 chroma offsets pre-multiplied by the
+ * coefficients, plus 32 (presumably a rounding term - TODO confirm). */
+coefficients:
+ .short -15872
+ .short 4992
+ .short -18432
+
+ .align
+ .global nv21_rgb_neon
+ .type nv21_rgb_neon, %function
+/*
+ * nv21_rgb_neon(out, in, width, height)
+ *   r0 = out -> { destination pointer, destination pitch }
+ *   r1 = in  -> { Y pointer, chroma pointer, (unused pointer), Y pitch }
+ *   r2 = width, r3 = height
+ *
+ * Writes 4 bytes per pixel in R, G, B, A order with A = 255. The chroma
+ * plane is read as interleaved bytes, even bytes as V and odd bytes as U.
+ * Each loop_row pass converts two rows and each loop_col pass converts 16
+ * pixels of both rows from 8 V/U pairs. The width is rounded up to a
+ * multiple of 16, so up to 15 extra pixels per row are read and written.
+ */
+nv21_rgb_neon:
+ /* save the core registers used below and q4-q7 */
+ push {r4-r8,r10-r11,lr}
+ vpush {q4-q7}
+
+ /* load arguments: r0 is replaced by the destination pointer it pointed
+ * to; V (r7) is loaded but never used afterwards */
+ ldmia r0, {O1, OPITCH}
+ ldmia r1, {Y1, U, V, YPITCH}
+
+ /* round the width up to a multiple of 16 (OPAD is scratch here; the add
+ * only happens when the remainder was non-zero) */
+ ands OPAD, WIDTH, #15
+ sub WIDTH, WIDTH, OPAD
+ addne WIDTH, WIDTH, #16
+
+ /* init constants (scale value by 64) */
+ vmov.u8 coefY, #74
+ vmov.u8 coefRV, #115
+ vmov.u8 coefGU, #14
+ vmov.u8 coefGV, #34
+ vmov.u8 coefBU, #135
+ /* replicate one 16-bit constant into every lane of Rc (d6/d7),
+ * Gc (d8/d9) and Bc (d10/d11); OPAD is a scratch pointer here */
+ adr OPAD, coefficients
+ vld1.s16 {d6[], d7[]}, [OPAD]!
+ vld1.s16 {d8[], d9[]}, [OPAD]!
+ vld1.s16 {d10[], d11[]}, [OPAD]!
+ /* alpha1 (D27) is set once: nothing in the loops writes it */
+ vmov.u8 alpha1, #255
+
+ /* init padding: bytes to skip at the end of each output / luma row.
+ * The cmp sets the flags tested by the first pass through loop_row;
+ * the two sub instructions leave the flags untouched. */
+ cmp HEIGHT, #0
+ sub OPAD, OPITCH, WIDTH, lsl #2
+ sub YPAD, YPITCH, WIDTH
+
+loop_row:
+ /* entered with flags from "cmp HEIGHT, #0" or "subs HEIGHT, #2":
+ * if rows remain, reload the column counter and set flags from WIDTH */
+ movgts COUNT, WIDTH
+ /* second-row pointers (these adds do not change the flags) */
+ add O2, O1, OPITCH
+ add Y2, Y1, YPITCH
+ /* exit if all rows have been processed (or WIDTH is not positive) */
+ vpople {q4-q7}
+ pople {r4-r8,r10-r11,pc}
+
+loop_col:
+
+ /* Common U & V */
+
+ /* de-interleave 16 chroma bytes: the register named u gets the even
+ * bytes and the one named v the odd bytes */
+ vld2.u8 {u,v}, [U,:128]!
+
+ /* roles are swapped relative to the register names: the even bytes
+ * feed the V coefficients (coefRV, coefGV) and the odd bytes feed the
+ * U coefficients (coefGU, coefBU) */
+ vmull.u8 chro_r, u, coefRV
+ vmull.u8 chro_g, v, coefGU
+ vmlal.u8 chro_g, u, coefGV
+ vmull.u8 chro_b, v, coefBU
+
+ /* add the constants; the green chroma term is subtracted */
+ vadd.s16 chro_r, Rc, chro_r
+ vsub.s16 chro_g, Gc, chro_g
+ vadd.s16 chro_b, Bc, chro_b
+
+ PLD [U]
+
+ /* Y Top Row */
+ /* de-interleave 16 luma bytes: y1 = even pixels, y2 = odd pixels, so
+ * lane i of both lines up with chroma lane i */
+ vld2.u8 {y1,y2}, [Y1,:128]!
+
+ /* y1 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y1, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red1, red, #6
+ vqrshrun.s16 green1, green, #6
+ vqrshrun.s16 blue1, blue, #6
+
+ /* y2 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y2, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red2, red, #6
+ vqrshrun.s16 green2, green, #6
+ vqrshrun.s16 blue2, blue, #6
+
+ PLD [Y1]
+
+ /* alpha2 (D31) is the upper half of lumi, so it is reloaded each time */
+ vmov.u8 alpha2, #255
+ /* re-interleave even/odd pixels: *1 = pixels 0-7, *2 = pixels 8-15 */
+ vzip.u8 red1, red2
+ vzip.u8 green1, green2
+ vzip.u8 blue1, blue2
+
+ /* store 2 x 8 pixels as interleaved R, G, B, A bytes */
+ vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
+ vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
+
+ /* Y Bottom Row */
+ /* same conversion with the chroma terms computed above */
+ vld2.u8 {y1,y2}, [Y2,:128]!
+
+ /* y1 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y1, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red1, red, #6
+ vqrshrun.s16 green1, green, #6
+ vqrshrun.s16 blue1, blue, #6
+
+ /* y2 : chrominance + luminance, then clamp (divide by 64) */
+ vmull.u8 lumi, y2, coefY
+ vqadd.s16 red, lumi, chro_r
+ vqadd.s16 green, lumi, chro_g
+ vqadd.s16 blue, lumi, chro_b
+ vqrshrun.s16 red2, red, #6
+ vqrshrun.s16 green2, green, #6
+ vqrshrun.s16 blue2, blue, #6
+
+ PLD [Y2]
+
+ vmov.u8 alpha2, #255
+ vzip.u8 red1, red2
+ vzip.u8 green1, green2
+ vzip.u8 blue1, blue2
+
+ vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
+ vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
+
+ /* next columns (x16) */
+ subs COUNT, COUNT, #16
+ bgt loop_col
+
+ /* next rows (x2): the subs sets the flags tested at loop_row; the adds
+ * below do not change them. O2/Y2 now point at the end of the bottom
+ * row, so adding the padding gives the start of the next row pair.
+ * U advanced WIDTH bytes in loop_col; adding YPAD assumes the chroma
+ * plane has the same pitch as the luma plane. */
+ subs HEIGHT, #2
+ add O1, O2, OPAD
+ add Y1, Y2, YPAD
+ add U, U, YPAD
+ b loop_row
diff --git a/modules/arm_neon/yuv_rgb.c b/modules/arm_neon/yuv_rgb.c
new file mode 100644
index 0000000..ede49c9
--- /dev/null
+++ b/modules/arm_neon/yuv_rgb.c
@@ -0,0 +1,179 @@
+/*****************************************************************************
+ * yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
+ *****************************************************************************
+ * Copyright (C) 2011 Sébastien Toque
+ * Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_plugin.h>
+#include <vlc_filter.h>
+#include <vlc_cpu.h>
+#include "chroma_neon.h"
+
+static int Open (vlc_object_t *);
+
+/* Module descriptor: declares the "video filter2" capability with the value
+ * 250 and registers Open as the only callback (the second callback slot is
+ * NULL). */
+vlc_module_begin ()
+    set_description (N_("ARM NEON video chroma YUV->RGBA"))
+    set_capability ("video filter2", 250)
+    set_callbacks (Open, NULL)
+vlc_module_end ()
+
+/*
+static int CoefY[256];
+static int CoefRV[256];
+static int CoefGU[256];
+static int CoefGV[256];
+static int CoefBU[256];
+
+// C reference version of the converter
+static void I420_RGBA_C (filter_t *filter, picture_t *src, picture_t *dst)
+{
+ const uint8_t *const out = dst->p->p_pixels;
+ const size_t width = src->p[Y_PLANE].i_visible_pitch;
+ const size_t height = src->p[Y_PLANE].i_visible_lines;
+
+ const int ypitch = src->p[Y_PLANE].i_pitch;
+ const int uvpitch = src->p[U_PLANE].i_pitch;
+ const int dpitch = dst->p->i_pitch / dst->p->i_pixel_pitch;
+
+ for (size_t j = 0; j < height; ++j)
+ {
+ const int y = j * ypitch;
+ const int u = (j>>1) * uvpitch;
+ const int d = j * dpitch;
+
+ for (size_t i = 0; i < width; ++i)
+ {
+ uint8_t Y = src->Y_PIXELS[y + i];
+ uint8_t U = src->U_PIXELS[u + (i>>1)];
+ uint8_t V = src->V_PIXELS[u + (i>>1)];
+
+ //coef = float * Precision + .5 (Precision=32768)
+ int R = CoefY[Y] + CoefRV[V];
+ int G = CoefY[Y] + CoefGU[U] + CoefGV[V];
+ int B = CoefY[Y] + CoefBU[U];
+
+ //rgb = (rgb+Precision/2) / Precision (Precision=32768)
+ R = R >> 15;
+ G = G >> 15;
+ B = B >> 15;
+
+ if (unlikely(R < 0)) R = 0;
+ if (unlikely(G < 0)) G = 0;
+ if (unlikely(B < 0)) B = 0;
+ if (unlikely(R > 255)) R = 255;
+ if (unlikely(G > 255)) G = 255;
+ if (unlikely(B > 255)) B = 255;
+
+ ((uint32_t*)out)[d + i] = R | (G<<8) | (B<<16) | (0xff<<24);
+ }
+ }
+}*/
+
+/* Converts one I420 picture to packed RGBA with the NEON kernel. */
+static void I420_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    /* Source planes in the order the kernel loads them: Y, U, V, Y pitch. */
+    struct yuv_planes planes = {
+        src->Y_PIXELS,
+        src->U_PIXELS,
+        src->V_PIXELS,
+        src->Y_PITCH,
+    };
+    /* Destination: pixels and pitch of the first plane of dst. */
+    struct yuv_pack packed = { dst->p->p_pixels, dst->p->i_pitch };
+
+    i420_rgb_neon (&packed, &planes,
+                   filter->fmt_in.video.i_width,
+                   filter->fmt_in.video.i_height);
+}
+/* Converts one YV12 picture to packed RGBA. Reuses the I420 kernel with the
+ * two chroma pointers exchanged relative to I420_RGBA. */
+static void YV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    /* Note the order: V_PIXELS fills the kernel's U slot and vice versa. */
+    struct yuv_planes planes = {
+        src->Y_PIXELS,
+        src->V_PIXELS,
+        src->U_PIXELS,
+        src->Y_PITCH,
+    };
+    /* Destination: pixels and pitch of the first plane of dst. */
+    struct yuv_pack packed = { dst->p->p_pixels, dst->p->i_pitch };
+
+    i420_rgb_neon (&packed, &planes,
+                   filter->fmt_in.video.i_width,
+                   filter->fmt_in.video.i_height);
+}
+
+/* Converts one NV21 picture to packed RGBA with the NEON kernel. */
+static void NV21_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    /* The kernel reads chroma only through the second pointer; the third is
+     * still passed to keep the argument layout. */
+    struct yuv_planes planes = {
+        src->Y_PIXELS,
+        src->U_PIXELS,
+        src->V_PIXELS,
+        src->Y_PITCH,
+    };
+    /* Destination: pixels and pitch of the first plane of dst. */
+    struct yuv_pack packed = { dst->p->p_pixels, dst->p->i_pitch };
+
+    nv21_rgb_neon (&packed, &planes,
+                   filter->fmt_in.video.i_width,
+                   filter->fmt_in.video.i_height);
+}
+
+/* Converts one NV12 picture to packed RGBA with the NEON kernel. */
+static void NV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    /* The kernel reads chroma only through the second pointer; the third is
+     * still passed to keep the argument layout. */
+    struct yuv_planes planes = {
+        src->Y_PIXELS,
+        src->U_PIXELS,
+        src->V_PIXELS,
+        src->Y_PITCH,
+    };
+    /* Destination: pixels and pitch of the first plane of dst. */
+    struct yuv_pack packed = { dst->p->p_pixels, dst->p->i_pitch };
+
+    nv12_rgb_neon (&packed, &planes,
+                   filter->fmt_in.video.i_width,
+                   filter->fmt_in.video.i_height);
+}
+
+/* Instantiate one wrapper per converter; Open() installs the resulting
+ * <name>_Filter functions as the filter's pf_video_filter callback. */
+VIDEO_FILTER_WRAPPER (I420_RGBA)
+VIDEO_FILTER_WRAPPER (YV12_RGBA)
+VIDEO_FILTER_WRAPPER (NV21_RGBA)
+VIDEO_FILTER_WRAPPER (NV12_RGBA)
+
+/**
+ * Probes the filter and installs the matching NEON YUV->RGBA converter.
+ *
+ * The filter is accepted only when the CPU reports NEON support, input and
+ * output have the same even width and height, the input chroma is I420,
+ * YV12, NV21 or NV12, and the output is RGB32 with the channel masks that
+ * match the byte order written by the assembly (R, G, B, A in memory).
+ *
+ * @param obj the filter object being opened
+ * @return VLC_SUCCESS if a converter was installed, VLC_EGENERIC otherwise
+ */
+static int Open (vlc_object_t *obj)
+{
+    filter_t *filter = (filter_t *)obj;
+
+    /* The converters are NEON assembly: refuse to load on a CPU without
+     * NEON instead of faulting on the first picture. */
+    if (!(vlc_CPU() & CPU_CAPABILITY_NEON))
+        return VLC_EGENERIC;
+
+    /* The kernels work on 2x2 pixel blocks and do not scale. */
+    if (((filter->fmt_in.video.i_width | filter->fmt_in.video.i_height) & 1)
+     || (filter->fmt_in.video.i_width != filter->fmt_out.video.i_width)
+     || (filter->fmt_in.video.i_height != filter->fmt_out.video.i_height))
+        return VLC_EGENERIC;
+
+    switch (filter->fmt_out.video.i_chroma)
+    {
+        case VLC_CODEC_RGB32:
+            /* The kernels always store bytes as R, G, B, A; reject any
+             * other channel layout rather than output swapped colours. */
+            if (filter->fmt_out.video.i_rmask != 0x000000ff
+             || filter->fmt_out.video.i_gmask != 0x0000ff00
+             || filter->fmt_out.video.i_bmask != 0x00ff0000)
+                return VLC_EGENERIC;
+
+            switch (filter->fmt_in.video.i_chroma)
+            {
+                case VLC_CODEC_I420:
+                    filter->pf_video_filter = I420_RGBA_Filter;
+                    break;
+                case VLC_CODEC_YV12:
+                    filter->pf_video_filter = YV12_RGBA_Filter;
+                    break;
+                case VLC_CODEC_NV21:
+                    filter->pf_video_filter = NV21_RGBA_Filter;
+                    break;
+                case VLC_CODEC_NV12:
+                    filter->pf_video_filter = NV12_RGBA_Filter;
+                    break;
+                default:
+                    return VLC_EGENERIC;
+            }
+            break;
+
+        default:
+            return VLC_EGENERIC;
+    }
+
+    //precompute some values for the C version
+    /*const int coefY = (int)(1.164 * 32768 + 0.5);
+    const int coefRV = (int)(1.793 * 32768 + 0.5);
+    const int coefGU = (int)(0.213 * 32768 + 0.5);
+    const int coefGV = (int)(0.533 * 32768 + 0.5);
+    const int coefBU = (int)(2.113 * 32768 + 0.5);
+    for (int i=0; i<256; ++i)
+    {
+        CoefY[i] = coefY * (i-16) + 16384;
+        CoefRV[i] = coefRV*(i-128);
+        CoefGU[i] = -coefGU*(i-128);
+        CoefGV[i] = -coefGV*(i-128);
+        CoefBU[i] = coefBU*(i-128);
+    }*/
+
+    msg_Dbg(filter, "%4.4s(%dx%d) to %4.4s(%dx%d)",
+            (char*)&filter->fmt_in.video.i_chroma, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height,
+            (char*)&filter->fmt_out.video.i_chroma, filter->fmt_out.video.i_width, filter->fmt_out.video.i_height);
+
+    return VLC_SUCCESS;
+}
--
1.7.8.3
More information about the vlc-devel
mailing list