[vlc-devel] [PATCH] i420->rv32 neon: improve scheduling & register usage
Sébastien Toque
xilasz at gmail.com
Tue Mar 12 18:20:00 CET 2013
---
modules/arm_neon/i420_rgb.S | 112 ++++++++++++++++++++++---------------------
1 file changed, 58 insertions(+), 54 deletions(-)
diff --git a/modules/arm_neon/i420_rgb.S b/modules/arm_neon/i420_rgb.S
index db955e9..a512b5f 100644
--- a/modules/arm_neon/i420_rgb.S
+++ b/modules/arm_neon/i420_rgb.S
@@ -50,16 +50,20 @@
#define u D24
#define v D25
-#define y1 D28
-#define y2 D29
+#define y1 D18
+#define y2 D19
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
-#define red Q9
-#define green Q10
-#define blue Q11
-#define lumi Q15
+#define lumi1 Q15
+#define lumi2 Q10
+#define red16_1 Q9
+#define green16_1 Q10
+#define blue16_1 Q11
+#define red16_2 Q12
+#define green16_2 Q13
+#define blue16_2 Q14
#define red1 D24
#define green1 D25
@@ -123,69 +127,69 @@ loop_col:
vld1.u8 {u}, [U,:64]!
vld1.u8 {v}, [V,:64]!
- vmull.u8 chro_r, v, coefRV
- vmull.u8 chro_g, u, coefGU
- vmlal.u8 chro_g, v, coefGV
- vmull.u8 chro_b, u, coefBU
+ /* Y Top Row */
+ vld2.u8 {y1,y2}, [Y1,:128]!
- vadd.s16 chro_r, Rc, chro_r
- vsub.s16 chro_g, Gc, chro_g
- vadd.s16 chro_b, Bc, chro_b
+ vmull.u8 Q14, v, coefRV
+ vmull.u8 Q11, u, coefGU
+ vmull.u8 Q13, u, coefBU
+ vmlal.u8 Q11, v, coefGV
+
+ vmull.u8 lumi2, y2, coefY
+ vmull.u8 lumi1, y1, coefY
+ vadd.s16 chro_r, Rc, Q14
+ vadd.s16 chro_b, Bc, Q13
+ vsub.s16 chro_g, Gc, Q11
pld [U]
pld [V]
- /* Y Top Row */
- vld2.u8 {y1,y2}, [Y1,:128]!
-
- /* y1 : chrominance + luminance, then clamp (divide by 64) */
- vmull.u8 lumi, y1, coefY
- vqadd.s16 red, lumi, chro_r
- vqadd.s16 green, lumi, chro_g
- vqadd.s16 blue, lumi, chro_b
- vqrshrun.s16 red1, red, #6
- vqrshrun.s16 green1, green, #6
- vqrshrun.s16 blue1, blue, #6
-
- /* y2 : chrominance + luminance, then clamp (divide by 64) */
- vmull.u8 lumi, y2, coefY
- vqadd.s16 red, lumi, chro_r
- vqadd.s16 green, lumi, chro_g
- vqadd.s16 blue, lumi, chro_b
- vqrshrun.s16 red2, red, #6
- vqrshrun.s16 green2, green, #6
- vqrshrun.s16 blue2, blue, #6
+ /* chrominance + luminance */
+ vqadd.s16 red16_2, lumi2, chro_r
+ vqadd.s16 blue16_2, lumi2, chro_b
+ vqadd.s16 green16_2, lumi2, chro_g
+ vqadd.s16 red16_1, lumi1, chro_r
+ vqadd.s16 green16_1, lumi1, chro_g
+ vqadd.s16 blue16_1, lumi1, chro_b
+
+ /* clamp (divide by 64) */
+ vqrshrun.s16 blue2, blue16_2, #6
+ vqrshrun.s16 red2, red16_2, #6
+ vqrshrun.s16 green2, green16_2, #6
+ vqrshrun.s16 red1, red16_1, #6
+ vqrshrun.s16 green1, green16_1, #6
+ vqrshrun.s16 blue1, blue16_1, #6
pld [Y1]
- vmov.u8 alpha2, #255
+ /* Y Bottom Row */
+ vld2.u8 {y1,y2}, [Y2,:128]!
+
+ vmov.u8 alpha1, #255
vzip.u8 red1, red2
vzip.u8 green1, green2
vzip.u8 blue1, blue2
+ vmull.u8 lumi2, y2, coefY
vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
- /* Y Bottom Row */
- vld2.u8 {y1,y2}, [Y2,:128]!
-
- /* y1 : chrominance + luminance, then clamp (divide by 64) */
- vmull.u8 lumi, y1, coefY
- vqadd.s16 red, lumi, chro_r
- vqadd.s16 green, lumi, chro_g
- vqadd.s16 blue, lumi, chro_b
- vqrshrun.s16 red1, red, #6
- vqrshrun.s16 green1, green, #6
- vqrshrun.s16 blue1, blue, #6
-
- /* y2 : chrominance + luminance, then clamp (divide by 64) */
- vmull.u8 lumi, y2, coefY
- vqadd.s16 red, lumi, chro_r
- vqadd.s16 green, lumi, chro_g
- vqadd.s16 blue, lumi, chro_b
- vqrshrun.s16 red2, red, #6
- vqrshrun.s16 green2, green, #6
- vqrshrun.s16 blue2, blue, #6
+ /* chrominance + luminance */
+ vmull.u8 lumi1, y1, coefY
+ vqadd.s16 red16_2, lumi2, chro_r
+ vqadd.s16 green16_2, lumi2, chro_g
+ vqadd.s16 blue16_2, lumi2, chro_b
+ vqadd.s16 red16_1, lumi1, chro_r
+ vqadd.s16 green16_1, lumi1, chro_g
+ vqadd.s16 blue16_1, lumi1, chro_b
+
+ /* clamp (divide by 64) */
+ vqrshrun.s16 blue2, blue16_2, #6
+ vqrshrun.s16 red2, red16_2, #6
+ vqrshrun.s16 green2, green16_2, #6
+ vqrshrun.s16 red1, red16_1, #6
+ vqrshrun.s16 green1, green16_1, #6
+ vqrshrun.s16 blue1, blue16_1, #6
pld [Y2]
--
1.7.9.5
More information about the vlc-devel mailing list