[vlc-devel] [PATCH] arm neon i420_yuyv/i420_uyvy/s32_s16 : a bit less cycles (untested)
Rafaël Carré
rafael.carre at gmail.com
Fri Jun 24 03:10:34 CEST 2011
i420_*: Do not push lr on the stack; return with bx lr instead.
This should take the same number of cycles while using less stack memory
(I can't check, as I can't find the PDF listing the number of cycles per instruction).
Also: sub+cmp -> subs
s32_s16: sub+cmp -> subs
ldr+add -> ldr with post-increment
add+strh (negative offset) -> strh with post-increment
---
modules/arm_neon/i420_yuyv.S | 20 ++++++++++----------
modules/arm_neon/s32_s16.S | 10 ++++------
2 files changed, 14 insertions(+), 16 deletions(-)
diff --git a/modules/arm_neon/i420_yuyv.S b/modules/arm_neon/i420_yuyv.S
index 556680f..6a6bd4d 100644
--- a/modules/arm_neon/i420_yuyv.S
+++ b/modules/arm_neon/i420_yuyv.S
@@ -36,8 +36,8 @@
.global i420_yuyv_neon
.type i420_yuyv_neon, %function
i420_yuyv_neon:
- push {r4-r8, lr}
- ldr HEIGHT, [sp, #(4*6)]
+ push {r4-r8}
+ ldr HEIGHT, [sp, #(4*5)]
ldmia r1, {Y1, U, V}
add O2, O1, PITCH, lsl #1
add Y2, Y1, PITCH
@@ -64,7 +64,6 @@ i420_yuyv_neon:
cmp O1, END_O1
bne 2b
- sub HEIGHT, #2
mov O1, O2
add O2, PITCH, lsl #1
add Y2, S_OFF
@@ -74,16 +73,17 @@ i420_yuyv_neon:
add U, S_OFF, lsr #1
add V, S_OFF, lsr #1
- cmp HEIGHT, #0
+ subs HEIGHT, #2
bne 1b
- pop {r4-r8, pc}
+ pop {r4-r8}
+ bx lr
.global i420_uyvy_neon
.type i420_uyvy_neon, %function
i420_uyvy_neon:
- push {r4-r8, lr}
- ldr HEIGHT, [sp, #(4*6)]
+ push {r4-r8}
+ ldr HEIGHT, [sp, #(4*5)]
ldmia r1, {Y1, U, V}
add O2, O1, PITCH, lsl #1
add Y2, Y1, PITCH
@@ -109,7 +109,6 @@ i420_uyvy_neon:
cmp O1, END_O1
bne 2b
- sub HEIGHT, #2
mov O1, O2
add O2, PITCH, lsl #1
add Y2, S_OFF
@@ -119,7 +118,8 @@ i420_uyvy_neon:
add U, S_OFF, lsr #1
add V, S_OFF, lsr #1
- cmp HEIGHT, #0
+ subs HEIGHT, #2
bne 1b
- pop {r4-r8, pc}
+ pop {r4-r8}
+ bx lr
diff --git a/modules/arm_neon/s32_s16.S b/modules/arm_neon/s32_s16.S
index 88effca..4901c1a 100644
--- a/modules/arm_neon/s32_s16.S
+++ b/modules/arm_neon/s32_s16.S
@@ -86,15 +86,13 @@ s32_s16_neon:
@ Input must be on 32-bits boundary, output on 16-bits
s32_s16_neon_unaligned:
mov HALF, #4096
-1:
cmp N, #0
+1:
bxeq lr
- ldr BUF, [IN]
- add IN, #4
- add OUT, #2
+ ldr BUF, [IN], #4
qadd BUF, HALF, BUF
- sub N, #1
ssat BUF, #16, BUF, asr #13
- strh BUF, [OUT, #-2]
+ strh BUF, [OUT], #2
+ subs N, #1
b 1b
--
1.7.4.1
More information about the vlc-devel
mailing list