[vlc-devel] [PATCH] arm neon i420_yuyv/i420_uyvy/s32_s16: slightly fewer cycles (untested)

Rafaël Carré rafael.carre at gmail.com
Fri Jun 24 03:10:34 CEST 2011


i420_*: Do not push lr on the stack; return with bx lr instead.
This should take the same number of cycles while using one word less of stack
(I can't check, as I can't find the PDF listing the number of cycles per instruction).

sub+cmp -> subs

s32_s16: sub+cmp -> subs
ldr+add -> ldr with post-increment
---
 modules/arm_neon/i420_yuyv.S |   20 ++++++++++----------
 modules/arm_neon/s32_s16.S   |   10 ++++------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/modules/arm_neon/i420_yuyv.S b/modules/arm_neon/i420_yuyv.S
index 556680f..6a6bd4d 100644
--- a/modules/arm_neon/i420_yuyv.S
+++ b/modules/arm_neon/i420_yuyv.S
@@ -36,8 +36,8 @@
 	.global i420_yuyv_neon
 	.type	i420_yuyv_neon, %function
 i420_yuyv_neon:
-	push		{r4-r8, lr}
-	ldr		HEIGHT, [sp, #(4*6)]
+	push		{r4-r8}
+	ldr		HEIGHT, [sp, #(4*5)]
 	ldmia		r1,	{Y1, U, V}
 	add		O2,	O1,	PITCH, lsl #1
 	add		Y2,	Y1,	PITCH
@@ -64,7 +64,6 @@ i420_yuyv_neon:
 	cmp		O1,	END_O1
 	bne		2b
 
-	sub		HEIGHT,	#2
 	mov		O1,	O2
 	add		O2,	PITCH,	lsl #1
 	add		Y2,	S_OFF
@@ -74,16 +73,17 @@ i420_yuyv_neon:
 	add		U,	S_OFF,	lsr #1
 	add		V,	S_OFF,	lsr #1
 
-	cmp		HEIGHT,	#0
+	subs		HEIGHT,	#2
 	bne		1b
 
-	pop		{r4-r8, pc}
+	pop		{r4-r8}
+	bx		lr
 
 	.global i420_uyvy_neon
 	.type	i420_uyvy_neon, %function
 i420_uyvy_neon:
-	push		{r4-r8, lr}
-	ldr		HEIGHT, [sp, #(4*6)]
+	push		{r4-r8}
+	ldr		HEIGHT, [sp, #(4*5)]
 	ldmia		r1,	{Y1, U, V}
 	add		O2,	O1,	PITCH, lsl #1
 	add		Y2,	Y1,	PITCH
@@ -109,7 +109,6 @@ i420_uyvy_neon:
 	cmp		O1,	END_O1
 	bne		2b
 
-	sub		HEIGHT,	#2
 	mov		O1,	O2
 	add		O2,	PITCH,	lsl #1
 	add		Y2,	S_OFF
@@ -119,7 +118,8 @@ i420_uyvy_neon:
 	add		U,	S_OFF,	lsr #1
 	add		V,	S_OFF,	lsr #1
 
-	cmp		HEIGHT,	#0
+	subs		HEIGHT,	#2
 	bne		1b
 
-	pop		{r4-r8, pc}
+	pop		{r4-r8}
+	bx		lr
diff --git a/modules/arm_neon/s32_s16.S b/modules/arm_neon/s32_s16.S
index 88effca..4901c1a 100644
--- a/modules/arm_neon/s32_s16.S
+++ b/modules/arm_neon/s32_s16.S
@@ -86,15 +86,13 @@ s32_s16_neon:
 	@ Input must be on 32-bits boundary, output on 16-bits
 s32_s16_neon_unaligned:
 	mov		HALF,	#4096
-1:
 	cmp		N,	#0
+1:
 	bxeq		lr
 
-	ldr		BUF,	[IN]
-	add		IN,	#4
-	add		OUT,	#2
+	ldr		BUF,	[IN], #4
 	qadd		BUF,	HALF,	BUF
-	sub		N,	#1
 	ssat		BUF,	#16,	BUF, asr #13
-	strh		BUF,	[OUT, #-2]
+	strh		BUF,	[OUT], #2
+	subs		N,	#1
 	b		1b
-- 
1.7.4.1





More information about the vlc-devel mailing list