[vlc-devel] commit: i420->YUYV NEON: rewrite using VZIP ( Rémi Denis-Courmont )

git version control git at videolan.org
Sun Sep 20 19:28:54 CEST 2009


vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Sun Sep 20 20:25:09 2009 +0300| [0e770b173c3885622990bba17e7dde67a47fcdaf] | committer: Rémi Denis-Courmont 

i420->YUYV NEON: rewrite using VZIP

This is more than twice as fast. Thanks to Måns Rullgård for the hint.

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=0e770b173c3885622990bba17e7dde67a47fcdaf
---

 modules/video_chroma/i420_yuyv_neon.S |   93 ++++++++++++++++-----------------
 1 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/modules/video_chroma/i420_yuyv_neon.S b/modules/video_chroma/i420_yuyv_neon.S
index 0fd3e83..9fd3088 100644
--- a/modules/video_chroma/i420_yuyv_neon.S
+++ b/modules/video_chroma/i420_yuyv_neon.S
@@ -1,4 +1,4 @@
- @****************************************************************************
+ @*****************************************************************************
  @ i420_yuyv_neon.S : ARM NEONv1 I420 to YUYV chroma conversion
  @*****************************************************************************
  @ Copyright (C) 2009 Rémi Denis-Courmont
@@ -14,8 +14,8 @@
  @ GNU General Public License for more details.
  @
  @ You should have received a copy of the GNU General Public License
- @ along with this program; if not, write to the Free Software
- @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  @****************************************************************************/
 
 	.fpu neon
@@ -32,41 +32,26 @@
 #define	END_O1	r12
 
 	.align
-	.global i420_uyvy_neon
-	.type	i420_uyvy_neon, %function
-i420_uyvy_neon:
-	adr		r12,	indexes+64
-	b		i420_pack_neon
-
 	.global i420_yuyv_neon
 	.type	i420_yuyv_neon, %function
 i420_yuyv_neon:
-	adr		r12,	indexes
-	.hidden	i420_pack_neon
-i420_pack_neon:
 	push		{r4-r7, lr}
-	vld1.u8		{d24-d27},	[r12]!
 	ldmia		r1,	{Y1, U, V}
-	vld1.u8		{d28-d31},	[r12]
 	add		O2,	O1,	PITCH, lsl #1
 	add		Y2,	Y1,	PITCH
 1:
 	mov		END_O1,	O2
 2:
-	vld1.u8		{d0-d1},	[Y1,:128]!
 	vld1.u8		{d2},		[U,:64]!
 	vld1.u8		{d3},		[V,:64]!
-	vld1.u8		{d4-d5},	[Y2,:128]!
-	vtbl.u8		d16,	{d0-d3},	d24
-	vtbl.u8		d17,	{d0-d3},	d25
-	vtbl.u8		d18,	{d0-d3},	d26
-	vtbl.u8		d19,	{d0-d3},	d27
-	vtbl.u8		d20,	{d2-d5},	d28
-	vtbl.u8		d21,	{d2-d5},	d29
-	vtbl.u8		d22,	{d2-d5},	d30
-	vtbl.u8		d23,	{d2-d5},	d31
-	vst1.u8		{d16-d19},	[O1,:128]!
-	vst1.u8		{d20-d23},	[O2,:128]!
+	vzip.u8		d2,	d3
+	vld1.u8		{q0},		[Y1,:128]!
+	vmov		q3,	q1
+	vzip.u8		q0,	q1
+	vld1.u8		{q2},		[Y2,:128]!
+	vzip.u8		q2,	q3
+	vst1.u8		{q0-q1},	[O1,:128]!
+	vst1.u8		{q2-q3},	[O2,:128]!
 
 	cmp		O1,	END_O1
 	bne		2b
@@ -82,25 +67,37 @@ i420_pack_neon:
 
 	pop		{r4-r7, pc}
 
-	.hidden indexes
-indexes:
-	@ YUYV1
-	.byte	0x00, 0x10, 0x01, 0x18, 0x02, 0x11, 0x03, 0x19
-	.byte	0x04, 0x12, 0x05, 0x1A, 0x06, 0x13, 0x07, 0x1B
-	.byte	0x08, 0x14, 0x09, 0x1C, 0x0A, 0x15, 0x0B, 0x1D
-	.byte	0x0C, 0x16, 0x0D, 0x1E, 0x0E, 0x17, 0x0F, 0x1F
-	@ YUYV2
-	.byte	0x10, 0x00, 0x11, 0x08, 0x12, 0x01, 0x13, 0x09
-	.byte	0x14, 0x02, 0x15, 0x0A, 0x16, 0x03, 0x17, 0x0B
-	.byte	0x18, 0x04, 0x19, 0x0C, 0x1A, 0x05, 0x1B, 0x0D
-	.byte	0x1C, 0x06, 0x1D, 0x0E, 0x1E, 0x07, 0x1F, 0x0F
-	@ UYVY1
-	.byte	0x10, 0x00, 0x18, 0x01, 0x11, 0x02, 0x19, 0x03
-	.byte	0x12, 0x04, 0x1A, 0x05, 0x13, 0x06, 0x1B, 0x07
-	.byte	0x14, 0x08, 0x1C, 0x09, 0x15, 0x0A, 0x1D, 0x0B
-	.byte	0x16, 0x0C, 0x1E, 0x0D, 0x17, 0x0E, 0x1F, 0x0F
-	@ UYVY2
-	.byte	0x00, 0x10, 0x08, 0x11, 0x01, 0x12, 0x09, 0x13
-	.byte	0x02, 0x14, 0x0A, 0x15, 0x03, 0x16, 0x0B, 0x17
-	.byte	0x04, 0x18, 0x0C, 0x19, 0x05, 0x1A, 0x0D, 0x1B
-	.byte	0x06, 0x1C, 0x0E, 0x1D, 0x07, 0x1E, 0x0F, 0x1F
+	.global i420_uyvy_neon
+	.type	i420_uyvy_neon, %function
+i420_uyvy_neon:
+	push		{r4-r7, lr}
+	ldmia		r1,	{Y1, U, V}
+	add		O2,	O1,	PITCH, lsl #1
+	add		Y2,	Y1,	PITCH
+1:
+	mov		END_O1,	O2
+2:
+	vld1.u8		{d0},		[U,:64]!
+	vld1.u8		{d1},		[V,:64]!
+	vzip.u8		d0,	d1
+	vld1.u8		{q1},		[Y1,:128]!
+	vmov		q2,	q0
+	vzip.u8		q0,	q1
+	vld1.u8		{q3},		[Y2,:128]!
+	vzip.u8		q2,	q3
+	vst1.u8		{q0-q1},	[O1,:128]!
+	vst1.u8		{q2-q3},	[O2,:128]!
+
+	cmp		O1,	END_O1
+	bne		2b
+
+	sub		HEIGHT,	#2
+	mov		O1,	O2
+	add		O2,	PITCH,	lsl #1
+	mov		Y1,	Y2
+	add		Y2,	PITCH
+
+	cmp		HEIGHT,	#0
+	bne		1b
+
+	pop		{r4-r7, pc}




More information about the vlc-devel mailing list