[vlc-devel] [PATCH 2/2] arm: Allow building assembly in thumb mode

Wed Feb 21 13:20:35 CET 2018

Windows on ARM is Thumb-2 only.

Add the necessary "it" instructions before conditionally executed
instructions (these don't emit any extra instructions when not
building in thumb mode). The number of "it" instructions could
be reduced in some places by reordering the instructions, but the
instructions are kept in their original order to avoid any impact
on existing targets.

Add #ifndef _WIN32 around a few ".arm" directives, since we shouldn't
force the assembler into ARM mode for this target.
---
 modules/arm_neon/amplify.S                   | 3 +++
 modules/arm_neon/deinterleave_chroma.S       | 1 +
 modules/arm_neon/i420_rgb.S                  | 3 +++
 modules/arm_neon/i420_rv16.S                 | 3 +++
 modules/arm_neon/i420_yuyv.S                 | 4 ++++
 modules/arm_neon/i422_yuyv.S                 | 2 ++
 modules/arm_neon/nv12_rgb.S                  | 3 +++
 modules/arm_neon/nv21_rgb.S                  | 3 +++
 modules/arm_neon/yuyv_i422.S                 | 2 ++
 modules/video_filter/deinterlace/merge_arm.S | 6 ++++++
 10 files changed, 30 insertions(+)

diff --git a/modules/arm_neon/amplify.S b/modules/arm_neon/amplify.S
index 9e655af..7a8adf7 100644
--- a/modules/arm_neon/amplify.S
+++ b/modules/arm_neon/amplify.S
@@ -21,7 +21,9 @@
 #include "asm.S"
 
 	.syntax	unified
+#ifndef _WIN32
 	.arm
+#endif
 #if HAVE_AS_FPU_DIRECTIVE
 	.fpu	neon
 #endif
@@ -33,6 +35,7 @@
 	.align 2
 function amplify_float_arm_neon
 	cmp		SIZE,	#0
+	it		eq
 	bxeq		lr
 #ifdef __ARM_PCS
 	vmov		s0,	r3	@ softfp
diff --git a/modules/arm_neon/deinterleave_chroma.S b/modules/arm_neon/deinterleave_chroma.S
index 9cd01c7..7c0723e 100644
--- a/modules/arm_neon/deinterleave_chroma.S
+++ b/modules/arm_neon/deinterleave_chroma.S
@@ -52,6 +52,7 @@ function deinterleave_chroma_neon
 	sub		IPAD,	IPITCH,	WIDTH, lsl #1
 	sub		OPAD,	OPITCH,	WIDTH
 1:
+	ite		gt
 	movsgt		COUNT,	WIDTH
 	pople		{r4-r6,pc}
 2:
diff --git a/modules/arm_neon/i420_rgb.S b/modules/arm_neon/i420_rgb.S
index 54fb387..6624c4e 100644
--- a/modules/arm_neon/i420_rgb.S
+++ b/modules/arm_neon/i420_rgb.S
@@ -95,6 +95,7 @@ function i420_rgb_neon
 	/* round the width to be a multiple of 16 */
 	ands		OPAD, WIDTH, #15
 	sub			WIDTH, WIDTH, OPAD
+	it		ne
 	addne		WIDTH, WIDTH, #16
 
 	/* init constants (scale value by 64) */
@@ -115,10 +116,12 @@ function i420_rgb_neon
 	sub			YPAD,	YPITCH,	WIDTH
 
 loop_row:
+	it	gt
 	movsgt	COUNT,	WIDTH
 	add		O2,	O1,	OPITCH
 	add		Y2,	Y1,	YPITCH
 	/* exit if all rows have been processed */
+	itt	le
 	vpople	{q4-q7}
 	pople	{r4-r8,r10-r11,pc}
 
diff --git a/modules/arm_neon/i420_rv16.S b/modules/arm_neon/i420_rv16.S
index 15d1e7b..a3bbae2 100644
--- a/modules/arm_neon/i420_rv16.S
+++ b/modules/arm_neon/i420_rv16.S
@@ -98,6 +98,7 @@ function i420_rv16_neon
 	/* round the width to be a multiple of 16 */
 	ands		OPAD, WIDTH, #15
 	sub			WIDTH, WIDTH, OPAD
+	it		ne
 	addne		WIDTH, WIDTH, #16
 
 	/* init constants (scale value by 64) */
@@ -117,10 +118,12 @@ function i420_rv16_neon
 	sub			YPAD,	YPITCH,	WIDTH
 
 loop_row:
+	it	gt
 	movsgt	COUNT,	WIDTH
 	add		O2,	O1,	OPITCH
 	add		Y2,	Y1,	YPITCH
 	/* exit if all rows have been processed */
+	itt	le
 	vpople	{q4-q7}
 	pople	{r4-r8,r10-r11,pc}
 
diff --git a/modules/arm_neon/i420_yuyv.S b/modules/arm_neon/i420_yuyv.S
index 29668e4..22355e0 100644
--- a/modules/arm_neon/i420_yuyv.S
+++ b/modules/arm_neon/i420_yuyv.S
@@ -49,9 +49,11 @@ function i420_yuyv_neon
 	sub		OPAD,	OPITCH,	WIDTH,	lsl #1
 	sub		YPAD,	YPITCH,	WIDTH
 1:
+	it		gt
 	movsgt		COUNT,	WIDTH
 	add		O2,	O1,	OPITCH
 	add		Y2,	Y1,	YPITCH
+	it		le
 	pople		{r4-r8,r10-r11,pc}
 2:
 	pld		[U, #64]
@@ -86,9 +88,11 @@ function i420_uyvy_neon
 	sub		OPAD,	OPITCH,	WIDTH,	lsl #1
 	sub		YPAD,	YPITCH,	WIDTH
 1:
+	it		gt
 	movsgt		COUNT,	WIDTH
 	add		O2,	O1,	OPITCH
 	add		Y2,	Y1,	YPITCH
+	it		le
 	pople		{r4-r8,r10-r11,pc}
 2:
 	pld		[U, #64]
diff --git a/modules/arm_neon/i422_yuyv.S b/modules/arm_neon/i422_yuyv.S
index 9119839..9a5b8fc 100644
--- a/modules/arm_neon/i422_yuyv.S
+++ b/modules/arm_neon/i422_yuyv.S
@@ -45,6 +45,7 @@ function i422_yuyv_neon
 	sub		OPAD,	OPAD,	WIDTH,	lsl #1
 	sub		YPAD,	YPAD,	WIDTH
 1:
+	ite		gt
 	movsgt		COUNT,	WIDTH
 	pople		{r4-r6,pc}
 2:
@@ -76,6 +77,7 @@ function i422_uyvy_neon
 	sub		OPAD,	OPAD,	WIDTH,	lsl #1
 	sub		YPAD,	YPAD,	WIDTH
 1:
+	ite		gt
 	movsgt		COUNT,	WIDTH
 	pople		{r4-r6,pc}
 2:
diff --git a/modules/arm_neon/nv12_rgb.S b/modules/arm_neon/nv12_rgb.S
index 1bb924f..ceef76c 100644
--- a/modules/arm_neon/nv12_rgb.S
+++ b/modules/arm_neon/nv12_rgb.S
@@ -91,6 +91,7 @@ function nv12_rgb_neon
 	/* round the width to be a multiple of 16 */
 	ands		OPAD, WIDTH, #15
 	sub			WIDTH, WIDTH, OPAD
+	it		ne
 	addne		WIDTH, WIDTH, #16
 
 	/* init constants (scale value by 64) */
@@ -111,10 +112,12 @@ function nv12_rgb_neon
 	sub			YPAD,	YPITCH,	WIDTH
 
 loop_row:
+	it	gt
 	movsgt	COUNT,	WIDTH
 	add		O2,	O1,	OPITCH
 	add		Y2,	Y1,	YPITCH
 	/* exit if all rows have been processed */
+	itt	le
 	vpople	{q4-q7}
 	pople	{r4-r8,r10-r11,pc}
 
diff --git a/modules/arm_neon/nv21_rgb.S b/modules/arm_neon/nv21_rgb.S
index f775b5a..0d75b9f 100644
--- a/modules/arm_neon/nv21_rgb.S
+++ b/modules/arm_neon/nv21_rgb.S
@@ -91,6 +91,7 @@ function nv21_rgb_neon
 	/* round the width to be a multiple of 16 */
 	ands		OPAD, WIDTH, #15
 	sub			WIDTH, WIDTH, OPAD
+	it		ne
 	addne		WIDTH, WIDTH, #16
 
 	/* init constants (scale value by 64) */
@@ -111,10 +112,12 @@ function nv21_rgb_neon
 	sub			YPAD,	YPITCH,	WIDTH
 
 loop_row:
+	it	gt
 	movsgt	COUNT,	WIDTH
 	add		O2,	O1,	OPITCH
 	add		Y2,	Y1,	YPITCH
 	/* exit if all rows have been processed */
+	itt	le
 	vpople	{q4-q7}
 	pople	{r4-r8,r10-r11,pc}
 
diff --git a/modules/arm_neon/yuyv_i422.S b/modules/arm_neon/yuyv_i422.S
index 637effe..62d826c 100644
--- a/modules/arm_neon/yuyv_i422.S
+++ b/modules/arm_neon/yuyv_i422.S
@@ -45,6 +45,7 @@ function yuyv_i422_neon
 	sub		YPAD,	YPAD,	WIDTH
 	sub		IPAD,	IPAD,	WIDTH,	lsl #1
 1:
+	ite		gt
 	movsgt		COUNT,	WIDTH
 	pople		{r4-r6,pc}
 2:
@@ -74,6 +75,7 @@ function uyvy_i422_neon
 	sub		YPAD,	YPAD,	WIDTH
 	sub		IPAD,	IPAD,	WIDTH,	lsl #1
 1:
+	ite		gt
 	movsgt		COUNT,	WIDTH
 	pople		{r4-r6,pc}
 2:
diff --git a/modules/video_filter/deinterlace/merge_arm.S b/modules/video_filter/deinterlace/merge_arm.S
index d3f32c5..0ae661e 100644
--- a/modules/video_filter/deinterlace/merge_arm.S
+++ b/modules/video_filter/deinterlace/merge_arm.S
@@ -21,7 +21,9 @@
 #include "../arm_neon/asm.S"
 
 	.syntax	unified
+#ifndef _WIN32
 	.arm
+#endif
 #if HAVE_AS_ARCH_DIRECTIVE
 	.arch	armv6
 #endif
@@ -67,6 +69,7 @@ function merge8_arm_neon
 	vst1.u8		{q0-q1},	[DEST,:128]!
 3:
 	cmp		SIZE,	#16
+	it		lo
 	bxlo		lr
 	vld1.u8		{q0},		[SRC1,:128]!
 	sub		SIZE,	SIZE,	#16
@@ -106,6 +109,7 @@ function merge16_arm_neon
 	vst1.u16	{q0-q1},	[DEST,:128]!
 3:
 	cmp		SIZE,	#16
+	it		lo
 	bxlo		lr
 	vld1.u16	{q0},		[SRC1,:128]!
 	sub		SIZE,	SIZE,	#16
@@ -131,6 +135,7 @@ function merge8_armv6
 	stm		DEST!,	{r4-r5}
 	uhadd8		r7,	r7,	lr
 	stm		DEST!,	{r6-r7}
+	it		eq
 	popeq		{r4-r9,pc}
 	b		1b
 
@@ -151,5 +156,6 @@ function merge16_armv6
 	stm		DEST!,	{r4-r5}
 	uhadd16		r7,	r7,	lr
 	stm		DEST!,	{r6-r7}
+	it		eq
 	popeq		{r4-r9,pc}
 	b		1b
-- 
2.7.4