[x264-devel] [PATCH 2/2] arm: optimize plane_copy_swap
Janne Grunau
janne-x264 at jannau.net
Fri May 23 13:39:11 CEST 2014
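Replace the de-interleaving vld2.8/vswp/vst2.8 sequence with plain vld1.8/vst1.8 and a vrev16.8 byte swap on each register.
~10% faster on Cortex-A9.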
---
common/arm/mc-a.S | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index f2305b8..4218fc4 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1605,10 +1605,11 @@ function x264_plane_copy_swap_neon
sub r1, r1, lr, lsl #1
sub r3, r3, lr, lsl #1
1:
- vld2.8 {q0, q1}, [r2]!
+ vld1.8 {q0, q1}, [r2]!
subs lr, lr, #16
- vswp q0, q1
- vst2.8 {q0, q1}, [r0]!
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vst1.8 {q0, q1}, [r0]!
bgt 1b
subs r5, r5, #1
--
1.9.3
More information about the x264-devel
mailing list