[x264-devel] [PATCH 2/2] arm: optimize plane_copy_swap

Janne Grunau janne-x264 at jannau.net
Fri May 23 13:39:11 CEST 2014


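~10% faster on Cortex-A9. Instead of de-interleaving with vld2.8, swapping
the two registers and re-interleaving with vst2.8, load and store the data
linearly (vld1.8/vst1.8) and swap the two bytes of each 16-bit pair in
place with vrev16.8.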
---
 common/arm/mc-a.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index f2305b8..4218fc4 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1605,10 +1605,11 @@ function x264_plane_copy_swap_neon
     sub             r1,  r1,  lr, lsl #1
     sub             r3,  r3,  lr, lsl #1
 1:
-    vld2.8          {q0, q1}, [r2]!
+    vld1.8          {q0, q1}, [r2]!
     subs            lr,  lr,  #16
-    vswp            q0, q1
-    vst2.8          {q0, q1}, [r0]!
+    vrev16.8        q0,  q0
+    vrev16.8        q1,  q1
+    vst1.8          {q0, q1}, [r0]!
     bgt             1b
 
     subs            r5,  r5,  #1
-- 
1.9.3



More information about the x264-devel mailing list