[x264-devel] aarch64: Faster intra_predict_4x4_h

Sun Oct 11 19:01:03 CEST 2015

x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Tue Aug 18 10:25:10 2015 +0200| [b16268ac0826d78455d0d704ea0fc8b1edc6b6bf] | committer: Henrik Gramner

aarch64: Faster intra_predict_4x4_h

Use multiplication by 0x01010101 to splat each loaded byte across a 32-bit word.

On a Cortex-A53:
                     gcc 4.9.2   llvm 3.6   neon (before)   neon (after)
intra_predict_4x4_h: 162         147        160/155         139/135

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b16268ac0826d78455d0d704ea0fc8b1edc6b6bf
---

 common/aarch64/predict-a.S |   29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index a01016a..a7dd2d1 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -63,22 +63,19 @@ endconst
 
 
 function x264_predict_4x4_h_aarch64, export=1
-    ldrb    w1, [x0, #0*FDEC_STRIDE-1]
-    ldrb    w2, [x0, #1*FDEC_STRIDE-1]
-    ldrb    w3, [x0, #2*FDEC_STRIDE-1]
-    ldrb    w4, [x0, #3*FDEC_STRIDE-1]
-    add     w1, w1, w1, lsl #8
-    add     w2, w2, w2, lsl #8
-    add     w3, w3, w3, lsl #8
-    add     w4, w4, w4, lsl #8
-    add     w1, w1, w1, lsl #16
-    str     w1, [x0, #0*FDEC_STRIDE]
-    add     w2, w2, w2, lsl #16
-    str     w2, [x0, #1*FDEC_STRIDE]
-    add     w3, w3, w3, lsl #16
-    str     w3, [x0, #2*FDEC_STRIDE]
-    add     w4, w4, w4, lsl #16
-    str     w4, [x0, #3*FDEC_STRIDE]
+    ldrb    w1,  [x0, #0*FDEC_STRIDE-1]
+    mov     w5,  #0x01010101
+    ldrb    w2,  [x0, #1*FDEC_STRIDE-1]
+    ldrb    w3,  [x0, #2*FDEC_STRIDE-1]
+    mul     w1,  w1,  w5
+    ldrb    w4,  [x0, #3*FDEC_STRIDE-1]
+    mul     w2,  w2,  w5
+    str     w1,  [x0, #0*FDEC_STRIDE]
+    mul     w3,  w3,  w5
+    str     w2,  [x0, #1*FDEC_STRIDE]
+    mul     w4,  w4,  w5
+    str     w3,  [x0, #2*FDEC_STRIDE]
+    str     w4,  [x0, #3*FDEC_STRIDE]
     ret
 endfunc