[x264-devel] [PATCH 01/23] aarch64: optimize x264_predict_8x8c_dc_left_neon

Janne Grunau janne-x264 at jannau.net
Thu Nov 27 08:56:29 CET 2014


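Load the eight bytes at [x0 + i * FDEC_STRIDE - 1] (i = 0..7) with scalar
ldrb and sum each group of four in general-purpose registers, instead of
using ldcol.8 followed by uaddlp/addp on the vector side.

25% faster than the previous version.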
---
 common/aarch64/predict-a.S | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index 8b2283c..8c29d07 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -436,14 +436,25 @@ function x264_predict_8x8c_dc_top_neon, export=1
 endfunc
 
 function x264_predict_8x8c_dc_left_neon, export=1
-    sub         x2,  x0,  #1
+    ldrb        w2,  [x0, #0 * FDEC_STRIDE - 1]
+    ldrb        w3,  [x0, #1 * FDEC_STRIDE - 1]
+    ldrb        w4,  [x0, #2 * FDEC_STRIDE - 1]
+    ldrb        w5,  [x0, #3 * FDEC_STRIDE - 1]
     mov         x1,  #FDEC_STRIDE
-    ldcol.8     v0,  x2,  x1
-    uaddlp      v0.4h,  v0.8b
-    addp        v0.4h,  v0.4h,  v0.4h
+    add         w2,  w2,  w3
+    add         w3,  w4,  w5
+    ldrb        w6,  [x0, #4 * FDEC_STRIDE - 1]
+    ldrb        w7,  [x0, #5 * FDEC_STRIDE - 1]
+    ldrb        w8,  [x0, #6 * FDEC_STRIDE - 1]
+    ldrb        w9,  [x0, #7 * FDEC_STRIDE - 1]
+    add         w6,  w6,  w7
+    add         w7,  w8,  w9
+    add         w2,  w2,  w3
+    add         w6,  w6,  w7
+    dup         v0.8h,  w2
+    dup         v1.8h,  w6
     rshrn       v0.8b,  v0.8h,  #2
-    dup         v1.8b,  v0.b[1]
-    dup         v0.8b,  v0.b[0]
+    rshrn       v1.8b,  v1.8h,  #2
     b           pred8x8c_dc_end
 endfunc
 
-- 
2.1.3



More information about the x264-devel mailing list