[x264-devel] [PATCH 01/23] aarch64: optimize x264_predict_8x8c_dc_left_neon
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:29 CET 2014
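25% faster than the previous version: the eight left-edge pixels are now
loaded with scalar ldrb and summed in general-purpose registers, replacing
the ldcol.8 load and the NEON pairwise adds (uaddlp/addp).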
---
common/aarch64/predict-a.S | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index 8b2283c..8c29d07 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -436,14 +436,25 @@ function x264_predict_8x8c_dc_top_neon, export=1
endfunc
function x264_predict_8x8c_dc_left_neon, export=1
- sub x2, x0, #1
+ ldrb w2, [x0, #0 * FDEC_STRIDE - 1]
+ ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
+ ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
+ ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
mov x1, #FDEC_STRIDE
- ldcol.8 v0, x2, x1
- uaddlp v0.4h, v0.8b
- addp v0.4h, v0.4h, v0.4h
+ add w2, w2, w3
+ add w3, w4, w5
+ ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
+ ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
+ ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
+ ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
+ add w6, w6, w7
+ add w7, w8, w9
+ add w2, w2, w3
+ add w6, w6, w7
+ dup v0.8h, w2
+ dup v1.8h, w6
rshrn v0.8b, v0.8h, #2
- dup v1.8b, v0.b[1]
- dup v0.8b, v0.b[0]
+ rshrn v1.8b, v1.8h, #2
b pred8x8c_dc_end
endfunc
--
2.1.3
More information about the x264-devel mailing list