[x264-devel] aarch64: optimize x264_predict_8x8c_dc_left_neon
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:42 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Thu Aug 14 23:13:27 2014 +0200| [91a01d4ca95ee1c621578e118b86d767eab96b3b] | committer: Anton Mitrofanov
aarch64: optimize x264_predict_8x8c_dc_left_neon
25% faster than the previous version.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=91a01d4ca95ee1c621578e118b86d767eab96b3b
---
common/aarch64/predict-a.S | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index 8b2283c..8c29d07 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -436,14 +436,25 @@ function x264_predict_8x8c_dc_top_neon, export=1
endfunc
function x264_predict_8x8c_dc_left_neon, export=1
- sub x2, x0, #1
+ ldrb w2, [x0, #0 * FDEC_STRIDE - 1]
+ ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
+ ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
+ ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
mov x1, #FDEC_STRIDE
- ldcol.8 v0, x2, x1
- uaddlp v0.4h, v0.8b
- addp v0.4h, v0.4h, v0.4h
+ add w2, w2, w3
+ add w3, w4, w5
+ ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
+ ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
+ ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
+ ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
+ add w6, w6, w7
+ add w7, w8, w9
+ add w2, w2, w3
+ add w6, w6, w7
+ dup v0.8h, w2
+ dup v1.8h, w6
rshrn v0.8b, v0.8h, #2
- dup v1.8b, v0.b[1]
- dup v0.8b, v0.b[0]
+ rshrn v1.8b, v1.8h, #2
b pred8x8c_dc_end
endfunc
More information about the x264-devel mailing list