[x264-devel] aarch64: Faster intra_predict_4x4_h
Janne Grunau
git at videolan.org
Sun Oct 11 19:01:03 CEST 2015
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Tue Aug 18 10:25:10 2015 +0200| [b16268ac0826d78455d0d704ea0fc8b1edc6b6bf] | committer: Henrik Gramner
aarch64: Faster intra_predict_4x4_h
Use multiplication with 0x01010101 for splats.
On a cortex-a53:
                      gcc 4.9.2  llvm 3.6  neon (before)  neon (after)
intra_predict_4x4_h:        162       147        160/155       139/135
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b16268ac0826d78455d0d704ea0fc8b1edc6b6bf
---
common/aarch64/predict-a.S | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index a01016a..a7dd2d1 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -63,22 +63,19 @@ endconst
function x264_predict_4x4_h_aarch64, export=1
- ldrb w1, [x0, #0*FDEC_STRIDE-1]
- ldrb w2, [x0, #1*FDEC_STRIDE-1]
- ldrb w3, [x0, #2*FDEC_STRIDE-1]
- ldrb w4, [x0, #3*FDEC_STRIDE-1]
- add w1, w1, w1, lsl #8
- add w2, w2, w2, lsl #8
- add w3, w3, w3, lsl #8
- add w4, w4, w4, lsl #8
- add w1, w1, w1, lsl #16
- str w1, [x0, #0*FDEC_STRIDE]
- add w2, w2, w2, lsl #16
- str w2, [x0, #1*FDEC_STRIDE]
- add w3, w3, w3, lsl #16
- str w3, [x0, #2*FDEC_STRIDE]
- add w4, w4, w4, lsl #16
- str w4, [x0, #3*FDEC_STRIDE]
+ ldrb w1, [x0, #0*FDEC_STRIDE-1]   // w1 = byte at offset -1 from x0 + 0*FDEC_STRIDE (zero-extended)
+ mov w5, #0x01010101               // w5 = splat multiplier: b * 0x01010101 == b in each of the 4 bytes (b <= 0xff, so no carries)
+ ldrb w2, [x0, #1*FDEC_STRIDE-1]   // w2 = byte at offset -1 from x0 + 1*FDEC_STRIDE
+ ldrb w3, [x0, #2*FDEC_STRIDE-1]   // w3 = byte at offset -1 from x0 + 2*FDEC_STRIDE
+ mul w1, w1, w5                    // replicate w1's byte across the 32-bit word (replaces the old add/lsl #8 + add/lsl #16 pair)
+ ldrb w4, [x0, #3*FDEC_STRIDE-1]   // w4 = byte at offset -1 from x0 + 3*FDEC_STRIDE
+ mul w2, w2, w5                    // same splat for the second loaded byte
+ str w1, [x0, #0*FDEC_STRIDE]      // store 4 identical bytes at x0 + 0*FDEC_STRIDE
+ mul w3, w3, w5                    // same splat for the third loaded byte
+ str w2, [x0, #1*FDEC_STRIDE]      // store 4 identical bytes at x0 + 1*FDEC_STRIDE
+ mul w4, w4, w5                    // same splat for the fourth loaded byte
+ str w3, [x0, #2*FDEC_STRIDE]      // store 4 identical bytes at x0 + 2*FDEC_STRIDE
+ str w4, [x0, #3*FDEC_STRIDE]      // store 4 identical bytes at x0 + 3*FDEC_STRIDE
ret
endfunc
More information about the x264-devel
mailing list