[x264-devel] [PATCH 3/3] arm64: intra_predict_4x4_h: use multiplication with 0x01010101 for splat
Janne Grunau
janne-x264 at jannau.net
Tue Aug 18 10:25:10 CEST 2015
On a Cortex-A53:

                       gcc 4.9.2   llvm 3.6   neon (before)   neon (after)
intra_predict_4x4_h:   162         147        160/155         139/135
---
common/aarch64/predict-a.S | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index 465d3b4..bcc3d7a 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -63,22 +63,19 @@ endconst
function x264_predict_4x4_h_aarch64, export=1
- ldrb w1, [x0, #0*FDEC_STRIDE-1]
- ldrb w2, [x0, #1*FDEC_STRIDE-1]
- ldrb w3, [x0, #2*FDEC_STRIDE-1]
- ldrb w4, [x0, #3*FDEC_STRIDE-1]
- add w1, w1, w1, lsl #8
- add w2, w2, w2, lsl #8
- add w3, w3, w3, lsl #8
- add w4, w4, w4, lsl #8
- add w1, w1, w1, lsl #16
- str w1, [x0, #0*FDEC_STRIDE]
- add w2, w2, w2, lsl #16
- str w2, [x0, #1*FDEC_STRIDE]
- add w3, w3, w3, lsl #16
- str w3, [x0, #2*FDEC_STRIDE]
- add w4, w4, w4, lsl #16
- str w4, [x0, #3*FDEC_STRIDE]
+ ldrb w1, [x0, #0*FDEC_STRIDE-1]
+ mov w5, #0x01010101
+ ldrb w2, [x0, #1*FDEC_STRIDE-1]
+ ldrb w3, [x0, #2*FDEC_STRIDE-1]
+ mul w1, w1, w5
+ ldrb w4, [x0, #3*FDEC_STRIDE-1]
+ mul w2, w2, w5
+ str w1, [x0, #0*FDEC_STRIDE]
+ mul w3, w3, w5
+ str w2, [x0, #1*FDEC_STRIDE]
+ mul w4, w4, w5
+ str w3, [x0, #2*FDEC_STRIDE]
+ str w4, [x0, #3*FDEC_STRIDE]
ret
endfunc
--
2.5.0
More information about the x264-devel mailing list