[x264-devel] arm: Simplify x264_predict_8x8c_p_neon
Martin Storsjö
git at videolan.org
Sun Oct 11 19:01:02 CEST 2015
x264 | branch: master | Martin Storsjö <martin at martin.st> | Thu Aug 13 23:59:24 2015 +0300| [3f89a6bbee061cb0361770cf5b8495448515a011] | committer: Henrik Gramner
arm: Simplify x264_predict_8x8c_p_neon
This gets rid of a few unnecessary (and confusing) steps in
calculating the increment to i00.
checkasm timing              Cortex-A7    A8    A9
intra_predict_8x8c_p_c            5525  4732  4755
intra_predict_8x8c_p_neon         1719  1140  1262   (before)
intra_predict_8x8c_p_neon         1663  1142  1255   (after)
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=3f89a6bbee061cb0361770cf5b8495448515a011
---
common/arm/predict-a.S | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 3343144..7e5d9d3 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -535,17 +535,12 @@ function x264_predict_8x8c_p_neon
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
vsub.i16 d2, d2, d3
- vshl.i16 d3, d4, #3
vext.16 q0, q0, q0, #7
- vsub.i16 d6, d5, d3
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
- vdup.16 q2, d4[0]
- vdup.16 q3, d6[0]
- vshl.i16 q2, q2, #3
+ vdup.16 q3, d5[0]
vadd.i16 q1, q1, q0
- vadd.i16 q3, q3, q2
mov r3, #8
1:
vqshrun.s16 d0, q1, #5
More information about the x264-devel mailing list