[x264-devel] arm: Simplify x264_predict_8x8c_p_neon

Sun Oct 11 19:01:02 CEST 2015

x264 | branch: master | Martin Storsjö <martin at martin.st> | Thu Aug 13 23:59:24 2015 +0300| [3f89a6bbee061cb0361770cf5b8495448515a011] | committer: Henrik Gramner

arm: Simplify x264_predict_8x8c_p_neon

This gets rid of a few unnecessary (and confusing) steps in
calculating the increment to i00.

checkasm timing             Cortex-A7    A8    A9
intra_predict_8x8c_p_c           5525  4732  4755
intra_predict_8x8c_p_neon        1719  1140  1262  (before)
intra_predict_8x8c_p_neon        1663  1142  1255  (after)

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=3f89a6bbee061cb0361770cf5b8495448515a011
---

 common/arm/predict-a.S |    7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 3343144..7e5d9d3 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -535,17 +535,12 @@ function x264_predict_8x8c_p_neon
     vadd.i16    d16, d16, d0
     vshl.i16    d2,  d16, #4
     vsub.i16    d2,  d2,  d3
-    vshl.i16    d3,  d4,  #3
     vext.16     q0,  q0,  q0,  #7
-    vsub.i16    d6,  d5,  d3
     vmov.16     d0[0], r3
     vmul.i16    q0,  q0,  d4[0]
     vdup.16     q1,  d2[0]
-    vdup.16     q2,  d4[0]
-    vdup.16     q3,  d6[0]
-    vshl.i16    q2,  q2,  #3
+    vdup.16     q3,  d5[0]
     vadd.i16    q1,  q1,  q0
-    vadd.i16    q3,  q3,  q2
     mov         r3,  #8
 1:
     vqshrun.s16 d0,  q1,  #5