[x264-devel] [PATCH 1/1] arm: optimize mbtree_propagate_list_neon
Janne Grunau
janne-x264 at jannau.net
Fri Aug 21 14:04:14 CEST 2015
Avoid pushing anything to the stack and interleave the calculations. The
latter is probably responsible for the speedup.
Cortex-A9 (cycles):          GCC 4.8.4   neon (before)   neon (after)
mbtree_propagate_list:          106820           74059          72360
---
common/arm/mc-a.S | 105 ++++++++++++++++++++++++++----------------------------
1 file changed, 50 insertions(+), 55 deletions(-)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index beae5bd..8f74ef2 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1821,70 +1821,65 @@ function x264_mbtree_propagate_cost_neon
endfunc
function x264_mbtree_propagate_list_internal_neon
- vpush {q4-q7}
- push {r4-r6}
- ldrd r4, r5, [sp, #76]
- ldr r6, [sp, #84]
+ vld2.16 {d4[], d5[]}, [sp] @ bipred_weight, mb_y
movrel r12, pw_0to15
- vdup.16 d12, r4 @ bipred_weight
- vmov.u16 q4, #0xc000
+ vmov.u16 q10, #0xc000
vld1.16 {q0}, [r12, :128] @h->mb.i_mb_x,h->mb.i_mb_y
- vmov.u32 q5, #4
- vmov.u16 q2, #31
- vmov.u16 q3, #32
- vdup.u16 q8, r5 @ mb_y
+ vmov.u32 q11, #4
+ vmov.u8 q3, #32
+ vdup.u16 q8, d5[0] @ mb_y
vzip.u16 q0, q8
+ ldr r12, [sp, #8]
8:
- subs r6, r6, #8
- vld1.16 {q8}, [r1, :128]! @ propagate_amount
- vld1.16 {q9}, [r2, :128]! @ lowres_cost
- vand q9, q9, q4
- vceq.u16 q1, q9, q4
- vmull.u16 q10, d16, d12
- vmull.u16 q11, d17, d12
- vrshrn.u32 d20, q10, #6
- vrshrn.u32 d21, q11, #6
- vbsl q1, q10, q8 @ if( lists_used == 3 )
- @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ subs r12, r12, #8
+ vld1.16 {q14}, [r1, :128]! @ propagate_amount
+ vld1.16 {q15}, [r2, :128]! @ lowres_cost
vld1.16 {q8, q9}, [r0, :128]!
- vshr.s16 q10, q8, #5
- vshr.s16 q11, q9, #5
- vadd.s16 q10, q10, q0
- vadd.s16 q0, q0, q5
- vadd.s16 q11, q11, q0
- vadd.s16 q0, q0, q5
- vst1.16 {q10, q11}, [r3, :128]!
- vand q8, q8, q2
- vand q9, q9, q2
+ vand q15, q15, q10
+ vceq.u16 q1, q15, q10
+ vmull.u16 q12, d28, d4
+ vmull.u16 q13, d29, d4
+ vrshrn.u32 d30, q12, #6
+ vrshrn.u32 d31, q13, #6
+ vbsl q1, q15, q14 @ if( lists_used == 3 )
+ @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ vshr.s16 q12, q8, #5
+ vshr.s16 q13, q9, #5
vuzp.16 q8, q9 @ x & 31, y & 31
- vsub.s16 q10, q3, q8 @ 32 - (x & 31)
- vsub.s16 q11, q3, q9 @ 32 - (y & 31)
- vmul.u16 q12, q8, q9 @ idx3weight = y*x
- vmul.u16 q13, q10, q9 @ idx2weight = y*(32-x)
- vmul.u16 q14, q8, q11 @ idx1weight = (32-y)*x
- vmul.u16 q15, q10, q11 @ idx0weight = (32-y)*(32-x)
- vmull.u16 q8, d24, d2 @ idx3weight
- vmull.u16 q9, d25, d3
- vmull.u16 q10, d26, d2 @ idx2weight
- vmull.u16 q11, d27, d3
- vmull.u16 q12, d28, d2 @ idx1weight
- vmull.u16 q13, d29, d3
+ vadd.s16 q12, q12, q0
+ vadd.s16 q0, q0, q11
+ vmovn.i16 d16, q8
+ vmovn.i16 d17, q9
+ vadd.s16 q13, q13, q0
+ vbic.i16 q8, q8, #128+64+32
+ vadd.s16 q0, q0, q11
+ vbic.i16 q8, q8, #(128+64+32)<<8
+ vst1.16 {q12, q13}, [r3, :128]!
+ vsub.i8 q9, q3, q8
+ vmull.u8 q12, d17, d16 @ idx3weight = y*x
+ vmull.u8 q14, d19, d16 @ idx1weight = (32-y)*x
+ vmull.u8 q15, d19, d18 @ idx0weight = (32-y)*(32-x)
+ vmull.u8 q13, d17, d18 @ idx2weight = y*(32-x)
+ vmull.u16 q9, d28, d2 @ idx1weight
+ vmull.u16 q8, d29, d3
vmull.u16 q14, d30, d2 @ idx0weight
vmull.u16 q15, d31, d3
- vrshrn.u32 d19, q9, #10 @ idx3weight
- vrshrn.u32 d18, q8, #10
- vrshrn.u32 d16, q10, #10 @ idx2weight
- vrshrn.u32 d17, q11, #10
- vrshrn.u32 d22, q12, #10 @ idx1weight
- vrshrn.u32 d23, q13, #10
- vrshrn.u32 d20, q14, #10 @ idx0weight
- vrshrn.u32 d21, q15, #10
- vzip.16 q10, q11
+ vrshrn.u32 d18, q9, #10 @ idx1weight
+ vrshrn.u32 d19, q8, #10
+ vrshrn.u32 d16, q14, #10 @ idx0weight
+ vrshrn.u32 d17, q15, #10
+ vmull.u16 q14, d24, d2 @ idx3weight
+ vmull.u16 q15, d25, d3
vzip.16 q8, q9
- vst1.16 {q10, q11}, [r3, :128]!
- vst1.16 {q8, q9}, [r3, :128]!
+ vmull.u16 q12, d26, d2 @ idx2weight
+ vmull.u16 q13, d27, d3
+ vst1.16 {q8, q9}, [r3, :128]!
+ vrshrn.u32 d19, q15, #10 @ idx3weight
+ vrshrn.u32 d18, q14, #10
+ vrshrn.u32 d16, q12, #10 @ idx2weight
+ vrshrn.u32 d17, q13, #10
+ vzip.16 q8, q9
+ vst1.16 {q8, q9}, [r3, :128]!
bge 8b
- pop {r4-r6}
- vpop {q4-q7}
bx lr
endfunc
--
2.5.0
More information about the x264-devel
mailing list