[x264-devel] [PATCH 1/1] arm: optimize mbtree_propagate_list_neon
Janne Grunau
janne-x264 at jannau.net
Fri Aug 21 14:04:14 CEST 2015
Avoid pushing anything to the stack and interleave the calculations. The
latter is probably responsible for the speedup.
Cortex-A9 (cycles):          GCC 4.8.4   neon (before)   neon (after)
mbtree_propagate_list:          106820           74059          72360
---
common/arm/mc-a.S | 105 ++++++++++++++++++++++++++----------------------------
1 file changed, 50 insertions(+), 55 deletions(-)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index beae5bd..8f74ef2 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1821,70 +1821,65 @@ function x264_mbtree_propagate_cost_neon
endfunc
function x264_mbtree_propagate_list_internal_neon
- vpush {q4-q7}
- push {r4-r6}
- ldrd r4, r5, [sp, #76]
- ldr r6, [sp, #84]
+ vld2.16 {d4[], d5[]}, [sp] @ bipred_weight, mb_y
movrel r12, pw_0to15
- vdup.16 d12, r4 @ bipred_weight
- vmov.u16 q4, #0xc000
+ vmov.u16 q10, #0xc000
vld1.16 {q0}, [r12, :128] @h->mb.i_mb_x,h->mb.i_mb_y
- vmov.u32 q5, #4
- vmov.u16 q2, #31
- vmov.u16 q3, #32
- vdup.u16 q8, r5 @ mb_y
+ vmov.u32 q11, #4
+ vmov.u8 q3, #32
+ vdup.u16 q8, d5[0] @ mb_y
vzip.u16 q0, q8
+ ldr r12, [sp, #8]
8:
- subs r6, r6, #8
- vld1.16 {q8}, [r1, :128]! @ propagate_amount
- vld1.16 {q9}, [r2, :128]! @ lowres_cost
- vand q9, q9, q4
- vceq.u16 q1, q9, q4
- vmull.u16 q10, d16, d12
- vmull.u16 q11, d17, d12
- vrshrn.u32 d20, q10, #6
- vrshrn.u32 d21, q11, #6
- vbsl q1, q10, q8 @ if( lists_used == 3 )
- @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ subs r12, r12, #8
+ vld1.16 {q14}, [r1, :128]! @ propagate_amount
+ vld1.16 {q15}, [r2, :128]! @ lowres_cost
vld1.16 {q8, q9}, [r0, :128]!
- vshr.s16 q10, q8, #5
- vshr.s16 q11, q9, #5
- vadd.s16 q10, q10, q0
- vadd.s16 q0, q0, q5
- vadd.s16 q11, q11, q0
- vadd.s16 q0, q0, q5
- vst1.16 {q10, q11}, [r3, :128]!
- vand q8, q8, q2
- vand q9, q9, q2
+ vand q15, q15, q10
+ vceq.u16 q1, q15, q10
+ vmull.u16 q12, d28, d4
+ vmull.u16 q13, d29, d4
+ vrshrn.u32 d30, q12, #6
+ vrshrn.u32 d31, q13, #6
+ vbsl q1, q15, q14 @ if( lists_used == 3 )
+ @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ vshr.s16 q12, q8, #5
+ vshr.s16 q13, q9, #5
vuzp.16 q8, q9 @ x & 31, y & 31
- vsub.s16 q10, q3, q8 @ 32 - (x & 31)
- vsub.s16 q11, q3, q9 @ 32 - (y & 31)
- vmul.u16 q12, q8, q9 @ idx3weight = y*x
- vmul.u16 q13, q10, q9 @ idx2weight = y*(32-x)
- vmul.u16 q14, q8, q11 @ idx1weight = (32-y)*x
- vmul.u16 q15, q10, q11 @ idx0weight = (32-y)*(32-x)
- vmull.u16 q8, d24, d2 @ idx3weight
- vmull.u16 q9, d25, d3
- vmull.u16 q10, d26, d2 @ idx2weight
- vmull.u16 q11, d27, d3
- vmull.u16 q12, d28, d2 @ idx1weight
- vmull.u16 q13, d29, d3
+ vadd.s16 q12, q12, q0
+ vadd.s16 q0, q0, q11
+ vmovn.i16 d16, q8
+ vmovn.i16 d17, q9
+ vadd.s16 q13, q13, q0
+ vbic.i16 q8, q8, #128+64+32
+ vadd.s16 q0, q0, q11
+ vbic.i16 q8, q8, #(128+64+32)<<8
+ vst1.16 {q12, q13}, [r3, :128]!
+ vsub.i8 q9, q3, q8
+ vmull.u8 q12, d17, d16 @ idx3weight = y*x
+ vmull.u8 q14, d19, d16 @ idx1weight = (32-y)*x
+ vmull.u8 q15, d19, d18 @ idx0weight = (32-y)*(32-x)
+ vmull.u8 q13, d17, d18 @ idx2weight = y*(32-x)
+ vmull.u16 q9, d28, d2 @ idx1weight
+ vmull.u16 q8, d29, d3
vmull.u16 q14, d30, d2 @ idx0weight
vmull.u16 q15, d31, d3
- vrshrn.u32 d19, q9, #10 @ idx3weight
- vrshrn.u32 d18, q8, #10
- vrshrn.u32 d16, q10, #10 @ idx2weight
- vrshrn.u32 d17, q11, #10
- vrshrn.u32 d22, q12, #10 @ idx1weight
- vrshrn.u32 d23, q13, #10
- vrshrn.u32 d20, q14, #10 @ idx0weight
- vrshrn.u32 d21, q15, #10
- vzip.16 q10, q11
+ vrshrn.u32 d18, q9, #10 @ idx1weight
+ vrshrn.u32 d19, q8, #10
+ vrshrn.u32 d16, q14, #10 @ idx0weight
+ vrshrn.u32 d17, q15, #10
+ vmull.u16 q14, d24, d2 @ idx3weight
+ vmull.u16 q15, d25, d3
vzip.16 q8, q9
- vst1.16 {q10, q11}, [r3, :128]!
- vst1.16 {q8, q9}, [r3, :128]!
+ vmull.u16 q12, d26, d2 @ idx2weight
+ vmull.u16 q13, d27, d3
+ vst1.16 {q8, q9}, [r3, :128]!
+ vrshrn.u32 d19, q15, #10 @ idx3weight
+ vrshrn.u32 d18, q14, #10
+ vrshrn.u32 d16, q12, #10 @ idx2weight
+ vrshrn.u32 d17, q13, #10
+ vzip.16 q8, q9
+ vst1.16 {q8, q9}, [r3, :128]!
bge 8b
- pop {r4-r6}
- vpop {q4-q7}
bx lr
endfunc
--
2.5.0
More information about the x264-devel
mailing list