[x264-devel] [PATCH 22/24] RFC: arm: Implement x264_mbtree_propagate_{cost, list}_neon
Martin Storsjö
martin at martin.st
Mon Aug 24 22:10:17 CEST 2015
On Fri, 21 Aug 2015, Janne Grunau wrote:
> On 2015-08-13 23:59:43 +0300, Martin Storsjö wrote:
>> The cost function could be simplified to avoid having to clobber
>> q4/q5, but this requires reordering instructions, which increases
>> the total runtime.
>>
>> The list function could avoid pushing q4-q7 to the stack by
>> reusing registers and reloading the constants, but that also
>> seems to make it slightly slower in practice.
>
> if you don't have enough registers for an algorithm, it is usually better
> to use the callee-saved registers than to redo something in every loop iteration.
>
>> checkasm timing             Cortex-A7      A8      A9
>> mbtree_propagate_cost_c         63584  156719   62616
>> mbtree_propagate_cost_neon      17316   10833   11290
>>
>> mbtree_propagate_list_c        110894  108411   84324
>> mbtree_propagate_list_neon      83313   78821   62271
>> ---
>> common/arm/mc-a.S | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>> common/arm/mc-c.c | 86 +++++++++++++++++++++++++++++++++++
>> 2 files changed, 214 insertions(+)
>>
>> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
>> index 6538dec..beae5bd 100644
>> --- a/common/arm/mc-a.S
>> +++ b/common/arm/mc-a.S
>> @@ -28,6 +28,11 @@
>>
>> #include "asm.S"
>>
>> +.section .rodata
>> +.align 4
>> +pw_0to15:
>> +.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
>> +
>> .text
>>
>> // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
>> @@ -1760,3 +1765,126 @@ function integral_init8v_neon
>> 2:
>> bx lr
>> endfunc
>> +
>> +function x264_mbtree_propagate_cost_neon
>> + vpush {q4-q5}
>> + push {r4-r6}
>
> push {r4-r5, lr} first, before the vpush, and the function return becomes
> pop {r4-r5, pc}
>
>> + ldrd r4, r5, [sp, #44]
>> + ldr r6, [sp, #52]
>> + vld1.32 {d6[]}, [r5]
>> + vmov d7, d6
>
> vld1.32 {d6[], d7[]}, ...
>
>> +8:
>> + subs r6, r6, #8
>> + vld1.16 {q8}, [r1]!
>> + vld1.16 {q9}, [r2]!
>> + vld1.16 {q10}, [r3]!
>> + vld1.16 {q11}, [r4]!
>> + vbic.u16 q10, q10, #0xc000
>> + vmin.u16 q10, q9, q10
>> + vmull.u16 q12, d18, d22 @ propagate_intra
>> + vmull.u16 q13, d19, d23 @ propagate_intra
>> + vsubl.u16 q14, d18, d20 @ propagate_num
>> + vsubl.u16 q15, d19, d21 @ propagate_num
>> + vmovl.u16 q10, d18 @ propagate_denom
>> + vmovl.u16 q11, d19 @ propagate_denom
>> + vmovl.u16 q9, d17
>> + vmovl.u16 q8, d16
>> + vcvt.f32.s32 q12, q12
>> + vcvt.f32.s32 q13, q13
>> + vcvt.f32.s32 q14, q14
>> + vcvt.f32.s32 q15, q15
>> + vcvt.f32.s32 q10, q10
>> + vcvt.f32.s32 q11, q11
>> + vrecpe.f32 q0, q10
>> + vrecpe.f32 q1, q11
>> + vcvt.f32.s32 q8, q8
>> + vcvt.f32.s32 q9, q9
>> + vrecps.f32 q4, q0, q10
>> + vrecps.f32 q5, q1, q11
>
> you don't need q4,q5. q10,q11 can be used to hold the result
Thanks, fixed these locally.
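(For anyone following along: the vrecpe/vrecps pair is just a hardware
reciprocal estimate refined by one Newton-Raphson step. A minimal scalar
sketch of that math; the starting estimate here is only a stand-in for
whatever vrecpe actually returns:)

    /* One Newton-Raphson step for 1/d: e' = e * (2 - d*e).
     * VRECPE supplies the initial estimate e; VRECPS computes (2 - d*e). */
    static float recip_refine( float d, float e )
    {
        float step = 2.0f - d * e;   /* what vrecps.f32 produces */
        return e * step;             /* refined reciprocal       */
    }
    /* e.g. d = 3.0 with a rough estimate e = 0.3:
     * step = 2 - 3*0.3 = 1.1, e*step = 0.33, much closer to 1/3 */

The asm does a single refinement step like this before the final multiply
and the conversion back to integers.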
>> + vmla.f32 q8, q12, q3 @ propagate_amount
>> + vmla.f32 q9, q13, q3 @ propagate_amount
>> + vmul.f32 q0, q0, q4
>> + vmul.f32 q1, q1, q5
>> + vmul.f32 q8, q8, q14
>> + vmul.f32 q9, q9, q15
>> + vmul.f32 q0, q8, q0
>> + vmul.f32 q1, q9, q1
>> + vcvt.s32.f32 q0, q0
>> + vcvt.s32.f32 q1, q1
>> + vqmovn.s32 d0, q0
>> + vqmovn.s32 d1, q1
>> + vst1.16 {q0}, [r0]!
>> + bge 8b
>> + pop {r4-r6}
>> + vpop {q4-q5}
>> + bx lr
>> +endfunc
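(For reference, a scalar sketch of what this loop computes, reconstructed
from the asm above rather than copied from the C reference; the names are
illustrative, and the NEON code truncates via vcvt and saturates via
vqmovn instead of the plain casts used here. It also assumes intra_cost
is nonzero, since the reciprocal of the denominator is taken.)

    static void mbtree_propagate_cost_sketch( int16_t *dst, const uint16_t *propagate_in,
                                              const uint16_t *intra_costs,
                                              const uint16_t *inter_costs,
                                              const uint16_t *inv_qscales,
                                              const float *fps_factor, int len )
    {
        float fps = *fps_factor;                          /* vld1.32 {d6[]} */
        for( int i = 0; i < len; i++ )
        {
            int intra_cost = intra_costs[i];
            int inter_cost = inter_costs[i] & 0x3fff;     /* vbic #0xc000   */
            if( inter_cost > intra_cost )
                inter_cost = intra_cost;                  /* vmin.u16       */
            float propagate_intra  = (float)intra_cost * inv_qscales[i];       /* vmull.u16 */
            float propagate_num    = (float)( intra_cost - inter_cost );       /* vsubl.u16 */
            float propagate_denom  = (float)intra_cost;
            float propagate_amount = (float)propagate_in[i] + propagate_intra * fps; /* vmla */
            dst[i] = (int16_t)( propagate_amount * propagate_num / propagate_denom );
        }
    }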
>> +
>> +function x264_mbtree_propagate_list_internal_neon
>> + vpush {q4-q7}
>
> q7 is not used
>
>> + push {r4-r6}
>
> same as above although you might even get away with
> vld2.16 {d12[], d13[]}, [sp]; vdup q8, d13[0]
>
>> + ldrd r4, r5, [sp, #76]
>> + ldr r6, [sp, #84]
>> + movrel r12, pw_0to15
>> + vdup.16 d12, r4 @ bipred_weight
>> + vmov.u16 q4, #0xc000
>> + vld1.16 {q0}, [r12, :128] @h->mb.i_mb_x,h->mb.i_mb_y
>> + vmov.u32 q5, #4
>> + vmov.u16 q2, #31
>> + vmov.u16 q3, #32
>> + vdup.u16 q8, r5 @ mb_y
>> + vzip.u16 q0, q8
>> +8:
>> + subs r6, r6, #8
>
> you can use r12 if you reorder its use above, which means you won't need
> to push any GPRs
>
>> + vld1.16 {q8}, [r1, :128]! @ propagate_amount
>> + vld1.16 {q9}, [r2, :128]! @ lowres_cost
>> + vand q9, q9, q4
>> + vceq.u16 q1, q9, q4
>> + vmull.u16 q10, d16, d12
>> + vmull.u16 q11, d17, d12
>> + vrshrn.u32 d20, q10, #6
>> + vrshrn.u32 d21, q11, #6
>> + vbsl q1, q10, q8 @ if( lists_used == 3 )
>> + @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
>> + vld1.16 {q8, q9}, [r0, :128]!
>> + vshr.s16 q10, q8, #5
>> + vshr.s16 q11, q9, #5
>> + vadd.s16 q10, q10, q0
>> + vadd.s16 q0, q0, q5
>> + vadd.s16 q11, q11, q0
>> + vadd.s16 q0, q0, q5
>> + vst1.16 {q10, q11}, [r3, :128]!
>> + vand q8, q8, q2
>> + vand q9, q9, q2
>> + vuzp.16 q8, q9 @ x & 31, y & 31
>> + vsub.s16 q10, q3, q8 @ 32 - (x & 31)
>> + vsub.s16 q11, q3, q9 @ 32 - (y & 31)
>
> I'll send a patch with a rewritten and rescheduled version which doesn't
> use the stack. The speedup probably comes from the rescheduling, though.
Thanks, that version looks good, I'll squash it.
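(In case it helps review: the per-MV arithmetic the loop implements, in
scalar C. This is a sketch reconstructed from the asm comments, not the
C reference, so the helper name and parameter list are illustrative.)

    /* Bipred weighting followed by the bilinear split of propagate_amount
     * across the four macroblocks the MV overlaps. */
    static void propagate_list_step( int mvx, int mvy, int amount, int lowres_cost,
                                     int bipred_weight, int16_t out[4] )
    {
        /* lists_used == 3 (both lists) is flagged in the top bits of lowres_cost */
        if( ( lowres_cost & 0xc000 ) == 0xc000 )
            amount = ( amount * bipred_weight + 32 ) >> 6;   /* vrshrn #6 */

        int x = mvx & 31, y = mvy & 31;           /* position inside the MB, 1/32 units */
        int idx0weight = ( 32 - y ) * ( 32 - x ); /* top-left     */
        int idx1weight = ( 32 - y ) * x;          /* top-right    */
        int idx2weight = y * ( 32 - x );          /* bottom-left  */
        int idx3weight = y * x;                   /* bottom-right */

        /* the four weights sum to 1024, so the +512 >> 10 rounds while
         * preserving the total propagate_amount */
        out[0] = ( idx0weight * amount + 512 ) >> 10;
        out[1] = ( idx1weight * amount + 512 ) >> 10;
        out[2] = ( idx2weight * amount + 512 ) >> 10;
        out[3] = ( idx3weight * amount + 512 ) >> 10;
    }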
>> + vmul.u16 q12, q8, q9 @ idx3weight = y*x
>> + vmul.u16 q13, q10, q9 @ idx2weight = y*(32-x)
>> + vmul.u16 q14, q8, q11 @ idx1weight = (32-y)*x
>> + vmul.u16 q15, q10, q11 @ idx0weight = (32-y)*(32-x)
>> + vmull.u16 q8, d24, d2 @ idx3weight
>> + vmull.u16 q9, d25, d3
>> + vmull.u16 q10, d26, d2 @ idx2weight
>> + vmull.u16 q11, d27, d3
>> + vmull.u16 q12, d28, d2 @ idx1weight
>> + vmull.u16 q13, d29, d3
>> + vmull.u16 q14, d30, d2 @ idx0weight
>> + vmull.u16 q15, d31, d3
>> + vrshrn.u32 d19, q9, #10 @ idx3weight
>> + vrshrn.u32 d18, q8, #10
>> + vrshrn.u32 d16, q10, #10 @ idx2weight
>> + vrshrn.u32 d17, q11, #10
>> + vrshrn.u32 d22, q12, #10 @ idx1weight
>> + vrshrn.u32 d23, q13, #10
>> + vrshrn.u32 d20, q14, #10 @ idx0weight
>> + vrshrn.u32 d21, q15, #10
>> + vzip.16 q10, q11
>> + vzip.16 q8, q9
>> + vst1.16 {q10, q11}, [r3, :128]!
>> + vst1.16 {q8, q9}, [r3, :128]!
>> + bge 8b
>> + pop {r4-r6}
>> + vpop {q4-q7}
>> + bx lr
>> +endfunc
>> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
>> index dd86fb2..eb582c2 100644
>> --- a/common/arm/mc-c.c
>> +++ b/common/arm/mc-c.c
>> @@ -4,6 +4,7 @@
>> * Copyright (C) 2009-2015 x264 project
>> *
>> * Authors: David Conrad <lessen42 at gmail.com>
>> + * Janne Grunau <janne-x264 at jannau.net>
>> *
>> * This program is free software; you can redistribute it and/or modify
>> * it under the terms of the GNU General Public License as published by
>> @@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
>> void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
>> void integral_init8v_neon( uint16_t *, intptr_t );
>>
>> +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
>> +
>> #if !HIGH_BIT_DEPTH
>> static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
>> {
>> @@ -226,6 +229,86 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
>> }
>> #endif // !HIGH_BIT_DEPTH
>>
>> +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
>> +#define CLIP_ADD2(s,x)\
>> +do\
>> +{\
>> + CLIP_ADD((s)[0], (x)[0]);\
>> + CLIP_ADD((s)[1], (x)[1]);\
>> +} while(0)
>> +
>> +void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
>> + int16_t *propagate_amount,
>> + uint16_t *lowres_costs,
>> + int16_t *output,
>> + int bipred_weight, int mb_y,
>> + int len );
>> +
>> +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
>> + int16_t (*mvs)[2],
>> + int16_t *propagate_amount,
>> + uint16_t *lowres_costs,
>> + int bipred_weight, int mb_y,
>> + int len, int list )
>> +{
>
> I think we should reuse the x86 CPP macro for aarch64/arm. Sorry for
> being lazy when doing the arm64 version
Ok, I'll try to factor this out.
// Martin