[x264-devel] [PATCH 22/24] RFC: arm: Implement x264_mbtree_propagate_{cost, list}_neon
Janne Grunau
janne-x264 at jannau.net
Fri Aug 21 14:03:32 CEST 2015
On 2015-08-13 23:59:43 +0300, Martin Storsjö wrote:
> The cost function could be simplified to avoid having to clobber
> q4/q5, but this requires reordering instructions, which increases
> the total runtime.
>
> The list function could avoid pushing q4-q7 to the stack by
> reusing registers and reloading the constants, but in practice
> that also turns out to be slightly slower.
If you don't have enough registers for an algorithm, it is usually better
to spill to the callee-saved registers once than to redo work in every
loop iteration (see the prologue/epilogue sketch below).
> checkasm timing             Cortex-A7      A8     A9
> mbtree_propagate_cost_c         63584  156719  62616
> mbtree_propagate_cost_neon      17316   10833  11290
>
> mbtree_propagate_list_c        110894  108411  84324
> mbtree_propagate_list_neon      83313   78821  62271
> ---
> common/arm/mc-a.S | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/mc-c.c | 86 +++++++++++++++++++++++++++++++++++
> 2 files changed, 214 insertions(+)
>
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 6538dec..beae5bd 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -28,6 +28,11 @@
>
> #include "asm.S"
>
> +.section .rodata
> +.align 4
> +pw_0to15:
> +.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
> +
> .text
>
> // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
> @@ -1760,3 +1765,126 @@ function integral_init8v_neon
> 2:
> bx lr
> endfunc
> +
> +function x264_mbtree_propagate_cost_neon
> + vpush {q4-q5}
> + push {r4-r6}
Do the push {r4-r5, lr} before the vpush, and the function return becomes
pop {r4-r5, pc} after the vpop.
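A sketch of that prologue/epilogue (untested; assuming r6's job as loop
counter moves into the now-saved lr — note the stack offsets stay the same,
since 12 + 32 = 44 bytes either way):

    @ untested sketch, lr doubles as the loop counter once it is saved
    push            {r4-r5, lr}
    vpush           {q4-q5}
    ldrd            r4, r5, [sp, #44]
    ldr             lr, [sp, #52]       @ len, counted down in lr
    ...
8:
    subs            lr, lr, #8
    ...
    bge             8b
    vpop            {q4-q5}
    pop             {r4-r5, pc}         @ restore and return in one instruction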
> + ldrd r4, r5, [sp, #44]
> + ldr r6, [sp, #52]
> + vld1.32 {d6[]}, [r5]
> + vmov d7, d6
vld1.32 {d6[], d7[]}, ... loads the scalar into both halves of q3 in one
instruction and drops the vmov.
> +8:
> + subs r6, r6, #8
> + vld1.16 {q8}, [r1]!
> + vld1.16 {q9}, [r2]!
> + vld1.16 {q10}, [r3]!
> + vld1.16 {q11}, [r4]!
> + vbic.u16 q10, q10, #0xc000
> + vmin.u16 q10, q9, q10
> + vmull.u16 q12, d18, d22 @ propagate_intra
> + vmull.u16 q13, d19, d23 @ propagate_intra
> + vsubl.u16 q14, d18, d20 @ propagate_num
> + vsubl.u16 q15, d19, d21 @ propagate_num
> + vmovl.u16 q10, d18 @ propagate_denom
> + vmovl.u16 q11, d19 @ propagate_denom
> + vmovl.u16 q9, d17
> + vmovl.u16 q8, d16
> + vcvt.f32.s32 q12, q12
> + vcvt.f32.s32 q13, q13
> + vcvt.f32.s32 q14, q14
> + vcvt.f32.s32 q15, q15
> + vcvt.f32.s32 q10, q10
> + vcvt.f32.s32 q11, q11
> + vrecpe.f32 q0, q10
> + vrecpe.f32 q1, q11
> + vcvt.f32.s32 q8, q8
> + vcvt.f32.s32 q9, q9
> + vrecps.f32 q4, q0, q10
> + vrecps.f32 q5, q1, q11
You don't need q4/q5; q10/q11 can be used to hold the result.
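Untested sketch: the float denominators in q10/q11 are dead after the
vrecps, so the reciprocal refinement can overwrite them in place:

    @ sketch only; q10/q11 (converted denominators) are not read again
    vrecps.f32      q10, q0, q10        @ Newton-Raphson step clobbers denom
    vrecps.f32      q11, q1, q11
    ...
    vmul.f32        q0, q0, q10         @ refined 1/denom estimate
    vmul.f32        q1, q1, q11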
> + vmla.f32 q8, q12, q3 @ propagate_amount
> + vmla.f32 q9, q13, q3 @ propagate_amount
> + vmul.f32 q0, q0, q4
> + vmul.f32 q1, q1, q5
> + vmul.f32 q8, q8, q14
> + vmul.f32 q9, q9, q15
> + vmul.f32 q0, q8, q0
> + vmul.f32 q1, q9, q1
> + vcvt.s32.f32 q0, q0
> + vcvt.s32.f32 q1, q1
> + vqmovn.s32 d0, q0
> + vqmovn.s32 d1, q1
> + vst1.16 {q0}, [r0]!
> + bge 8b
> + pop {r4-r6}
> + vpop {q4-q5}
> + bx lr
> +endfunc
> +
> +function x264_mbtree_propagate_list_internal_neon
> + vpush {q4-q7}
q7 is not used; vpush {q4-q6} would be enough.
> + push {r4-r6}
Same as above for the GPR push, although you might even get away with
vld2.16 {d12[], d13[]}, [sp]; vdup.16 q8, d13[0]
> + ldrd r4, r5, [sp, #76]
> + ldr r6, [sp, #84]
> + movrel r12, pw_0to15
> + vdup.16 d12, r4 @ bipred_weight
> + vmov.u16 q4, #0xc000
> + vld1.16 {q0}, [r12, :128] @h->mb.i_mb_x,h->mb.i_mb_y
> + vmov.u32 q5, #4
> + vmov.u16 q2, #31
> + vmov.u16 q3, #32
> + vdup.u16 q8, r5 @ mb_y
> + vzip.u16 q0, q8
> +8:
> + subs r6, r6, #8
You can use r12 if you reorder its use above, which means you won't need
to push any GPRs.
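One possible ordering (untested sketch; with vpush {q4-q6} and no GPR push
the stack arguments land at sp+48/52/56): r12 first holds the table
address, then each scalar argument in turn, and finally the loop counter:

    @ sketch only; each use of r12 finishes before the next begins
    vpush           {q4-q6}
    movrel          r12, pw_0to15
    vld1.16         {q0}, [r12, :128]
    ldr             r12, [sp, #48]
    vdup.16         d12, r12            @ bipred_weight
    ldr             r12, [sp, #52]
    vdup.16         q8, r12             @ mb_y
    ldr             r12, [sp, #56]      @ len; r12 is the loop counter from here
    ...
8:
    subs            r12, r12, #8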
> + vld1.16 {q8}, [r1, :128]! @ propagate_amount
> + vld1.16 {q9}, [r2, :128]! @ lowres_cost
> + vand q9, q9, q4
> + vceq.u16 q1, q9, q4
> + vmull.u16 q10, d16, d12
> + vmull.u16 q11, d17, d12
> + vrshrn.u32 d20, q10, #6
> + vrshrn.u32 d21, q11, #6
> + vbsl q1, q10, q8 @ if( lists_used == 3 )
> + @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
> + vld1.16 {q8, q9}, [r0, :128]!
> + vshr.s16 q10, q8, #5
> + vshr.s16 q11, q9, #5
> + vadd.s16 q10, q10, q0
> + vadd.s16 q0, q0, q5
> + vadd.s16 q11, q11, q0
> + vadd.s16 q0, q0, q5
> + vst1.16 {q10, q11}, [r3, :128]!
> + vand q8, q8, q2
> + vand q9, q9, q2
> + vuzp.16 q8, q9 @ x & 31, y & 31
> + vsub.s16 q10, q3, q8 @ 32 - (x & 31)
> + vsub.s16 q11, q3, q9 @ 32 - (y & 31)
I'll send a patch with a rewritten and rescheduled version which doesn't
use the stack; the speedup probably comes from the rescheduling, though.
> + vmul.u16 q12, q8, q9 @ idx3weight = y*x
> + vmul.u16 q13, q10, q9 @ idx2weight = y*(32-x)
> + vmul.u16 q14, q8, q11 @ idx1weight = (32-y)*x
> + vmul.u16 q15, q10, q11 @ idx0weight = (32-y)*(32-x)
> + vmull.u16 q8, d24, d2 @ idx3weight
> + vmull.u16 q9, d25, d3
> + vmull.u16 q10, d26, d2 @ idx2weight
> + vmull.u16 q11, d27, d3
> + vmull.u16 q12, d28, d2 @ idx1weight
> + vmull.u16 q13, d29, d3
> + vmull.u16 q14, d30, d2 @ idx0weight
> + vmull.u16 q15, d31, d3
> + vrshrn.u32 d19, q9, #10 @ idx3weight
> + vrshrn.u32 d18, q8, #10
> + vrshrn.u32 d16, q10, #10 @ idx2weight
> + vrshrn.u32 d17, q11, #10
> + vrshrn.u32 d22, q12, #10 @ idx1weight
> + vrshrn.u32 d23, q13, #10
> + vrshrn.u32 d20, q14, #10 @ idx0weight
> + vrshrn.u32 d21, q15, #10
> + vzip.16 q10, q11
> + vzip.16 q8, q9
> + vst1.16 {q10, q11}, [r3, :128]!
> + vst1.16 {q8, q9}, [r3, :128]!
> + bge 8b
> + pop {r4-r6}
> + vpop {q4-q7}
> + bx lr
> +endfunc
> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
> index dd86fb2..eb582c2 100644
> --- a/common/arm/mc-c.c
> +++ b/common/arm/mc-c.c
> @@ -4,6 +4,7 @@
> * Copyright (C) 2009-2015 x264 project
> *
> * Authors: David Conrad <lessen42 at gmail.com>
> + * Janne Grunau <janne-x264 at jannau.net>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
> void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
> void integral_init8v_neon( uint16_t *, intptr_t );
>
> +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
> +
> #if !HIGH_BIT_DEPTH
> static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
> {
> @@ -226,6 +229,86 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
> }
> #endif // !HIGH_BIT_DEPTH
>
> +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
> +#define CLIP_ADD2(s,x)\
> +do\
> +{\
> + CLIP_ADD((s)[0], (x)[0]);\
> + CLIP_ADD((s)[1], (x)[1]);\
> +} while(0)
> +
> +void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
> + int16_t *propagate_amount,
> + uint16_t *lowres_costs,
> + int16_t *output,
> + int bipred_weight, int mb_y,
> + int len );
> +
> +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
> + int16_t (*mvs)[2],
> + int16_t *propagate_amount,
> + uint16_t *lowres_costs,
> + int bipred_weight, int mb_y,
> + int len, int list )
> +{
I think we should reuse the x86 CPP macro for aarch64/arm, something like
the sketch below. Sorry for being lazy when doing the arm64 version.
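Roughly along these lines, i.e. hoist the wrapper into a shared macro so
each port only provides the asm internal. This is a sketch with
illustrative names only; the real change would lift the existing x86
wrapper body verbatim, and it assumes the wrapper keeps using
h->scratch_buffer2 like the current ones do:

/* sketch: shared wrapper macro so arm/aarch64/x86 only differ in the
 * internal asm function they declare; names here are illustrative */
#define MBTREE_PROPAGATE_LIST(cpu)\
void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2],\
    int16_t *propagate_amount, uint16_t *lowres_costs, int16_t *output,\
    int bipred_weight, int mb_y, int len );\
\
static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs,\
    int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,\
    int bipred_weight, int mb_y, int len, int list )\
{\
    int16_t *current = h->scratch_buffer2;\
\
    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount,\
        lowres_costs, current, bipred_weight, mb_y, len );\
    /* the CLIP_ADD2 scatter of 'current' into ref_costs, identical in\
       every port, goes here */\
}

common/arm/mc-c.c would then just invoke MBTREE_PROPAGATE_LIST(neon).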
Janne