[x264-devel] [PATCH 22/24] RFC: arm: Implement x264_mbtree_propagate_{cost, list}_neon

Janne Grunau janne-x264 at jannau.net
Fri Aug 21 14:03:32 CEST 2015


On 2015-08-13 23:59:43 +0300, Martin Storsjö wrote:
> The cost function could be simplified to avoid having to clobber
> q4/q5, but this requires reordering instructions, which increases
> the total runtime.
> 
> The list function could avoid pushing q4-q7 to the stack by
> reusing registers and reloading the constants, but that also
> seems to actually make it slightly slower in practice.

if you don't have enough registers for an algorithm it is usually better
to use the callee-saved registers than to redo something in every loop
iteration.
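
as a toy illustration (untested, not from the patch; pretend the
caller-saved registers are all occupied so the constant has to live in a
callee-saved one, and the count in r2 is a multiple of 8):

    vpush           {q4}                    @ one-time spill of the callee-saved register
    vmov.u16        q4,  #0xc000            @ constant stays live across the whole loop
1:
    subs            r2,  r2,  #8
    vld1.16         {q8},  [r1]!
    vand            q8,  q8,  q4            @ no per-iteration reload of the constant
    vst1.16         {q8},  [r0]!
    bgt             1b
    vpop            {q4}
    bx              lr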

> checkasm timing       Cortex-A7      A8      A9
> mbtree_propagate_cost_c      63584   156719  62616
> mbtree_propagate_cost_neon   17316   10833   11290
> 
> mbtree_propagate_list_c      110894  108411  84324
> mbtree_propagate_list_neon   83313   78821   62271
> ---
>  common/arm/mc-a.S |  128 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  common/arm/mc-c.c |   86 +++++++++++++++++++++++++++++++++++
>  2 files changed, 214 insertions(+)
> 
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 6538dec..beae5bd 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -28,6 +28,11 @@
>  
>  #include "asm.S"
>  
> +.section .rodata
> +.align 4
> +pw_0to15:
> +.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
> +
>  .text
>  
>  // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
> @@ -1760,3 +1765,126 @@ function integral_init8v_neon
>  2:
>      bx              lr
>  endfunc
> +
> +function x264_mbtree_propagate_cost_neon
> +    vpush           {q4-q5}
> +    push            {r4-r6}

do the push {r4-r5, lr} first, before the vpush, and the function return
becomes pop {r4-r5, pc}
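
i.e. roughly this (untested sketch; the stack offsets assume the vpush and
the separate d6/d7 setup also go away, as suggested further down):

function x264_mbtree_propagate_cost_neon
    push            {r4-r5, lr}
    ldrd            r4, r5, [sp, #12]       @ the two remaining pointer args
    ldr             lr,  [sp, #20]          @ len, counted down in lr
    vld1.32         {d6[], d7[]},  [r5]
8:
    subs            lr,  lr,  #8
    @ ... loop body as in the patch ...
    bge             8b
    pop             {r4-r5, pc}             @ return straight from the pop
endfunc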

> +    ldrd            r4, r5, [sp, #44]
> +    ldr             r6, [sp, #52]
> +    vld1.32         {d6[]},  [r5]
> +    vmov            d7,  d6

vld1.32 {d6[], d7[]}, ...

> +8:
> +    subs            r6,  r6,  #8
> +    vld1.16         {q8},  [r1]!
> +    vld1.16         {q9},  [r2]!
> +    vld1.16         {q10}, [r3]!
> +    vld1.16         {q11}, [r4]!
> +    vbic.u16        q10, q10, #0xc000
> +    vmin.u16        q10, q9,  q10
> +    vmull.u16       q12, d18, d22           @ propagate_intra
> +    vmull.u16       q13, d19, d23           @ propagate_intra
> +    vsubl.u16       q14, d18, d20           @ propagate_num
> +    vsubl.u16       q15, d19, d21           @ propagate_num
> +    vmovl.u16       q10, d18                @ propagate_denom
> +    vmovl.u16       q11, d19                @ propagate_denom
> +    vmovl.u16       q9,  d17
> +    vmovl.u16       q8,  d16
> +    vcvt.f32.s32    q12, q12
> +    vcvt.f32.s32    q13, q13
> +    vcvt.f32.s32    q14, q14
> +    vcvt.f32.s32    q15, q15
> +    vcvt.f32.s32    q10, q10
> +    vcvt.f32.s32    q11, q11
> +    vrecpe.f32      q0,  q10
> +    vrecpe.f32      q1,  q11
> +    vcvt.f32.s32    q8,  q8
> +    vcvt.f32.s32    q9,  q9
> +    vrecps.f32      q4,  q0,  q10
> +    vrecps.f32      q5,  q1,  q11

you don't need q4/q5; q10/q11 can be used to hold the result
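
i.e. something like (untested; q10/q11 are dead after the vrecps, so they
can take the refinement factor instead of q4/q5):

    vrecps.f32      q10, q0,  q10           @ refinement factor overwrites the denominator
    vrecps.f32      q11, q1,  q11
    vmla.f32        q8,  q12, q3            @ propagate_amount
    vmla.f32        q9,  q13, q3            @ propagate_amount
    vmul.f32        q0,  q0,  q10           @ reciprocal * refinement, no q4/q5 needed
    vmul.f32        q1,  q1,  q11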

> +    vmla.f32        q8,  q12, q3            @ propagate_amount
> +    vmla.f32        q9,  q13, q3            @ propagate_amount
> +    vmul.f32        q0,  q0,  q4
> +    vmul.f32        q1,  q1,  q5
> +    vmul.f32        q8,  q8,  q14
> +    vmul.f32        q9,  q9,  q15
> +    vmul.f32        q0,  q8,  q0
> +    vmul.f32        q1,  q9,  q1
> +    vcvt.s32.f32    q0,  q0
> +    vcvt.s32.f32    q1,  q1
> +    vqmovn.s32      d0,  q0
> +    vqmovn.s32      d1,  q1
> +    vst1.16         {q0},  [r0]!
> +    bge             8b
> +    pop             {r4-r6}
> +    vpop            {q4-q5}
> +    bx              lr
> +endfunc
> +
> +function x264_mbtree_propagate_list_internal_neon
> +    vpush           {q4-q7}

q7 is not used

> +    push            {r4-r6}

same as above, although you might even get away with
vld2.16 {d12[], d13[]}, [sp]; vdup q8, d13[0]

> +    ldrd            r4, r5, [sp, #76]
> +    ldr             r6, [sp, #84]
> +    movrel          r12, pw_0to15
> +    vdup.16         d12, r4                 @ bipred_weight
> +    vmov.u16        q4,  #0xc000
> +    vld1.16         {q0},  [r12, :128]      @h->mb.i_mb_x,h->mb.i_mb_y
> +    vmov.u32        q5,  #4
> +    vmov.u16        q2,  #31
> +    vmov.u16        q3,  #32
> +    vdup.u16        q8,  r5                 @ mb_y
> +    vzip.u16        q0,  q8
> +8:
> +    subs            r6,  r6,  #8

you can use r12 if you reorder its use above, which means you won't need
to push any GPRs
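
e.g. (untested sketch; assumes bipred_weight and mb_y are pulled into NEON
registers straight from the stack as suggested above, so nothing is pushed
and len sits at [sp, #8]):

    movrel          r12, pw_0to15
    vld1.16         {q0},  [r12, :128]      @ r12 is dead once the table is loaded...
    ldr             r12, [sp, #8]           @ ...so reload it with len and use it as the counter
8:
    subs            r12, r12, #8
    @ ... loop body as in the patch ...
    bge             8b
    bx              lr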

> +    vld1.16         {q8},  [r1, :128]!      @ propagate_amount
> +    vld1.16         {q9},  [r2, :128]!      @ lowres_cost
> +    vand            q9,  q9,  q4
> +    vceq.u16        q1,  q9,  q4
> +    vmull.u16       q10, d16, d12
> +    vmull.u16       q11, d17, d12
> +    vrshrn.u32      d20, q10, #6
> +    vrshrn.u32      d21, q11, #6
> +    vbsl            q1,  q10, q8 @ if( lists_used == 3 )
> +    @               propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
> +    vld1.16         {q8, q9},  [r0, :128]!
> +    vshr.s16        q10, q8,  #5
> +    vshr.s16        q11, q9,  #5
> +    vadd.s16        q10, q10, q0
> +    vadd.s16        q0,  q0,  q5
> +    vadd.s16        q11, q11, q0
> +    vadd.s16        q0,  q0,  q5
> +    vst1.16         {q10, q11},  [r3, :128]!
> +    vand            q8,  q8,  q2
> +    vand            q9,  q9,  q2
> +    vuzp.16         q8,  q9                 @ x & 31, y & 31
> +    vsub.s16        q10, q3,  q8            @ 32 - (x & 31)
> +    vsub.s16        q11, q3,  q9            @ 32 - (y & 31)

I'll send a patch with a rewritten and rescheduled version which doesn't
use the stack. the speedup probably comes from the rescheduling though

> +    vmul.u16        q12, q8,  q9            @ idx3weight = y*x
> +    vmul.u16        q13, q10, q9            @ idx2weight = y*(32-x)
> +    vmul.u16        q14, q8,  q11           @ idx1weight = (32-y)*x
> +    vmul.u16        q15, q10, q11           @ idx0weight = (32-y)*(32-x)
> +    vmull.u16       q8,  d24, d2            @ idx3weight
> +    vmull.u16       q9,  d25, d3
> +    vmull.u16       q10, d26, d2            @ idx2weight
> +    vmull.u16       q11, d27, d3
> +    vmull.u16       q12, d28, d2            @ idx1weight
> +    vmull.u16       q13, d29, d3
> +    vmull.u16       q14, d30, d2            @ idx0weight
> +    vmull.u16       q15, d31, d3
> +    vrshrn.u32      d19, q9,  #10           @ idx3weight
> +    vrshrn.u32      d18, q8,  #10
> +    vrshrn.u32      d16, q10, #10           @ idx2weight
> +    vrshrn.u32      d17, q11, #10
> +    vrshrn.u32      d22, q12, #10           @ idx1weight
> +    vrshrn.u32      d23, q13, #10
> +    vrshrn.u32      d20, q14, #10           @ idx0weight
> +    vrshrn.u32      d21, q15, #10
> +    vzip.16         q10, q11
> +    vzip.16         q8,  q9
> +    vst1.16         {q10, q11},  [r3, :128]!
> +    vst1.16         {q8,  q9},   [r3, :128]!
> +    bge             8b
> +    pop             {r4-r6}
> +    vpop            {q4-q7}
> +    bx              lr
> +endfunc
> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
> index dd86fb2..eb582c2 100644
> --- a/common/arm/mc-c.c
> +++ b/common/arm/mc-c.c
> @@ -4,6 +4,7 @@
>   * Copyright (C) 2009-2015 x264 project
>   *
>   * Authors: David Conrad <lessen42 at gmail.com>
> + *          Janne Grunau <janne-x264 at jannau.net>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
>  void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
>  void integral_init8v_neon( uint16_t *, intptr_t );
>  
> +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
> +
>  #if !HIGH_BIT_DEPTH
>  static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
>  {
> @@ -226,6 +229,86 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
>  }
>  #endif // !HIGH_BIT_DEPTH
>  
> +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
> +#define CLIP_ADD2(s,x)\
> +do\
> +{\
> +    CLIP_ADD((s)[0], (x)[0]);\
> +    CLIP_ADD((s)[1], (x)[1]);\
> +} while(0)
> +
> +void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
> +                                               int16_t *propagate_amount,
> +                                               uint16_t *lowres_costs,
> +                                               int16_t *output,
> +                                               int bipred_weight, int mb_y,
> +                                               int len );
> +
> +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
> +                                             int16_t (*mvs)[2],
> +                                             int16_t *propagate_amount,
> +                                             uint16_t *lowres_costs,
> +                                             int bipred_weight, int mb_y,
> +                                             int len, int list )
> +{

I think we should reuse the x86 CPP macro for aarch64/arm. sorry for
being lazy when doing the arm64 version

Janne


