[x264-devel] [PATCH 22/24] RFC: arm: Implement x264_mbtree_propagate_{cost, list}_neon

Martin Storsjö martin at martin.st
Mon Aug 24 22:10:17 CEST 2015


On Fri, 21 Aug 2015, Janne Grunau wrote:

> On 2015-08-13 23:59:43 +0300, Martin Storsjö wrote:
>> The cost function could be simplified to avoid having to clobber
>> q4/q5, but this requires reordering instructions, which increases
>> the total runtime.
>>
>> The list function could avoid pushing q4-q7 to the stack by
>> reusing registers and reloading the constants, but that also
>> seems to actually make it slightly slower in practice.
>
> if you don't have enough registers for an algorithm it is usually better
> to use the callee-saved registers than to redo something every loop.
>
>> checkasm timing       Cortex-A7      A8      A9
>> mbtree_propagate_cost_c      63584   156719  62616
>> mbtree_propagate_cost_neon   17316   10833   11290
>>
>> mbtree_propagate_list_c      110894  108411  84324
>> mbtree_propagate_list_neon   83313   78821   62271
>> ---
>>  common/arm/mc-a.S |  128 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  common/arm/mc-c.c |   86 +++++++++++++++++++++++++++++++++++
>>  2 files changed, 214 insertions(+)
>>
>> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
>> index 6538dec..beae5bd 100644
>> --- a/common/arm/mc-a.S
>> +++ b/common/arm/mc-a.S
>> @@ -28,6 +28,11 @@
>>
>>  #include "asm.S"
>>
>> +.section .rodata
>> +.align 4
>> +pw_0to15:
>> +.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
>> +
>>  .text
>>
>>  // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
>> @@ -1760,3 +1765,126 @@ function integral_init8v_neon
>>  2:
>>      bx              lr
>>  endfunc
>> +
>> +function x264_mbtree_propagate_cost_neon
>> +    vpush           {q4-q5}
>> +    push            {r4-r6}
>
> push {r4-r5, lr} first, before the vpush, and the function return becomes
> pop {r4-r5, pc}
>
>> +    ldrd            r4, r5, [sp, #44]
>> +    ldr             r6, [sp, #52]
>> +    vld1.32         {d6[]},  [r5]
>> +    vmov            d7,  d6
>
> vld1.32 {d6[], d7[]}, ...
>
>> +8:
>> +    subs            r6,  r6,  #8
>> +    vld1.16         {q8},  [r1]!
>> +    vld1.16         {q9},  [r2]!
>> +    vld1.16         {q10}, [r3]!
>> +    vld1.16         {q11}, [r4]!
>> +    vbic.u16        q10, q10, #0xc000
>> +    vmin.u16        q10, q9,  q10
>> +    vmull.u16       q12, d18, d22           @ propagate_intra
>> +    vmull.u16       q13, d19, d23           @ propagate_intra
>> +    vsubl.u16       q14, d18, d20           @ propagate_num
>> +    vsubl.u16       q15, d19, d21           @ propagate_num
>> +    vmovl.u16       q10, d18                @ propagate_denom
>> +    vmovl.u16       q11, d19                @ propagate_denom
>> +    vmovl.u16       q9,  d17
>> +    vmovl.u16       q8,  d16
>> +    vcvt.f32.s32    q12, q12
>> +    vcvt.f32.s32    q13, q13
>> +    vcvt.f32.s32    q14, q14
>> +    vcvt.f32.s32    q15, q15
>> +    vcvt.f32.s32    q10, q10
>> +    vcvt.f32.s32    q11, q11
>> +    vrecpe.f32      q0,  q10
>> +    vrecpe.f32      q1,  q11
>> +    vcvt.f32.s32    q8,  q8
>> +    vcvt.f32.s32    q9,  q9
>> +    vrecps.f32      q4,  q0,  q10
>> +    vrecps.f32      q5,  q1,  q11
>
> you don't need q4,q5. q10,q11 can be used to hold the result

Thanks, fixed these locally.
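
For reference, with both suggestions applied, the prologue/epilogue and the
reciprocal step would look roughly like this (sketch only, not necessarily
the exact code I'll end up pushing):

function x264_mbtree_propagate_cost_neon
    push            {r4-r5, lr}             @ no vpush needed once q4/q5 are unused
    ldrd            r4, r5, [sp, #12]       @ inv_qscales, fps_factor
    ldr             lr, [sp, #20]           @ len
    vld1.32         {d6[], d7[]}, [r5]      @ fps_factor broadcast to both halves of q3
8:
    subs            lr,  lr,  #8
    @ ... loop body as before, except the vrecps results go to q10/q11,
    @ which are dead at that point anyway:
    vrecps.f32      q10, q0,  q10
    vrecps.f32      q11, q1,  q11
    @ ...
    vmul.f32        q0,  q0,  q10
    vmul.f32        q1,  q1,  q11
    @ ...
    bge             8b
    pop             {r4-r5, pc}
endfunc
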

>> +    vmla.f32        q8,  q12, q3            @ propagate_amount
>> +    vmla.f32        q9,  q13, q3            @ propagate_amount
>> +    vmul.f32        q0,  q0,  q4
>> +    vmul.f32        q1,  q1,  q5
>> +    vmul.f32        q8,  q8,  q14
>> +    vmul.f32        q9,  q9,  q15
>> +    vmul.f32        q0,  q8,  q0
>> +    vmul.f32        q1,  q9,  q1
>> +    vcvt.s32.f32    q0,  q0
>> +    vcvt.s32.f32    q1,  q1
>> +    vqmovn.s32      d0,  q0
>> +    vqmovn.s32      d1,  q1
>> +    vst1.16         {q0},  [r0]!
>> +    bge             8b
>> +    pop             {r4-r6}
>> +    vpop            {q4-q5}
>> +    bx              lr
>> +endfunc
>> +
>> +function x264_mbtree_propagate_list_internal_neon
>> +    vpush           {q4-q7}
>
> q7 is not used
>
>> +    push            {r4-r6}
>
> same as above, although you might even get away with
> vld2.16 {d12[], d13[]}, [sp]; vdup q8, d13[0]
>
>> +    ldrd            r4, r5, [sp, #76]
>> +    ldr             r6, [sp, #84]
>> +    movrel          r12, pw_0to15
>> +    vdup.16         d12, r4                 @ bipred_weight
>> +    vmov.u16        q4,  #0xc000
>> +    vld1.16         {q0},  [r12, :128]      @h->mb.i_mb_x,h->mb.i_mb_y
>> +    vmov.u32        q5,  #4
>> +    vmov.u16        q2,  #31
>> +    vmov.u16        q3,  #32
>> +    vdup.u16        q8,  r5                 @ mb_y
>> +    vzip.u16        q0,  q8
>> +8:
>> +    subs            r6,  r6,  #8
>
> you can use r12 if you reorder the uses above, which means you won't need
> to push any GPRs
>
>> +    vld1.16         {q8},  [r1, :128]!      @ propagate_amount
>> +    vld1.16         {q9},  [r2, :128]!      @ lowres_cost
>> +    vand            q9,  q9,  q4
>> +    vceq.u16        q1,  q9,  q4
>> +    vmull.u16       q10, d16, d12
>> +    vmull.u16       q11, d17, d12
>> +    vrshrn.u32      d20, q10, #6
>> +    vrshrn.u32      d21, q11, #6
>> +    vbsl            q1,  q10, q8 @ if( lists_used == 3 )
>> +    @               propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
>> +    vld1.16         {q8, q9},  [r0, :128]!
>> +    vshr.s16        q10, q8,  #5
>> +    vshr.s16        q11, q9,  #5
>> +    vadd.s16        q10, q10, q0
>> +    vadd.s16        q0,  q0,  q5
>> +    vadd.s16        q11, q11, q0
>> +    vadd.s16        q0,  q0,  q5
>> +    vst1.16         {q10, q11},  [r3, :128]!
>> +    vand            q8,  q8,  q2
>> +    vand            q9,  q9,  q2
>> +    vuzp.16         q8,  q9                 @ x & 31, y & 31
>> +    vsub.s16        q10, q3,  q8            @ 32 - (x & 31)
>> +    vsub.s16        q11, q3,  q9            @ 32 - (y & 31)
>
> I'll send a patch with a rewritten and rescheduled version which doesn't
> use the stack. The speedup probably comes from the rescheduling, though.

Thanks, that version looks good, I'll squash it.

>> +    vmul.u16        q12, q8,  q9            @ idx3weight = y*x
>> +    vmul.u16        q13, q10, q9            @ idx2weight = y*(32-x)
>> +    vmul.u16        q14, q8,  q11           @ idx1weight = (32-y)*x
>> +    vmul.u16        q15, q10, q11           @ idx0weight = (32-y)*(32-x)
>> +    vmull.u16       q8,  d24, d2            @ idx3weight
>> +    vmull.u16       q9,  d25, d3
>> +    vmull.u16       q10, d26, d2            @ idx2weight
>> +    vmull.u16       q11, d27, d3
>> +    vmull.u16       q12, d28, d2            @ idx1weight
>> +    vmull.u16       q13, d29, d3
>> +    vmull.u16       q14, d30, d2            @ idx0weight
>> +    vmull.u16       q15, d31, d3
>> +    vrshrn.u32      d19, q9,  #10           @ idx3weight
>> +    vrshrn.u32      d18, q8,  #10
>> +    vrshrn.u32      d16, q10, #10           @ idx2weight
>> +    vrshrn.u32      d17, q11, #10
>> +    vrshrn.u32      d22, q12, #10           @ idx1weight
>> +    vrshrn.u32      d23, q13, #10
>> +    vrshrn.u32      d20, q14, #10           @ idx0weight
>> +    vrshrn.u32      d21, q15, #10
>> +    vzip.16         q10, q11
>> +    vzip.16         q8,  q9
>> +    vst1.16         {q10, q11},  [r3, :128]!
>> +    vst1.16         {q8,  q9},   [r3, :128]!
>> +    bge             8b
>> +    pop             {r4-r6}
>> +    vpop            {q4-q7}
>> +    bx              lr
>> +endfunc
>> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
>> index dd86fb2..eb582c2 100644
>> --- a/common/arm/mc-c.c
>> +++ b/common/arm/mc-c.c
>> @@ -4,6 +4,7 @@
>>   * Copyright (C) 2009-2015 x264 project
>>   *
>>   * Authors: David Conrad <lessen42 at gmail.com>
>> + *          Janne Grunau <janne-x264 at jannau.net>
>>   *
>>   * This program is free software; you can redistribute it and/or modify
>>   * it under the terms of the GNU General Public License as published by
>> @@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
>>  void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
>>  void integral_init8v_neon( uint16_t *, intptr_t );
>>
>> +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
>> +
>>  #if !HIGH_BIT_DEPTH
>>  static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
>>  {
>> @@ -226,6 +229,86 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
>>  }
>>  #endif // !HIGH_BIT_DEPTH
>>
>> +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
>> +#define CLIP_ADD2(s,x)\
>> +do\
>> +{\
>> +    CLIP_ADD((s)[0], (x)[0]);\
>> +    CLIP_ADD((s)[1], (x)[1]);\
>> +} while(0)
>> +
>> +void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
>> +                                               int16_t *propagate_amount,
>> +                                               uint16_t *lowres_costs,
>> +                                               int16_t *output,
>> +                                               int bipred_weight, int mb_y,
>> +                                               int len );
>> +
>> +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
>> +                                             int16_t (*mvs)[2],
>> +                                             int16_t *propagate_amount,
>> +                                             uint16_t *lowres_costs,
>> +                                             int bipred_weight, int mb_y,
>> +                                             int len, int list )
>> +{
>
> I think we should reuse the x86 CPP macro for aarch64/arm. Sorry for
> being lazy when doing the arm64 version

Ok, I'll try to factorize this.
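
Something along these lines, presumably (just a sketch; the macro and field
names mirror the existing x86 wrapper, and where the shared macro ends up
living is still to be decided):

#define PROPAGATE_LIST( cpu )\
void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2],\
                                                int16_t *propagate_amount,\
                                                uint16_t *lowres_costs,\
                                                int16_t *output,\
                                                int bipred_weight, int mb_y,\
                                                int len );\
\
static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs,\
                                              int16_t (*mvs)[2],\
                                              int16_t *propagate_amount,\
                                              uint16_t *lowres_costs,\
                                              int bipred_weight, int mb_y,\
                                              int len, int list )\
{\
    int16_t *current = h->scratch_buffer2;\
\
    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount,\
                                               lowres_costs, current,\
                                               bipred_weight, mb_y, len );\
\
    /* ... followed by the same scalar scatter loop (CLIP_ADD/CLIP_ADD2\
     * into ref_costs) that the x86 wrapper already has ... */\
}

and common/arm/mc-c.c would then only need:

PROPAGATE_LIST( neon )
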

// Martin

