[x264-devel] [PATCH 11/24] arm: Implement neon 8x16c intra predict functions

Thu Aug 27 11:25:48 CEST 2015

On Wed, 19 Aug 2015, Janne Grunau wrote:

> On 2015-08-13 23:59:32 +0300, Martin Storsjö wrote:
>> This implements the same functions as are implemented for 8x8c
>> and as for 8x16c on aarch64.
>>
>> Some of the simpler ones actually turn out to be slower than the
>> plain C version, at least on some CPUs.
>
> See 'arm64: optimize various intra_predict asm functions'
> (<1439822360-17282-1-git-send-email-janne-x264 at jannau.net>)
>
> That makes all intra_predict functions at least as fast as the C version
> on a cortex-a53 in arm64 mode.
>
>> checkasm timing       Cortex-A7      A8     A9
>> intra_predict_8x16c_dc_c     1347    910    1017
>> intra_predict_8x16c_dc_neon  1271    1366   1247
>> intra_predict_8x16c_dcl_c    859     677    692
>> intra_predict_8x16c_dcl_neon 1006    1209   1065
>> intra_predict_8x16c_dct_c    871     540    590
>> intra_predict_8x16c_dct_neon 672     511    657
>> intra_predict_8x16c_h_c      937     712    719
>> intra_predict_8x16c_h_neon   722     682    672
>> intra_predict_8x16c_p_c      10184   9967   8652
>> intra_predict_8x16c_p_neon   2617    1973   1983
>> intra_predict_8x16c_v_c      610     380    429
>> intra_predict_8x16c_v_neon   570     513    507
>> ---
>>  common/arm/predict-a.S |  158 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  common/arm/predict-c.c |   15 +++++
>>  common/arm/predict.h   |    8 +++
>>  common/predict.c       |    4 ++
>>  4 files changed, 185 insertions(+)
>>
>> diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
>> index 7e5d9d3..228fd2e 100644
>> --- a/common/arm/predict-a.S
>> +++ b/common/arm/predict-a.S
>> @@ -5,6 +5,7 @@
>>   *
>>   * Authors: David Conrad <lessen42 at gmail.com>
>>   *          Mans Rullgard <mans at mansr.com>
>> + *          Martin Storsjo <martin at martin.st>
>>   *
>>   * This program is free software; you can redistribute it and/or modify
>>   * it under the terms of the GNU General Public License as published by
>> @@ -552,6 +553,163 @@ function x264_predict_8x8c_p_neon
>>  endfunc
>>
>>
>> +function x264_predict_8x16c_dc_top_neon
>> +    sub         r2,  r0,  #FDEC_STRIDE
>> +    mov         r1,  #FDEC_STRIDE
>> +    vld1.8      {d0}, [r2,:64]
>> +    vpaddl.u8   d0,  d0
>> +    vpadd.u16   d0,  d0,  d0
>> +    vrshrn.u16  d0,  q0,  #2
>> +    vdup.8      d1,  d0[1]
>> +    vdup.8      d0,  d0[0]
>> +    vtrn.32     d0,  d1
>
> vmov d1, d0
>
>
>> +    vmov        q1,  q0
>> +    b           pred8x16_dc_end
>
> since we need every cycle to it probably makes sense to avoid the branch
> and vmov

Didn't turn out to help nearly enough (this only helped around one cycle)

>> +endfunc
>> +
>> +function x264_predict_8x16c_dc_left_neon
>> +    mov         r1,  #FDEC_STRIDE
>> +    sub         r2,  r0,  #1
>> +    ldcol.8     d0,  r2,  r1
>> +    ldcol.8     d2,  r2,  r1
>
> the ldcol is probably the mayor factor which makes the neon versions
> slower. one idea would be using a ldcol.16 which interleaves the loads
> to both registers.

This helped quite a bit (on the order of magnitude of a couple tens of 
cycles), but not enough

>> +    vpaddl.u8   d0,  d0
>> +    vpaddl.u8   d2,  d2
>
> have you tried using d0 and d1 and vpaddl q0, q0?

Ah, yes, that also helps a little but not enough

>> +    vpadd.u16   d0,  d0,  d0
>> +    vpadd.u16   d2,  d2,  d2
>
> vpadd d0, d0, d2 (or d1)

Ditto

>> +    vrshrn.u16  d0,  q0,  #2
>> +    vrshrn.u16  d2,  q1,  #2
>> +    vdup.8      d1,  d0[1]
>> +    vdup.8      d0,  d0[0]
>> +    vdup.8      d3,  d2[1]
>> +    vdup.8      d2,  d2[0]
>> +    b           pred8x16_dc_end
>> +endfunc
>> +
>> +function x264_predict_8x16c_dc_neon
>> +    sub         r2,  r0,  #FDEC_STRIDE
>> +    mov         r1,  #FDEC_STRIDE
>> +    vld1.8      {d0}, [r2,:64]
>> +    sub         r2,  r0,  #1
>> +    ldcol.8     d1,  r2,  r1
>> +    vdup.32     d2,  d0[1]
>> +    ldcol.8     d3,  r2,  r1
>
> see above but I doubt that using gpr as on arm64 will be faster
>
>> +    vtrn.32     d0,  d1
>> +    vtrn.32     d2,  d3
>> +    vpaddl.u8   q0,  q0
>> +    vpaddl.u8   q1,  q1
>> +    vpadd.u16   d0,  d0,  d1
>> +    vpadd.u16   d2,  d2,  d3
>> +    vpadd.u16   d1,  d0,  d0
>> +    vpadd.u16   d3,  d2,  d2
>> +    vrshrn.u16  d4,  q0,  #3
>> +    vrshrn.u16  d5,  q0,  #2
>> +    vrshrn.u16  d6,  q1,  #3
>> +    vrshrn.u16  d7,  q1,  #2
>> +    vdup.8      d0,  d4[4]
>> +    vdup.8      d1,  d5[3]
>> +    vdup.8      d16, d5[2]
>> +    vdup.8      d17, d4[5]
>> +    vtrn.32     q0,  q8
>> +    vdup.8      d2,  d7[1]
>> +    vdup.8      d3,  d7[3]
>> +    vdup.8      d16, d6[4]
>> +    vdup.8      d17, d6[5]
>> +    vtrn.32     q1,  q8
>> +pred8x16_dc_end:
>> +    add         r2,  r0,  r1,  lsl #2
>> +.rept 4
>> +    vst1.8      {d0}, [r0,:64], r1
>> +    vst1.8      {d1}, [r2,:64], r1
>> +.endr
>> +    add         r2,  r2,  r1,  lsl #2
>> +    add         r0,  r0,  r1,  lsl #2
>> +.rept 4
>> +    vst1.8      {d2}, [r0,:64], r1
>> +    vst1.8      {d3}, [r2,:64], r1
>> +.endr
>
> r3 and r12 are free too, you could try to write all 4 registers at once

Didn't change almost anything (less than a cycle)

>> +    bx          lr
>> +endfunc
>> +
>> +function x264_predict_8x16c_h_neon
>> +    sub         r1, r0, #1
>> +    mov         ip, #FDEC_STRIDE
>> +.rept 8
>> +    vld1.8      {d0[]}, [r1], ip
>> +    vld1.8      {d2[]}, [r1], ip
>> +    vst1.64     {d0}, [r0,:64], ip
>> +    vst1.64     {d2}, [r0,:64], ip
>> +.endr
>> +    bx          lr
>> +endfunc
>> +
>> +function x264_predict_8x16c_v_neon
>> +    sub         r0, r0, #FDEC_STRIDE
>> +    mov         ip, #FDEC_STRIDE
>> +    vld1.64     {d0}, [r0,:64], ip
>> +.rept 16
>> +    vst1.64     {d0}, [r0,:64], ip
>
> this would be faster if use more than 1 gpr register for writeback.
> vldr/vstr would be probably faster since it has immediate offset

This made no difference at all (within one cycle) on A9, helped a lot on 
A7 (20 cycles), and hurt a lot on A8 (13 cycles).

So even with these changes, the collective conclusion is:

                          A7  A8  A9
intra_predict_8x16c_dc   +   -   -
intra_predict_8x16c_dcl  +   -   -
intra_predict_8x16c_dct  +   +   -
intra_predict_8x16c_h    +   +   +
intra_predict_8x16c_p    +   +   +
intra_predict_8x16c_v    +   -   -

(where + is better, aka lower numbers)

So I'd say we should add at least the _h and _p functions. For _dc_top the 
difference on A8 and A9 is pretty small so it shouldn't matter (although 
it helps quite a bit on A7 so perhaps it should be included). The _v 
version with vldr/vstr helps lots on A7 but hurts on the others, so 
perhaps that one should be skipped.

Does that sound sane?

// Martin