[x264-devel] [PATCH 14/24] arm: Implement x264_decimate_score15/16/64_neon
Martin Storsjö
martin at martin.st
Tue Aug 25 12:07:26 CEST 2015
On Fri, 21 Aug 2015, Janne Grunau wrote:
> On 2015-08-13 23:59:35 +0300, Martin Storsjö wrote:
>> checkasm timing        Cortex-A7    A8     A9
>> decimate_score15_c           767   723    545
>> decimate_score15_neon        507   504    496
>> decimate_score16_c           776   742    546
>> decimate_score16_neon        494   507    470
>> decimate_score64_c          2399  2511   2023
>> decimate_score64_neon       1041   842    804
>> ---
>>  common/aarch64/quant-a.S |   1 +
>>  common/arm/quant-a.S     | 142 ++++++++++++++++++++++++++++++++++++++++++++++
>>  common/arm/quant.h       |   4 ++
>>  common/quant.c           |   6 +-
>>  4 files changed, 150 insertions(+), 3 deletions(-)
>>
>> diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
>> index 443a91d..5aea85f 100644
>> --- a/common/aarch64/quant-a.S
>> +++ b/common/aarch64/quant-a.S
>> @@ -5,6 +5,7 @@
>> *
>> * Authors: David Conrad <lessen42 at gmail.com>
>> * Janne Grunau <janne-x264 at jannau.net>
>> + * Martin Storsjo <martin at martin.st>
>> *
>> * This program is free software; you can redistribute it and/or modify
>> * it under the terms of the GNU General Public License as published by
>> diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
>> index e3d5cd2..5ec8c04 100644
>> --- a/common/arm/quant-a.S
>> +++ b/common/arm/quant-a.S
>> @@ -32,6 +32,14 @@ pmovmskb_byte:
>> .byte 1,2,4,8,16,32,64,128
>> .byte 1,2,4,8,16,32,64,128
>>
>> +mask_2bit:
>> +.byte 3,12,48,192,3,12,48,192
>> +.byte 3,12,48,192,3,12,48,192
>> +
>> +mask_1bit:
>> +.byte 128,64,32,16,8,4,2,1
>> +.byte 128,64,32,16,8,4,2,1
>> +
>> .text
>>
>> .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
>> @@ -308,6 +316,140 @@ dequant_4x4_dc_rshift:
>> bx lr
>> endfunc
>>
>> +.macro decimate_score_1x size
>> +function x264_decimate_score\size\()_neon
>> +.if \size == 15
>> + vld1.16 {q0, q1}, [r0]
>> +.else
>> + vld1.16 {q0, q1}, [r0, :128]
>> +.endif
>
> I think the r0 is in both cases 16-byte aligned
Indeed, fixed
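I.e. dropping the .if/.else and just doing the aligned load for both sizes:

    vld1.16         {q0, q1}, [r0, :128]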
>> + movrel r3, mask_2bit
>> + vmov.s8 q3, #0x01
>> + vqmovn.s16 d0, q0
>> + vqmovn.s16 d1, q1
>> + vqabs.s8 q2, q0
>> + vld1.8 {q8}, [r3, :128]
>
> exchange the vqabs and vld1
It's inconclusive whether this is beneficial or not; on Cortex-A8, it
slows things down by almost 10 cycles, while it speeds up by 10 cycles on
A9.
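For reference, the exchanged order would simply be:

    vqmovn.s16      d0,  q0
    vqmovn.s16      d1,  q1
    vld1.8          {q8}, [r3, :128]
    vqabs.s8        q2,  q0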
>> + vceq.s8 q1, q0, #0
>> + vcgt.s8 q2, q2, q3
>> + vand.u8 q1, q1, q8
>> + vshrn.u16 d4, q2, #4
>> + vpadd.u8 d2, d2, d3
>> + vpadd.u8 d4, d4, d4
>> + vpadd.u8 d2, d2, d2
>> + vmov.32 r2, d4[0]
>> + vmov.u32 r1, d2[0]
>
> vmov.32, also vpadd.u8 d2, d2, d4; vmov r1, r2, d2 might be a little
> faster
That certainly looks nicer, but it turns out to be slower, consistently on
A8 and in some cases on A9, so I'm keeping the previous form (but with the
s/u32/32/ applied).
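That is:

    vmov.32         r2,  d4[0]
    vmov.32         r1,  d2[0]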
>> + cmp r2, #0
>> + bne 9f
>
> beq 0f
> mov r0, #
> bx lr
> 0:
>
> is a little bit easier to follow
Ok, will change (although this is the form from your aarch64 version,
except that it has cbnz).
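I.e. something like:

    cmp             r2,  #0
    beq             0f
    mov             r0,  #9
    bx              lr
0: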
>> + mvns r1, r1
>> + mov r0, #0
>> + beq 0f
>
> bxeq lr
Ok
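So:

    mvns            r1,  r1
    mov             r0,  #0
    bxeq            lr

(The mov doesn't touch the flags, so the bxeq still sees the result of the
mvns.)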
>> +.ifc \size, 15
>> + lsr r1, r1, #2
>> +.endif
>> + rbit r1, r1
>> + movrel r3, X(x264_decimate_table4)
>> +1:
>> + clz r2, r1
>> + lsl r1, r1, r2
>> + lsr r12, r2, #1
>> + ldrb r2, [r3, r12]
>> + lsls r1, r1, #2
>> + add r0, r0, r2
>> + bne 1b
>> + bx lr
>> +9:
>> + mov r0, #9
>> +0:
>> + bx lr
>> +endfunc
>> +.endm
>> +
>> +decimate_score_1x 15
>> +decimate_score_1x 16
>> +
>> +function x264_decimate_score64_neon
>> + push {r4-r5}
>
> I think you need only one additional register; r12 is unused. Make it lr
> and use pop(eq)? {pc} for the return
Ok
For some reason, doing "pop {lr}; bx lr" seems to be faster than pop {pc}
on Cortex-A8, but I guess it's still better to use pop {pc} since it's the
common pattern.
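So roughly, with the uses of r4/r5 moved over to r12/lr (exact assignment
to be sorted out when reworking it):

    push            {lr}
    @ ... body as before, with r4/r5 replaced by r12/lr ...
    mov             r0,  #9
    pop             {pc}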
>> + vld1.16 {q8, q9}, [r0, :128]!
>> + vld1.16 {q10, q11}, [r0, :128]!
>> + vld1.16 {q12, q13}, [r0, :128]!
>> + vld1.16 {q14, q15}, [r0, :128]
>> + movrel r3, mask_1bit
>> + vmov.s8 q3, #0x01
>> + vqmovn.s16 d17, q8
>> + vqmovn.s16 d16, q9
>> + vqmovn.s16 d19, q10
>> + vqmovn.s16 d18, q11
>> + vqmovn.s16 d21, q12
>> + vqmovn.s16 d20, q13
>> + vqmovn.s16 d23, q14
>> + vqmovn.s16 d22, q15
>> + vqabs.s8 q12, q8
>> + vqabs.s8 q13, q9
>> + vqabs.s8 q14, q10
>> + vqabs.s8 q15, q11
>> + vld1.8 {q2}, [r3, :128]
>> + vceq.s8 q8, q8, #0
>> + vceq.s8 q9, q9, #0
>> + vceq.s8 q10, q10, #0
>> + vceq.s8 q11, q11, #0
>> + vmax.s8 q12, q12, q13
>> + vmax.s8 q14, q14, q15
>> + vand.u8 q8, q8, q2
>> + vand.u8 q9, q9, q2
>> + vand.u8 q10, q10, q2
>> + vand.u8 q11, q11, q2
>> + vmax.s8 q12, q12, q14
>> + vpadd.u8 d18, d18, d19
>> + vpadd.u8 d19, d16, d17
>> + vpadd.u8 d22, d22, d23
>> + vpadd.u8 d23, d20, d21
>> + vcgt.s8 q12, q12, q3
>> + vpadd.u8 d16, d22, d23
>> + vpadd.u8 d17, d18, d19
>> + vshrn.u16 d24, q12, #4
>> + vpadd.u8 d16, d16, d17
>> + vpadd.u8 d24, d24, d24
>
> I'd move the last 3 q12/d24-related lines up so that the vmov doesn't
> follow immediately after the vpadd
Thanks, that helps a bit
>> + vmov.32 r2, d24[0]
>> + vmov.u32 r4, d16[0]
>> + vmov.u32 r1, d16[1]
>
> vmov r4, r1, d16
Ok, here it doesn't seem to hurt in the same way as it did in the 15/16
version
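So:

    vmov.32         r2,  d24[0]
    vmov            r4,  r1,  d16

(modulo the r4/r5 -> r12/lr renaming discussed above.)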
>> + cmp r2, #0
>> + bne 9f
>
> same comment as in decimate_score_1x applies
>
>> + mvns r1, r1
>> + mvn r4, r4
>> + mov r0, #0
>> + mov r5, #32
>> + movrel r3, X(x264_decimate_table8)
>> + beq 2f
>> +1:
>> + clz r2, r1
>> + lsl r1, r1, r2
>> + sub r5, r5, r2
>> + ldrb r2, [r3, r2]
>> + lsls r1, r1, #1
>> + sub r5, r5, #1
>> + add r0, r0, r2
>> + bne 1b
>> +2:
>> + cmp r4, #0
>> + beq 0f
>> +
>> + mov r1, r4
>> + mov r4, #0
>> +
>> + clz r2, r1
>> + lsl r1, r1, r2
>
> the mov r1, r4 can be folded into this instruction
Indeed
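With the clz reading r4 directly as well, that becomes:

    clz             r2,  r4
    lsl             r1,  r4,  r2

and the mov r1, r4 disappears.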
>> + add r2, r2, r5
>> + ldrb r2, [r3, r2]
>> + lsls r1, r1, #1
>> + add r0, r0, r2
>> + beq 0f
>> + b 1b
>
> I'd copy the code since you don't need to update r5 anymore and the
> mov r4, #0 can be omitted too
Yep, that's a good idea.
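So instead of the beq 0f / b 1b pair, falling through into a separate copy
of the loop without the r5 bookkeeping, roughly (label number picked
arbitrarily):

    ldrb            r2,  [r3, r2]
    lsls            r1,  r1,  #1
    add             r0,  r0,  r2
    popeq           {pc}
3:
    clz             r2,  r1
    lsl             r1,  r1,  r2
    ldrb            r2,  [r3, r2]
    lsls            r1,  r1,  #1
    add             r0,  r0,  r2
    bne             3b
    pop             {pc}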
// Martin