[x264-devel] [PATCH 14/24] arm: Implement x264_decimate_score15/16/64_neon

Fri Aug 21 18:36:59 CEST 2015

On 2015-08-13 23:59:35 +0300, Martin Storsjö wrote:
> checkasm timing       Cortex-A7      A8     A9
> decimate_score15_c           767     723    545
> decimate_score15_neon        507     504    496
> decimate_score16_c           776     742    546
> decimate_score16_neon        494     507    470
> decimate_score64_c           2399    2511   2023
> decimate_score64_neon        1041    842    804
> ---
>  common/aarch64/quant-a.S |    1 +
>  common/arm/quant-a.S     |  142 ++++++++++++++++++++++++++++++++++++++++++++++
>  common/arm/quant.h       |    4 ++
>  common/quant.c           |    6 +-
>  4 files changed, 150 insertions(+), 3 deletions(-)
> 
> diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
> index 443a91d..5aea85f 100644
> --- a/common/aarch64/quant-a.S
> +++ b/common/aarch64/quant-a.S
> @@ -5,6 +5,7 @@
>   *
>   * Authors: David Conrad <lessen42 at gmail.com>
>   *          Janne Grunau <janne-x264 at jannau.net>
> + *          Martin Storsjo <martin at martin.st>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
> index e3d5cd2..5ec8c04 100644
> --- a/common/arm/quant-a.S
> +++ b/common/arm/quant-a.S
> @@ -32,6 +32,14 @@ pmovmskb_byte:
>  .byte 1,2,4,8,16,32,64,128
>  .byte 1,2,4,8,16,32,64,128
>  
> +mask_2bit:
> +.byte 3,12,48,192,3,12,48,192
> +.byte 3,12,48,192,3,12,48,192
> +
> +mask_1bit:
> +.byte 128,64,32,16,8,4,2,1
> +.byte 128,64,32,16,8,4,2,1
> +
>  .text
>  
>  .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
> @@ -308,6 +316,140 @@ dequant_4x4_dc_rshift:
>      bx          lr
>  endfunc
>  
> +.macro decimate_score_1x size
> +function x264_decimate_score\size\()_neon
> +.if \size == 15
> +    vld1.16     {q0, q1}, [r0]
> +.else
> +    vld1.16     {q0, q1}, [r0, :128]
> +.endif

I think r0 is 16-byte aligned in both cases

> +    movrel      r3, mask_2bit
> +    vmov.s8     q3,  #0x01
> +    vqmovn.s16  d0,  q0
> +    vqmovn.s16  d1,  q1
> +    vqabs.s8    q2,  q0
> +    vld1.8      {q8}, [r3, :128]

exchange the vqabs and vld1

> +    vceq.s8     q1,  q0,  #0
> +    vcgt.s8     q2,  q2,  q3
> +    vand.u8     q1,  q1,  q8
> +    vshrn.u16   d4,  q2,  #4
> +    vpadd.u8    d2,  d2,  d3
> +    vpadd.u8    d4,  d4,  d4
> +    vpadd.u8    d2,  d2,  d2
> +    vmov.32     r2,  d4[0]
> +    vmov.u32    r1,  d2[0]

vmov.32, also vpadd.u8 d2, d2, d4; vmov r1, r2, d2 might be a little 
faster

> +    cmp         r2,  #0
> +    bne         9f

beq 0f
mov r0, #9
bx  lr
0:

is a little bit easier to follow

> +    mvns        r1,  r1
> +    mov         r0,  #0
> +    beq         0f

bxeq lr

> +.ifc \size, 15
> +    lsr         r1,  r1,  #2
> +.endif
> +    rbit        r1,  r1
> +    movrel      r3,  X(x264_decimate_table4)
> +1:
> +    clz         r2,  r1
> +    lsl         r1,  r1,  r2
> +    lsr         r12, r2,  #1
> +    ldrb        r2,  [r3, r12]
> +    lsls        r1,  r1,  #2
> +    add         r0,  r0,  r2
> +    bne         1b
> +    bx          lr
> +9:
> +    mov         r0,  #9
> +0:
> +    bx          lr
> +endfunc
> +.endm
> +
> +decimate_score_1x 15
> +decimate_score_1x 16
> +
> +function x264_decimate_score64_neon
> +    push        {r4-r5}

I think you need only one additional register, r12 is unused. make it lr 
and use pop(eq)? {pc} for return

> +    vld1.16     {q8,  q9},  [r0, :128]!
> +    vld1.16     {q10, q11}, [r0, :128]!
> +    vld1.16     {q12, q13}, [r0, :128]!
> +    vld1.16     {q14, q15}, [r0, :128]
> +    movrel      r3, mask_1bit
> +    vmov.s8     q3,  #0x01
> +    vqmovn.s16  d17, q8
> +    vqmovn.s16  d16, q9
> +    vqmovn.s16  d19, q10
> +    vqmovn.s16  d18, q11
> +    vqmovn.s16  d21, q12
> +    vqmovn.s16  d20, q13
> +    vqmovn.s16  d23, q14
> +    vqmovn.s16  d22, q15
> +    vqabs.s8    q12, q8
> +    vqabs.s8    q13, q9
> +    vqabs.s8    q14, q10
> +    vqabs.s8    q15, q11
> +    vld1.8      {q2}, [r3, :128]
> +    vceq.s8     q8,  q8,  #0
> +    vceq.s8     q9,  q9,  #0
> +    vceq.s8     q10, q10, #0
> +    vceq.s8     q11, q11, #0
> +    vmax.s8     q12, q12, q13
> +    vmax.s8     q14, q14, q15
> +    vand.u8     q8,  q8,  q2
> +    vand.u8     q9,  q9,  q2
> +    vand.u8     q10, q10, q2
> +    vand.u8     q11, q11, q2
> +    vmax.s8     q12, q12, q14
> +    vpadd.u8    d18, d18, d19
> +    vpadd.u8    d19, d16, d17
> +    vpadd.u8    d22, d22, d23
> +    vpadd.u8    d23, d20, d21
> +    vcgt.s8     q12, q12, q3
> +    vpadd.u8    d16, d22, d23
> +    vpadd.u8    d17, d18, d19
> +    vshrn.u16   d24, q12, #4
> +    vpadd.u8    d16, d16, d17
> +    vpadd.u8    d24, d24, d24

I'd move the last 3 q12/d24 related lines up so that the vmov doesn't
follow immediately after the vpadd

> +    vmov.32     r2,  d24[0]
> +    vmov.u32    r4,  d16[0]
> +    vmov.u32    r1,  d16[1]

vmov r4, r1, d16

> +    cmp         r2,  #0
> +    bne         9f

same comment as in decimate_score_1x applies

> +    mvns        r1,  r1
> +    mvn         r4,  r4
> +    mov         r0,  #0
> +    mov         r5,  #32
> +    movrel      r3,  X(x264_decimate_table8)
> +    beq         2f
> +1:
> +    clz         r2,  r1
> +    lsl         r1,  r1,  r2
> +    sub         r5,  r5,  r2
> +    ldrb        r2,  [r3, r2]
> +    lsls        r1,  r1,  #1
> +    sub         r5,  r5,  #1
> +    add         r0,  r0,  r2
> +    bne         1b
> +2:
> +    cmp         r4,  #0
> +    beq         0f
> +
> +    mov         r1,  r4
> +    mov         r4,  #0
> +
> +    clz         r2,  r1
> +    lsl         r1,  r1,  r2

the mov r1, r4 can be folded into this instruction

> +    add         r2,  r2,  r5
> +    ldrb        r2,  [r3, r2]
> +    lsls        r1,  r1,  #1
> +    add         r0,  r0,  r2
> +    beq         0f
> +    b           1b

I'd copy the code since you don't need to update r5 anymore and the
mov r4, #0 can be omitted too

> +9:
> +    mov         r0,  #9
> +0:
> +    pop         {r4-r5}
> +    bx          lr
> +endfunc
>  
>  // int coeff_last( int16_t *l )
>  function x264_coeff_last4_arm
> diff --git a/common/arm/quant.h b/common/arm/quant.h
> index 78178e8..2ec91eb 100644
> --- a/common/arm/quant.h
> +++ b/common/arm/quant.h
> @@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
>  void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
>  void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
>  
> +int x264_decimate_score15_neon( int16_t * );
> +int x264_decimate_score16_neon( int16_t * );
> +int x264_decimate_score64_neon( int16_t * );
> +
>  int x264_coeff_last4_arm( int16_t * );
>  int x264_coeff_last8_arm( int16_t * );
>  int x264_coeff_last15_neon( int16_t * );
> diff --git a/common/quant.c b/common/quant.c
> index f8279a7..be000ec 100644
> --- a/common/quant.c
> +++ b/common/quant.c
> @@ -751,6 +751,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
>          pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
>          pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
>          pf->denoise_dct = x264_denoise_dct_neon;
> +        pf->decimate_score15 = x264_decimate_score15_neon;
> +        pf->decimate_score16 = x264_decimate_score16_neon;
> +        pf->decimate_score64 = x264_decimate_score64_neon;
>      }
>  #endif
>  #if ARCH_AARCH64
> @@ -765,9 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
>          pf->coeff_level_run8 = x264_coeff_level_run8_neon;
>          pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
>          pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
> -        pf->decimate_score15 = x264_decimate_score15_neon;
> -        pf->decimate_score16 = x264_decimate_score16_neon;
> -        pf->decimate_score64 = x264_decimate_score64_neon;
>      }
>  #endif

otherwise ok

Janne