[x264-devel] [PATCH 14/24] arm: Implement x264_decimate_score15/16/64_neon
Janne Grunau
janne-x264 at jannau.net
Fri Aug 21 18:36:59 CEST 2015
On 2015-08-13 23:59:35 +0300, Martin Storsjö wrote:
> checkasm timing Cortex-A7 A8 A9
> decimate_score15_c 767 723 545
> decimate_score15_neon 507 504 496
> decimate_score16_c 776 742 546
> decimate_score16_neon 494 507 470
> decimate_score64_c 2399 2511 2023
> decimate_score64_neon 1041 842 804
> ---
> common/aarch64/quant-a.S | 1 +
> common/arm/quant-a.S | 142 ++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/quant.h | 4 ++
> common/quant.c | 6 +-
> 4 files changed, 150 insertions(+), 3 deletions(-)
>
> diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
> index 443a91d..5aea85f 100644
> --- a/common/aarch64/quant-a.S
> +++ b/common/aarch64/quant-a.S
> @@ -5,6 +5,7 @@
> *
> * Authors: David Conrad <lessen42 at gmail.com>
> * Janne Grunau <janne-x264 at jannau.net>
> + * Martin Storsjo <martin at martin.st>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
> index e3d5cd2..5ec8c04 100644
> --- a/common/arm/quant-a.S
> +++ b/common/arm/quant-a.S
> @@ -32,6 +32,14 @@ pmovmskb_byte:
> .byte 1,2,4,8,16,32,64,128
> .byte 1,2,4,8,16,32,64,128
>
> +mask_2bit:
> +.byte 3,12,48,192,3,12,48,192
> +.byte 3,12,48,192,3,12,48,192
> +
> +mask_1bit:
> +.byte 128,64,32,16,8,4,2,1
> +.byte 128,64,32,16,8,4,2,1
> +
> .text
>
> .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
> @@ -308,6 +316,140 @@ dequant_4x4_dc_rshift:
> bx lr
> endfunc
>
> +.macro decimate_score_1x size
> +function x264_decimate_score\size\()_neon
> +.if \size == 15
> + vld1.16 {q0, q1}, [r0]
> +.else
> + vld1.16 {q0, q1}, [r0, :128]
> +.endif
I think r0 is 16-byte aligned in both cases
> + movrel r3, mask_2bit
> + vmov.s8 q3, #0x01
> + vqmovn.s16 d0, q0
> + vqmovn.s16 d1, q1
> + vqabs.s8 q2, q0
> + vld1.8 {q8}, [r3, :128]
exchange the vqabs and vld1
> + vceq.s8 q1, q0, #0
> + vcgt.s8 q2, q2, q3
> + vand.u8 q1, q1, q8
> + vshrn.u16 d4, q2, #4
> + vpadd.u8 d2, d2, d3
> + vpadd.u8 d4, d4, d4
> + vpadd.u8 d2, d2, d2
> + vmov.32 r2, d4[0]
> + vmov.u32 r1, d2[0]
vmov.32, also vpadd.u8 d2, d2, d4; vmov r1, r2, d2 might be a little
faster
> + cmp r2, #0
> + bne 9f
beq 0f
mov r0, #9
bx lr
0:
is a little bit easier to follow
> + mvns r1, r1
> + mov r0, #0
> + beq 0f
bxeq lr
> +.ifc \size, 15
> + lsr r1, r1, #2
> +.endif
> + rbit r1, r1
> + movrel r3, X(x264_decimate_table4)
> +1:
> + clz r2, r1
> + lsl r1, r1, r2
> + lsr r12, r2, #1
> + ldrb r2, [r3, r12]
> + lsls r1, r1, #2
> + add r0, r0, r2
> + bne 1b
> + bx lr
> +9:
> + mov r0, #9
> +0:
> + bx lr
> +endfunc
> +.endm
> +
> +decimate_score_1x 15
> +decimate_score_1x 16
> +
> +function x264_decimate_score64_neon
> + push {r4-r5}
I think you need only one additional register, r12 is unused. make it lr
and use pop(eq)? {pc} for return
> + vld1.16 {q8, q9}, [r0, :128]!
> + vld1.16 {q10, q11}, [r0, :128]!
> + vld1.16 {q12, q13}, [r0, :128]!
> + vld1.16 {q14, q15}, [r0, :128]
> + movrel r3, mask_1bit
> + vmov.s8 q3, #0x01
> + vqmovn.s16 d17, q8
> + vqmovn.s16 d16, q9
> + vqmovn.s16 d19, q10
> + vqmovn.s16 d18, q11
> + vqmovn.s16 d21, q12
> + vqmovn.s16 d20, q13
> + vqmovn.s16 d23, q14
> + vqmovn.s16 d22, q15
> + vqabs.s8 q12, q8
> + vqabs.s8 q13, q9
> + vqabs.s8 q14, q10
> + vqabs.s8 q15, q11
> + vld1.8 {q2}, [r3, :128]
> + vceq.s8 q8, q8, #0
> + vceq.s8 q9, q9, #0
> + vceq.s8 q10, q10, #0
> + vceq.s8 q11, q11, #0
> + vmax.s8 q12, q12, q13
> + vmax.s8 q14, q14, q15
> + vand.u8 q8, q8, q2
> + vand.u8 q9, q9, q2
> + vand.u8 q10, q10, q2
> + vand.u8 q11, q11, q2
> + vmax.s8 q12, q12, q14
> + vpadd.u8 d18, d18, d19
> + vpadd.u8 d19, d16, d17
> + vpadd.u8 d22, d22, d23
> + vpadd.u8 d23, d20, d21
> + vcgt.s8 q12, q12, q3
> + vpadd.u8 d16, d22, d23
> + vpadd.u8 d17, d18, d19
> + vshrn.u16 d24, q12, #4
> + vpadd.u8 d16, d16, d17
> + vpadd.u8 d24, d24, d24
I'd move the last 3 q12/d24 related lines up so that the vmov doesn't
follow immediately after the vpadd
> + vmov.32 r2, d24[0]
> + vmov.u32 r4, d16[0]
> + vmov.u32 r1, d16[1]
vmov r4, r1, d16
> + cmp r2, #0
> + bne 9f
same comment as in decimate_score_1x applies
> + mvns r1, r1
> + mvn r4, r4
> + mov r0, #0
> + mov r5, #32
> + movrel r3, X(x264_decimate_table8)
> + beq 2f
> +1:
> + clz r2, r1
> + lsl r1, r1, r2
> + sub r5, r5, r2
> + ldrb r2, [r3, r2]
> + lsls r1, r1, #1
> + sub r5, r5, #1
> + add r0, r0, r2
> + bne 1b
> +2:
> + cmp r4, #0
> + beq 0f
> +
> + mov r1, r4
> + mov r4, #0
> +
> + clz r2, r1
> + lsl r1, r1, r2
the mov r1, r4 can be folded into this instruction
> + add r2, r2, r5
> + ldrb r2, [r3, r2]
> + lsls r1, r1, #1
> + add r0, r0, r2
> + beq 0f
> + b 1b
I'd copy the code since you don't need to update r5 anymore and the
mov r4, #0 can be omitted too
> +9:
> + mov r0, #9
> +0:
> + pop {r4-r5}
> + bx lr
> +endfunc
>
> // int coeff_last( int16_t *l )
> function x264_coeff_last4_arm
> diff --git a/common/arm/quant.h b/common/arm/quant.h
> index 78178e8..2ec91eb 100644
> --- a/common/arm/quant.h
> +++ b/common/arm/quant.h
> @@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
> void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
> void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
>
> +int x264_decimate_score15_neon( int16_t * );
> +int x264_decimate_score16_neon( int16_t * );
> +int x264_decimate_score64_neon( int16_t * );
> +
> int x264_coeff_last4_arm( int16_t * );
> int x264_coeff_last8_arm( int16_t * );
> int x264_coeff_last15_neon( int16_t * );
> diff --git a/common/quant.c b/common/quant.c
> index f8279a7..be000ec 100644
> --- a/common/quant.c
> +++ b/common/quant.c
> @@ -751,6 +751,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
> pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
> pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
> pf->denoise_dct = x264_denoise_dct_neon;
> + pf->decimate_score15 = x264_decimate_score15_neon;
> + pf->decimate_score16 = x264_decimate_score16_neon;
> + pf->decimate_score64 = x264_decimate_score64_neon;
> }
> #endif
> #if ARCH_AARCH64
> @@ -765,9 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
> pf->coeff_level_run8 = x264_coeff_level_run8_neon;
> pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
> pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
> - pf->decimate_score15 = x264_decimate_score15_neon;
> - pf->decimate_score16 = x264_decimate_score16_neon;
> - pf->decimate_score64 = x264_decimate_score64_neon;
> }
> #endif
otherwise ok
Janne
More information about the x264-devel
mailing list