[x264-devel] [PATCH 11/24] arm: Implement neon 8x16c intra predict functions
Janne Grunau
janne-x264 at jannau.net
Wed Aug 19 12:35:01 CEST 2015
On 2015-08-13 23:59:32 +0300, Martin Storsjö wrote:
> This implements the same functions as are implemented for 8x8c on arm
> and for 8x16c on aarch64.
>
> Some of the simpler ones actually turn out to be slower than the
> plain C version, at least on some CPUs.
See 'arm64: optimize various intra_predict asm functions'
(<1439822360-17282-1-git-send-email-janne-x264 at jannau.net>)
That patch makes all intra_predict functions at least as fast as the C
version on a Cortex-A53 in arm64 mode.
> checkasm timing               Cortex-A7     A8     A9
> intra_predict_8x16c_dc_c           1347    910   1017
> intra_predict_8x16c_dc_neon        1271   1366   1247
> intra_predict_8x16c_dcl_c           859    677    692
> intra_predict_8x16c_dcl_neon       1006   1209   1065
> intra_predict_8x16c_dct_c           871    540    590
> intra_predict_8x16c_dct_neon        672    511    657
> intra_predict_8x16c_h_c             937    712    719
> intra_predict_8x16c_h_neon          722    682    672
> intra_predict_8x16c_p_c           10184   9967   8652
> intra_predict_8x16c_p_neon         2617   1973   1983
> intra_predict_8x16c_v_c             610    380    429
> intra_predict_8x16c_v_neon          570    513    507
> ---
> common/arm/predict-a.S | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/predict-c.c | 15 +++++
> common/arm/predict.h | 8 +++
> common/predict.c | 4 ++
> 4 files changed, 185 insertions(+)
>
> diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
> index 7e5d9d3..228fd2e 100644
> --- a/common/arm/predict-a.S
> +++ b/common/arm/predict-a.S
> @@ -5,6 +5,7 @@
> *
> * Authors: David Conrad <lessen42 at gmail.com>
> * Mans Rullgard <mans at mansr.com>
> + * Martin Storsjo <martin at martin.st>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -552,6 +553,163 @@ function x264_predict_8x8c_p_neon
> endfunc
>
>
> +function x264_predict_8x16c_dc_top_neon
> + sub r2, r0, #FDEC_STRIDE
> + mov r1, #FDEC_STRIDE
> + vld1.8 {d0}, [r2,:64]
> + vpaddl.u8 d0, d0
> + vpadd.u16 d0, d0, d0
> + vrshrn.u16 d0, q0, #2
> + vdup.8 d1, d0[1]
> + vdup.8 d0, d0[0]
> + vtrn.32 d0, d1
vmov d1, d0
> + vmov q1, q0
> + b pred8x16_dc_end
since we need every cycle here, it probably makes sense to avoid the
branch and the vmov entirely
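e.g. something like this (untested sketch; all 16 rows are identical
here, so the store loop can just be duplicated inline):

    add     r2,  r0,  r1,  lsl #3   @ r2 points 8 rows below r0
.rept 8
    vst1.8  {d0}, [r0,:64], r1      @ rows 0-7
    vst1.8  {d0}, [r2,:64], r1      @ rows 8-15
.endr
    bx      lr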
> +endfunc
> +
> +function x264_predict_8x16c_dc_left_neon
> + mov r1, #FDEC_STRIDE
> + sub r2, r0, #1
> + ldcol.8 d0, r2, r1
> + ldcol.8 d2, r2, r1
the ldcol is probably the major factor making the neon versions
slower. One idea would be a ldcol.16 macro which interleaves the loads
to both registers.
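rough sketch of such a macro (untested; argument order follows the
existing ldcol.8, with an extra scratch address register passed in):

.macro ldcol.16  rd1,  rd2,  rs,  rt,  rs2
    add         \rs2, \rs,  \rt,  lsl #3  @ second pointer, 8 rows down
    vld1.8      {\rd1[0]}, [\rs],  \rt
    vld1.8      {\rd2[0]}, [\rs2], \rt
    vld1.8      {\rd1[1]}, [\rs],  \rt
    vld1.8      {\rd2[1]}, [\rs2], \rt
    vld1.8      {\rd1[2]}, [\rs],  \rt
    vld1.8      {\rd2[2]}, [\rs2], \rt
    vld1.8      {\rd1[3]}, [\rs],  \rt
    vld1.8      {\rd2[3]}, [\rs2], \rt
    vld1.8      {\rd1[4]}, [\rs],  \rt
    vld1.8      {\rd2[4]}, [\rs2], \rt
    vld1.8      {\rd1[5]}, [\rs],  \rt
    vld1.8      {\rd2[5]}, [\rs2], \rt
    vld1.8      {\rd1[6]}, [\rs],  \rt
    vld1.8      {\rd2[6]}, [\rs2], \rt
    vld1.8      {\rd1[7]}, [\rs],  \rt
    vld1.8      {\rd2[7]}, [\rs2], \rt
.endm

with two address registers the lane loads no longer form one long
serial dependency chain on the address writeback.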
> + vpaddl.u8 d0, d0
> + vpaddl.u8 d2, d2
have you tried using d0 and d1 and vpaddl q0, q0?
> + vpadd.u16 d0, d0, d0
> + vpadd.u16 d2, d2, d2
vpadd d0, d0, d2 (or d1)
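i.e. the whole reduction could shrink to something like this (untested
sketch combining the two suggestions; note d1-d3 have to be duplicated
before d0 is overwritten):

    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    ldcol.8     d0,  r2,  r1
    ldcol.8     d1,  r2,  r1
    vpaddl.u8   q0,  q0              @ 8 u16 pair sums
    vpadd.u16   d0,  d0,  d1         @ 4 sums of 4 left pixels each
    vrshrn.u16  d0,  q0,  #2         @ 4 rounded dc values in d0[0..3]
    vdup.8      d3,  d0[3]
    vdup.8      d2,  d0[2]
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    b           pred8x16_dc_end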
> + vrshrn.u16 d0, q0, #2
> + vrshrn.u16 d2, q1, #2
> + vdup.8 d1, d0[1]
> + vdup.8 d0, d0[0]
> + vdup.8 d3, d2[1]
> + vdup.8 d2, d2[0]
> + b pred8x16_dc_end
> +endfunc
> +
> +function x264_predict_8x16c_dc_neon
> + sub r2, r0, #FDEC_STRIDE
> + mov r1, #FDEC_STRIDE
> + vld1.8 {d0}, [r2,:64]
> + sub r2, r0, #1
> + ldcol.8 d1, r2, r1
> + vdup.32 d2, d0[1]
> + ldcol.8 d3, r2, r1
see above, but I doubt that using GPRs as on arm64 will be faster here
> + vtrn.32 d0, d1
> + vtrn.32 d2, d3
> + vpaddl.u8 q0, q0
> + vpaddl.u8 q1, q1
> + vpadd.u16 d0, d0, d1
> + vpadd.u16 d2, d2, d3
> + vpadd.u16 d1, d0, d0
> + vpadd.u16 d3, d2, d2
> + vrshrn.u16 d4, q0, #3
> + vrshrn.u16 d5, q0, #2
> + vrshrn.u16 d6, q1, #3
> + vrshrn.u16 d7, q1, #2
> + vdup.8 d0, d4[4]
> + vdup.8 d1, d5[3]
> + vdup.8 d16, d5[2]
> + vdup.8 d17, d4[5]
> + vtrn.32 q0, q8
> + vdup.8 d2, d7[1]
> + vdup.8 d3, d7[3]
> + vdup.8 d16, d6[4]
> + vdup.8 d17, d6[5]
> + vtrn.32 q1, q8
> +pred8x16_dc_end:
> + add r2, r0, r1, lsl #2
> +.rept 4
> + vst1.8 {d0}, [r0,:64], r1
> + vst1.8 {d1}, [r2,:64], r1
> +.endr
> + add r2, r2, r1, lsl #2
> + add r0, r0, r1, lsl #2
> +.rept 4
> + vst1.8 {d2}, [r0,:64], r1
> + vst1.8 {d3}, [r2,:64], r1
> +.endr
r3 and r12 are free too; you could try to write all 4 registers at once
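e.g. (untested):

pred8x16_dc_end:
    add     r2,  r0,  r1,  lsl #2   @ rows 4-7
    add     r3,  r0,  r1,  lsl #3   @ rows 8-11
    add     ip,  r2,  r1,  lsl #3   @ rows 12-15
.rept 4
    vst1.8  {d0}, [r0,:64], r1
    vst1.8  {d1}, [r2,:64], r1
    vst1.8  {d2}, [r3,:64], r1
    vst1.8  {d3}, [ip,:64], r1
.endr
    bx      lr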
> + bx lr
> +endfunc
> +
> +function x264_predict_8x16c_h_neon
> + sub r1, r0, #1
> + mov ip, #FDEC_STRIDE
> +.rept 8
> + vld1.8 {d0[]}, [r1], ip
> + vld1.8 {d2[]}, [r1], ip
> + vst1.64 {d0}, [r0,:64], ip
> + vst1.64 {d2}, [r0,:64], ip
> +.endr
> + bx lr
> +endfunc
> +
> +function x264_predict_8x16c_v_neon
> + sub r0, r0, #FDEC_STRIDE
> + mov ip, #FDEC_STRIDE
> + vld1.64 {d0}, [r0,:64], ip
> +.rept 16
> + vst1.64 {d0}, [r0,:64], ip
this would be faster if you used more than one GPR for writeback.
vldr/vstr would probably be faster still since they have an immediate
offset
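e.g. (untested sketch; with FDEC_STRIDE = 32 all offsets fit in
vstr's immediate range, but note vstr can't encode the :64 alignment
hint, so whether this wins may depend on the core):

function x264_predict_8x16c_v_neon
    vldr    d0,  [r0, #-FDEC_STRIDE]    @ top row
    vstr    d0,  [r0, #0*FDEC_STRIDE]
    vstr    d0,  [r0, #1*FDEC_STRIDE]
    vstr    d0,  [r0, #2*FDEC_STRIDE]
    vstr    d0,  [r0, #3*FDEC_STRIDE]
    vstr    d0,  [r0, #4*FDEC_STRIDE]
    vstr    d0,  [r0, #5*FDEC_STRIDE]
    vstr    d0,  [r0, #6*FDEC_STRIDE]
    vstr    d0,  [r0, #7*FDEC_STRIDE]
    vstr    d0,  [r0, #8*FDEC_STRIDE]
    vstr    d0,  [r0, #9*FDEC_STRIDE]
    vstr    d0,  [r0, #10*FDEC_STRIDE]
    vstr    d0,  [r0, #11*FDEC_STRIDE]
    vstr    d0,  [r0, #12*FDEC_STRIDE]
    vstr    d0,  [r0, #13*FDEC_STRIDE]
    vstr    d0,  [r0, #14*FDEC_STRIDE]
    vstr    d0,  [r0, #15*FDEC_STRIDE]
    bx      lr
endfunc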
> +.endr
> + bx lr
> +endfunc
> +
> +function x264_predict_8x16c_p_neon
> + sub r3, r0, #FDEC_STRIDE
> + mov r1, #FDEC_STRIDE
> + add r2, r3, #4
> + sub r3, r3, #1
> + vld1.32 {d0[0]}, [r3]
> + vld1.32 {d2[0]}, [r2,:32], r1
> + ldcol.8 d1, r3, r1
> + add r3, r3, r1
> + ldcol.8 d3, r3, r1
> + vrev64.32 d16, d3
> + vaddl.u8 q8, d2, d16
> + vrev32.8 d0, d0
> + vsubl.u8 q2, d2, d0
> + vrev64.8 d1, d1
> + vsubl.u8 q3, d3, d1
> + movrel r3, p16weight
> + vld1.16 {q0}, [r3,:128]
> + vmul.s16 d4, d4, d0
> + vmul.s16 q3, q3, q0
> + vpadd.i16 d4, d4, d5
> + vpadd.i16 d6, d6, d7
> + vpaddl.s16 d4, d4 @ d4[0] = H
> + vpaddl.s16 d6, d6
> + vpadd.s32 d6, d6 @ d6[0] = V
> + vshl.i32 d5, d4, #4
> + vadd.s32 d4, d4, d5 @ d4[0] = 17*H
> + vshl.i32 d7, d6, #2
> + vrshrn.s32 d4, q2, #5 @ d4[0] = b
> + vadd.s32 d6, d6, d7 @ d6[0] = 5*V
> + vrshrn.s32 d6, q3, #6 @ d6[0] = c
> + mov r3, #0
> + vshl.i16 d3, d4, #2
> + vsub.i16 d3, d3, d4 @ d3[0] = 3 * b
> + vshl.i16 d2, d6, #3
> + vadd.i16 d3, d3, d2 @ d3[0] = 3 * b + 8 * c
> + vsub.i16 d3, d3, d6 @ d3[0] = 3 * b + 7 * c
> + vrev64.16 d16, d16
> + vadd.i16 d16, d16, d0 @ d16[0] = src[]+src[] + 1
> + vshl.i16 d2, d16, #4 @ d2[0] = a + 16
> + vsub.i16 d2, d2, d3 @ i00
> + vext.16 q0, q0, q0, #7
> + vmov.16 d0[0], r3
> + vmul.i16 q0, q0, d4[0]
> + vdup.16 q1, d2[0]
> + vdup.16 q3, d6[0]
> + vadd.i16 q1, q1, q0
> + mov r3, #16
> +1:
> + vqshrun.s16 d0, q1, #5
> + vadd.i16 q1, q1, q3
> + vst1.8 {d0}, [r0,:64], r1
> + subs r3, r3, #1
> + bne 1b
> + bx lr
> +endfunc
> +
> +
> function x264_predict_16x16_dc_top_neon
> sub r2, r0, #FDEC_STRIDE
> mov r1, #FDEC_STRIDE
> diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
> index e0ba0da..b0aedfc 100644
> --- a/common/arm/predict-c.c
> +++ b/common/arm/predict-c.c
> @@ -61,6 +61,21 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
> #endif // !HIGH_BIT_DEPTH
> }
>
> +void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] )
> +{
> + if (!(cpu&X264_CPU_NEON))
> + return;
> +
> +#if !HIGH_BIT_DEPTH
> + pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_neon;
> + pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
> + pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
> + pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_neon;
> + pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_neon;
> + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_neon;
> +#endif // !HIGH_BIT_DEPTH
> +}
> +
> void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
> {
> if (!(cpu&X264_CPU_NEON))
> diff --git a/common/arm/predict.h b/common/arm/predict.h
> index 242043d..2aa902a 100644
> --- a/common/arm/predict.h
> +++ b/common/arm/predict.h
> @@ -40,6 +40,13 @@ void x264_predict_8x8c_h_neon( uint8_t *src );
> void x264_predict_8x8c_v_neon( uint8_t *src );
> void x264_predict_8x8c_p_neon( uint8_t *src );
>
> +void x264_predict_8x16c_v_neon( uint8_t *src );
> +void x264_predict_8x16c_h_neon( uint8_t *src );
> +void x264_predict_8x16c_dc_neon( uint8_t *src );
> +void x264_predict_8x16c_dc_left_neon( uint8_t *src );
> +void x264_predict_8x16c_dc_top_neon( uint8_t *src );
> +void x264_predict_8x16c_p_neon( uint8_t *src );
> +
> void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
> void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
> void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
> @@ -60,6 +67,7 @@ void x264_predict_16x16_p_neon( uint8_t *src );
> void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
> void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
> void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
> +void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] );
> void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
>
> #endif
> diff --git a/common/predict.c b/common/predict.c
> index c0f2a0b..f7080f0 100644
> --- a/common/predict.c
> +++ b/common/predict.c
> @@ -977,6 +977,10 @@ void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
> x264_predict_8x16c_init_mmx( cpu, pf );
> #endif
>
> +#if HAVE_ARMV6
> + x264_predict_8x16c_init_arm( cpu, pf );
> +#endif
> +
> #if ARCH_AARCH64
> x264_predict_8x16c_init_aarch64( cpu, pf );
> #endif
otherwise ok
Janne