[x264-devel] [PATCH 11/24] arm: Implement neon 8x16c intra predict functions

Janne Grunau janne-x264 at jannau.net
Wed Aug 19 12:35:01 CEST 2015


On 2015-08-13 23:59:32 +0300, Martin Storsjö wrote:
> This implements the same functions as are already implemented for 8x8c,
> and for 8x16c on aarch64.
> 
> Some of the simpler ones actually turn out to be slower than the
> plain C version, at least on some CPUs.

See 'arm64: optimize various intra_predict asm functions' 
(<1439822360-17282-1-git-send-email-janne-x264 at jannau.net>)

That makes all intra_predict functions at least as fast as the C version 
on a cortex-a53 in arm64 mode.

> checkasm timing       Cortex-A7      A8     A9
> intra_predict_8x16c_dc_c     1347    910    1017
> intra_predict_8x16c_dc_neon  1271    1366   1247
> intra_predict_8x16c_dcl_c    859     677    692
> intra_predict_8x16c_dcl_neon 1006    1209   1065
> intra_predict_8x16c_dct_c    871     540    590
> intra_predict_8x16c_dct_neon 672     511    657
> intra_predict_8x16c_h_c      937     712    719
> intra_predict_8x16c_h_neon   722     682    672
> intra_predict_8x16c_p_c      10184   9967   8652
> intra_predict_8x16c_p_neon   2617    1973   1983
> intra_predict_8x16c_v_c      610     380    429
> intra_predict_8x16c_v_neon   570     513    507
> ---
>  common/arm/predict-a.S |  158 ++++++++++++++++++++++++++++++++++++++++++++++++
>  common/arm/predict-c.c |   15 +++++
>  common/arm/predict.h   |    8 +++
>  common/predict.c       |    4 ++
>  4 files changed, 185 insertions(+)
> 
> diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
> index 7e5d9d3..228fd2e 100644
> --- a/common/arm/predict-a.S
> +++ b/common/arm/predict-a.S
> @@ -5,6 +5,7 @@
>   *
>   * Authors: David Conrad <lessen42 at gmail.com>
>   *          Mans Rullgard <mans at mansr.com>
> + *          Martin Storsjo <martin at martin.st>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -552,6 +553,163 @@ function x264_predict_8x8c_p_neon
>  endfunc
>  
>  
> +function x264_predict_8x16c_dc_top_neon
> +    sub         r2,  r0,  #FDEC_STRIDE
> +    mov         r1,  #FDEC_STRIDE
> +    vld1.8      {d0}, [r2,:64]
> +    vpaddl.u8   d0,  d0
> +    vpadd.u16   d0,  d0,  d0
> +    vrshrn.u16  d0,  q0,  #2
> +    vdup.8      d1,  d0[1]
> +    vdup.8      d0,  d0[0]
> +    vtrn.32     d0,  d1

vmov d1, d0


> +    vmov        q1,  q0
> +    b           pred8x16_dc_end

since we need every cycle here, it probably makes sense to avoid the 
branch and the vmov
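
i.e. keep the vdup/vtrn as above and end the function with its own 
store loop instead; untested sketch (d0 holds the full row after the 
vtrn, and all 16 rows are identical for dc_top):

    add         r2,  r0,  r1,  lsl #3  @ r0 + 8*FDEC_STRIDE
.rept 8
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d0}, [r2,:64], r1
.endr
    bx          lr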

> +endfunc
> +
> +function x264_predict_8x16c_dc_left_neon
> +    mov         r1,  #FDEC_STRIDE
> +    sub         r2,  r0,  #1
> +    ldcol.8     d0,  r2,  r1
> +    ldcol.8     d2,  r2,  r1

the ldcol is probably the major factor that makes the neon versions 
slower. one idea would be to use an ldcol.16 which interleaves the loads 
to both registers.
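
untested sketch of such a macro; it needs a second source pointer 
(e.g. r3 = r2 + 8*FDEC_STRIDE) so the two lane-load chains can proceed 
independently:

.macro ldcol.16  rd1,  rd2,  rs1,  rs2,  rt
    vld1.8      {\rd1[0]}, [\rs1], \rt
    vld1.8      {\rd2[0]}, [\rs2], \rt
    vld1.8      {\rd1[1]}, [\rs1], \rt
    vld1.8      {\rd2[1]}, [\rs2], \rt
    vld1.8      {\rd1[2]}, [\rs1], \rt
    vld1.8      {\rd2[2]}, [\rs2], \rt
    vld1.8      {\rd1[3]}, [\rs1], \rt
    vld1.8      {\rd2[3]}, [\rs2], \rt
    vld1.8      {\rd1[4]}, [\rs1], \rt
    vld1.8      {\rd2[4]}, [\rs2], \rt
    vld1.8      {\rd1[5]}, [\rs1], \rt
    vld1.8      {\rd2[5]}, [\rs2], \rt
    vld1.8      {\rd1[6]}, [\rs1], \rt
    vld1.8      {\rd2[6]}, [\rs2], \rt
    vld1.8      {\rd1[7]}, [\rs1], \rt
    vld1.8      {\rd2[7]}, [\rs2], \rt
.endm

which would replace the two ldcol.8 calls with

    add         r3,  r2,  r1,  lsl #3
    ldcol.16    d0,  d2,  r2,  r3,  r1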

> +    vpaddl.u8   d0,  d0
> +    vpaddl.u8   d2,  d2

have you tried loading into d0 and d1 and using a single vpaddl.u8 q0, q0?

> +    vpadd.u16   d0,  d0,  d0
> +    vpadd.u16   d2,  d2,  d2

vpadd d0, d0, d2 (or d1)
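
putting both together, the whole reduction could shrink to roughly 
(untested; loads the left column into d0/d1 so one vpaddl covers both 
halves):

    ldcol.8     d0,  r2,  r1           @ left[0..7]
    ldcol.8     d1,  r2,  r1           @ left[8..15]
    vpaddl.u8   q0,  q0                @ all 8 pair sums in one instruction
    vpadd.u16   d0,  d0,  d1           @ four 4-pixel sums
    vrshrn.u16  d0,  q0,  #2           @ dcs in d0[0..3], rest is junk
    vdup.8      d3,  d0[3]
    vdup.8      d2,  d0[2]
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    b           pred8x16_dc_end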

> +    vrshrn.u16  d0,  q0,  #2
> +    vrshrn.u16  d2,  q1,  #2
> +    vdup.8      d1,  d0[1]
> +    vdup.8      d0,  d0[0]
> +    vdup.8      d3,  d2[1]
> +    vdup.8      d2,  d2[0]
> +    b           pred8x16_dc_end
> +endfunc
> +
> +function x264_predict_8x16c_dc_neon
> +    sub         r2,  r0,  #FDEC_STRIDE
> +    mov         r1,  #FDEC_STRIDE
> +    vld1.8      {d0}, [r2,:64]
> +    sub         r2,  r0,  #1
> +    ldcol.8     d1,  r2,  r1
> +    vdup.32     d2,  d0[1]
> +    ldcol.8     d3,  r2,  r1

see above, but I doubt that using gprs as on arm64 will be faster here

> +    vtrn.32     d0,  d1
> +    vtrn.32     d2,  d3
> +    vpaddl.u8   q0,  q0
> +    vpaddl.u8   q1,  q1
> +    vpadd.u16   d0,  d0,  d1
> +    vpadd.u16   d2,  d2,  d3
> +    vpadd.u16   d1,  d0,  d0
> +    vpadd.u16   d3,  d2,  d2
> +    vrshrn.u16  d4,  q0,  #3
> +    vrshrn.u16  d5,  q0,  #2
> +    vrshrn.u16  d6,  q1,  #3
> +    vrshrn.u16  d7,  q1,  #2
> +    vdup.8      d0,  d4[4]
> +    vdup.8      d1,  d5[3]
> +    vdup.8      d16, d5[2]
> +    vdup.8      d17, d4[5]
> +    vtrn.32     q0,  q8
> +    vdup.8      d2,  d7[1]
> +    vdup.8      d3,  d7[3]
> +    vdup.8      d16, d6[4]
> +    vdup.8      d17, d6[5]
> +    vtrn.32     q1,  q8
> +pred8x16_dc_end:
> +    add         r2,  r0,  r1,  lsl #2
> +.rept 4
> +    vst1.8      {d0}, [r0,:64], r1
> +    vst1.8      {d1}, [r2,:64], r1
> +.endr
> +    add         r2,  r2,  r1,  lsl #2
> +    add         r0,  r0,  r1,  lsl #2
> +.rept 4
> +    vst1.8      {d2}, [r0,:64], r1
> +    vst1.8      {d3}, [r2,:64], r1
> +.endr

r3 and r12 are free too, so you could try writing all 4 registers at once
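
e.g. (untested):

pred8x16_dc_end:
    add         r2,  r0,  r1,  lsl #2  @ the four pointers are 4 rows apart
    add         r3,  r0,  r1,  lsl #3
    add         r12, r2,  r1,  lsl #3
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
    vst1.8      {d2}, [r3,:64], r1
    vst1.8      {d3}, [r12,:64], r1
.endr
    bx          lr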

> +    bx          lr
> +endfunc
> +
> +function x264_predict_8x16c_h_neon
> +    sub         r1, r0, #1
> +    mov         ip, #FDEC_STRIDE
> +.rept 8
> +    vld1.8      {d0[]}, [r1], ip
> +    vld1.8      {d2[]}, [r1], ip
> +    vst1.64     {d0}, [r0,:64], ip
> +    vst1.64     {d2}, [r0,:64], ip
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x264_predict_8x16c_v_neon
> +    sub         r0, r0, #FDEC_STRIDE
> +    mov         ip, #FDEC_STRIDE
> +    vld1.64     {d0}, [r0,:64], ip
> +.rept 16
> +    vst1.64     {d0}, [r0,:64], ip

this would be faster if you used more than one gpr for writeback.  
vldr/vstr would probably be even faster since they take an immediate 
offset
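
e.g. (untested; assumes 15*FDEC_STRIDE stays within vstr's 1020 byte 
immediate range, which holds for the current FDEC_STRIDE of 32):

function x264_predict_8x16c_v_neon
    sub         r1,  r0,  #FDEC_STRIDE
    vld1.64     {d0}, [r1,:64]         @ row above the block
.irp x, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    vstr        d0,  [r0, #(\x * FDEC_STRIDE)]  @ no writeback dependency
.endr
    bx          lr
endfunc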

> +.endr
> +    bx          lr
> +endfunc
> +
> +function x264_predict_8x16c_p_neon
> +    sub         r3,  r0,  #FDEC_STRIDE
> +    mov         r1,  #FDEC_STRIDE
> +    add         r2,  r3,  #4
> +    sub         r3,  r3,  #1
> +    vld1.32     {d0[0]}, [r3]
> +    vld1.32     {d2[0]}, [r2,:32], r1
> +    ldcol.8     d1,  r3,  r1
> +    add         r3,  r3,  r1
> +    ldcol.8     d3,  r3,  r1
> +    vrev64.32   d16, d3
> +    vaddl.u8    q8,  d2,  d16
> +    vrev32.8    d0,  d0
> +    vsubl.u8    q2,  d2,  d0
> +    vrev64.8    d1,  d1
> +    vsubl.u8    q3,  d3,  d1
> +    movrel      r3,  p16weight
> +    vld1.16     {q0}, [r3,:128]
> +    vmul.s16    d4,  d4,  d0
> +    vmul.s16    q3,  q3,  q0
> +    vpadd.i16   d4,  d4,  d5
> +    vpadd.i16   d6,  d6,  d7
> +    vpaddl.s16  d4,  d4        @ d4[0] = H
> +    vpaddl.s16  d6,  d6
> +    vpadd.s32   d6,  d6        @ d6[0] = V
> +    vshl.i32    d5,  d4,  #4
> +    vadd.s32    d4,  d4,  d5   @ d4[0] = 17*H
> +    vshl.i32    d7,  d6,  #2
> +    vrshrn.s32  d4,  q2,  #5   @ d4[0] = b
> +    vadd.s32    d6,  d6,  d7   @ d6[0] = 5*V
> +    vrshrn.s32  d6,  q3,  #6   @ d6[0] = c
> +    mov         r3,  #0
> +    vshl.i16    d3,  d4,  #2
> +    vsub.i16    d3,  d3,  d4   @ d2[0] = 3 * b
> +    vshl.i16    d2,  d6,  #3
> +    vadd.i16    d3,  d3,  d2   @ d2[0] = 3 * b + 8 * c
> +    vsub.i16    d3,  d3,  d6   @ d2[0] = 3 * b + 7 * c
> +    vrev64.16   d16, d16
> +    vadd.i16    d16, d16, d0   @ d16[0] = src[]+src[] + 1
> +    vshl.i16    d2,  d16, #4   @ d3[0] = a + 16
> +    vsub.i16    d2,  d2,  d3   @ i00
> +    vext.16     q0,  q0,  q0,  #7
> +    vmov.16     d0[0], r3
> +    vmul.i16    q0,  q0,  d4[0]
> +    vdup.16     q1,  d2[0]
> +    vdup.16     q3,  d6[0]
> +    vadd.i16    q1,  q1,  q0
> +    mov         r3,  #16
> +1:
> +    vqshrun.s16 d0,  q1,  #5
> +    vadd.i16    q1,  q1,  q3
> +    vst1.8      {d0}, [r0,:64], r1
> +    subs        r3,  r3,  #1
> +    bne         1b
> +    bx          lr
> +endfunc
> +
> +
>  function x264_predict_16x16_dc_top_neon
>      sub         r2,  r0,  #FDEC_STRIDE
>      mov         r1,  #FDEC_STRIDE
> diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
> index e0ba0da..b0aedfc 100644
> --- a/common/arm/predict-c.c
> +++ b/common/arm/predict-c.c
> @@ -61,6 +61,21 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
>  #endif // !HIGH_BIT_DEPTH
>  }
>  
> +void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] )
> +{
> +    if (!(cpu&X264_CPU_NEON))
> +        return;
> +
> +#if !HIGH_BIT_DEPTH
> +    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_neon;
> +    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
> +    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
> +    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
> +    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_neon;
> +    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
> +#endif // !HIGH_BIT_DEPTH
> +}
> +
>  void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
>  {
>      if (!(cpu&X264_CPU_NEON))
> diff --git a/common/arm/predict.h b/common/arm/predict.h
> index 242043d..2aa902a 100644
> --- a/common/arm/predict.h
> +++ b/common/arm/predict.h
> @@ -40,6 +40,13 @@ void x264_predict_8x8c_h_neon( uint8_t *src );
>  void x264_predict_8x8c_v_neon( uint8_t *src );
>  void x264_predict_8x8c_p_neon( uint8_t *src );
>  
> +void x264_predict_8x16c_v_neon( uint8_t *src );
> +void x264_predict_8x16c_h_neon( uint8_t *src );
> +void x264_predict_8x16c_dc_neon( uint8_t *src );
> +void x264_predict_8x16c_dc_left_neon( uint8_t *src );
> +void x264_predict_8x16c_dc_top_neon( uint8_t *src );
> +void x264_predict_8x16c_p_neon( uint8_t *src );
> +
>  void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
>  void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
>  void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
> @@ -60,6 +67,7 @@ void x264_predict_16x16_p_neon( uint8_t *src );
>  void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
>  void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
>  void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
> +void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] );
>  void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
>  
>  #endif
> diff --git a/common/predict.c b/common/predict.c
> index c0f2a0b..f7080f0 100644
> --- a/common/predict.c
> +++ b/common/predict.c
> @@ -977,6 +977,10 @@ void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
>      x264_predict_8x16c_init_mmx( cpu, pf );
>  #endif
>  
> +#if HAVE_ARMV6
> +    x264_predict_8x16c_init_arm( cpu, pf );
> +#endif
> +
>  #if ARCH_AARCH64
>      x264_predict_8x16c_init_aarch64( cpu, pf );
>  #endif

otherwise ok

Janne

