[x264-devel] [PATCH 09/24] arm: Add x264_nal_escape_neon

Janne Grunau janne-x264 at jannau.net
Tue Aug 18 11:11:27 CEST 2015


On 2015-08-13 23:59:30 +0300, Martin Storsjö wrote:
> checkasm timing      Cortex-A7      A8      A9
> nal_escape_c                908338  878032  633692
> nal_escape_neon             379946  451936  373471
> ---
>  Makefile                 |    2 +-
>  common/arm/bitstream-a.S |   89 ++++++++++++++++++++++++++++++++++++++++++++++
>  common/bitstream.c       |    4 +++
>  3 files changed, 94 insertions(+), 1 deletion(-)
>  create mode 100644 common/arm/bitstream-a.S
> 
> diff --git a/Makefile b/Makefile
> index 6193c59..4403a11 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -119,7 +119,7 @@ ifeq ($(SYS_ARCH),ARM)
>  ifneq ($(AS),)
>  ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
>            common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
> -          common/arm/predict-a.S
> +          common/arm/predict-a.S common/arm/bitstream-a.S
>  SRCS   += common/arm/mc-c.c common/arm/predict-c.c
>  OBJASM  = $(ASMSRC:%.S=%.o)
>  endif
> diff --git a/common/arm/bitstream-a.S b/common/arm/bitstream-a.S
> new file mode 100644
> index 0000000..62f9c96
> --- /dev/null
> +++ b/common/arm/bitstream-a.S
> @@ -0,0 +1,89 @@
> +/*****************************************************************************
> + * bitstream-a.S: arm bitstream functions
> + *****************************************************************************
> + * Copyright (C) 2014-2015 x264 project
> + *
> + * Authors: Janne Grunau <janne-x264 at jannau.net>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at licensing at x264.com.
> + *****************************************************************************/
> +
> +#include "asm.S"
> +
> +function x264_nal_escape_neon
> +    push        {r4-r9}

I'm not quite sure you need all those registers. I certainly only
used that many because arm64 has enough caller-saved registers. Also, lr
is usually included in the register list when registers are pushed to and
popped from the stack. The function return then becomes
pop {rx-ry, pc} instead of pop {rx-ry}; bx lr

> +    vpush       {q4-q7}

Please use q8-q15. I know these register number conflicts are annoying
when porting NEON code between arm and arm64 (both ways, I endured it).

> +    vmov.u8     q0,  #0xff
> +    vmov.u8     q4,  #4
> +    mov         r3,  #3
> +    subs        r6,  r1,  r2
> +    beq         99f
> +0:
> +    cmn         r6,  #15
> +    blt         16f
> +    mov         r1,  r2
> +    b           100f
> +16:
> +    vld1.8      {q1}, [r1]!
> +    vext.8      q2,  q0,  q1, #14
> +    vext.8      q3,  q0,  q1, #15
> +    vcgt.u8     q7,  q4,  q1
> +    vceq.u8     q5,  q2,  #0
> +    vceq.u8     q6,  q3,  #0
> +    vand        q5,  q5,  q7
> +    vand        q5,  q5,  q6
> +    vshrn.u16   d14, q5,  #4
> +    vmov        r7,  r8,  d14
> +    orrs        r7,  r7,  r8
> +    beq         16f
> +    mov         r6,  #-16
> +100:
> +    vmov.u8     r5,  d1[6]
> +    vmov.u8     r4,  d1[7]
> +    orr         r5,  r4,  r5, lsl #8
> +101:
> +    ldrb        r4,  [r1, r6]
> +    orr         r9,  r4,  r5, lsl #16
> +    cmp         r9,  #3
> +    bhi         102f
> +    strb        r3,  [r0], #1
> +    orr         r5,  r3,  r5, lsl #8
> +102:
> +    adds        r6,  r6,  #1
> +    strb        r4,  [r0], #1
> +    orr         r5,  r4,  r5, lsl #8
> +    blt         101b
> +    subs        r6,  r1,  r2
> +    lsr         r9,  r5,  #8
> +    vmov.u8     d1[6],  r9
> +    vmov.u8     d1[7],  r5
> +    blt         0b
> +
> +    vpop        {q4-q7}
> +    pop         {r4-r9}
> +    bx          lr
> +16:
> +    subs        r6,  r1,  r2
> +    vst1.8      {q1}, [r0]!
> +    vmov        q0, q1
> +    blt         0b
> +99:
> +    vpop        {q4-q7}
> +    pop         {r4-r9}
> +    bx          lr
> +endfunc
> diff --git a/common/bitstream.c b/common/bitstream.c
> index 6ca1f44..ec9836a 100644
> --- a/common/bitstream.c
> +++ b/common/bitstream.c
> @@ -144,6 +144,10 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
>      }
>  #endif
>  #endif
> +#if HAVE_ARMV6
> +    if( cpu&X264_CPU_NEON )
> +        pf->nal_escape = x264_nal_escape_neon;
> +#endif
>  #if ARCH_AARCH64
>      if( cpu&X264_CPU_NEON )
>          pf->nal_escape = x264_nal_escape_neon;

otherwise ok

Janne


More information about the x264-devel mailing list