[x264-devel] [PATCH 12/24] arm: Implement x264_plane_copy_neon

Janne Grunau janne-x264 at jannau.net
Fri Aug 21 18:57:12 CEST 2015


On 2015-08-13 23:59:33 +0300, Martin Storsjö wrote:
> checkasm timing       Cortex-A7      A8     A9
> plane_copy_c                  13253  10923  9016
> plane_copy_neon               7339   5191   8939
> ---
>  common/arm/mc-a.S |   32 ++++++++++++++++++++++++++++++++
>  common/arm/mc-c.c |    3 +++
>  2 files changed, 35 insertions(+)
> 
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 695a6ca..4225c71 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -6,6 +6,7 @@
>   * Authors: David Conrad <lessen42 at gmail.com>
>   *          Mans Rullgard <mans at mansr.com>
>   *          Stefan Groenroos <stefan.gronroos at gmail.com>
> + *          Janne Grunau <janne-x264 at jannau.net>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -1461,6 +1462,37 @@ function x264_load_deinterleave_chroma_fenc_neon
>      bx              lr
>  endfunc
>  
> +function x264_plane_copy_neon
> +    push            {r4-r5}
> +    ldrd            r4,  r5, [sp, #8]

You could use r4 and lr for the common pop {..., pc} pattern, not that
it'll make a difference here.

> +    add             r12, r4,  #15
> +    and             r4,  r12, #~15
> +    sub             r1,  r1,  r4
> +    sub             r3,  r3,  r4
> +1:
> +    mov             r12, r4
> +16:
> +    tst             r12, #16
> +    beq             32f
> +    subs            r12, r12, #16
> +    vld1.8          {q0}, [r2]!
> +    vst1.8          {q0}, [r0]!
> +    beq             0f
> +32:
> +    subs            r12, r12, #32
> +    vld1.8          {q0, q1}, [r2]!
> +    vst1.8          {q0, q1}, [r0]!
> +    bgt             32b
> +0:
> +    subs            r5,  r5,  #1
> +    add             r2,  r2,  r3
> +    add             r0,  r0,  r1
> +    bgt             1b
> +
> +    pop             {r4-r5}
> +    bx              lr
> +endfunc
> +
>  function x264_plane_copy_deinterleave_neon
>      push            {r4-r7, lr}
>      ldrd            r6, r7, [sp, #28]
> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
> index 2633772..2ebf1ba 100644
> --- a/common/arm/mc-c.c
> +++ b/common/arm/mc-c.c
> @@ -47,6 +47,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
>  void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
>  void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
>  
> +void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
> +                           pixel *src, intptr_t i_src, int w, int h );
>  void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
>                                           pixel *dstv, intptr_t i_dstv,
>                                           pixel *src,  intptr_t i_src, int w, int h );
> @@ -239,6 +241,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
>      pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
>      pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
>  
> +    pf->plane_copy              = x264_plane_copy_neon;
>      pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
>      pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
>      pf->plane_copy_interleave = x264_plane_copy_interleave_neon;

ok for me either way

Janne


More information about the x264-devel mailing list