[x264-devel] [PATCH 12/24] arm: Implement x264_plane_copy_neon
Janne Grunau
janne-x264 at jannau.net
Fri Aug 21 18:57:12 CEST 2015
On 2015-08-13 23:59:33 +0300, Martin Storsjö wrote:
> checkasm timing Cortex-A7 A8 A9
> plane_copy_c 13253 10923 9016
> plane_copy_neon 7339 5191 8939
> ---
> common/arm/mc-a.S | 32 ++++++++++++++++++++++++++++++++
> common/arm/mc-c.c | 3 +++
> 2 files changed, 35 insertions(+)
>
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 695a6ca..4225c71 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -6,6 +6,7 @@
> * Authors: David Conrad <lessen42 at gmail.com>
> * Mans Rullgard <mans at mansr.com>
> * Stefan Groenroos <stefan.gronroos at gmail.com>
> + * Janne Grunau <janne-x264 at jannau.net>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -1461,6 +1462,37 @@ function x264_load_deinterleave_chroma_fenc_neon
> bx lr
> endfunc
>
> +function x264_plane_copy_neon
> + push {r4-r5}
> + ldrd r4, r5, [sp, #8]
You could use r4 and lr for the common pop {..., pc} pattern, not that
it'll make a difference here.
> + add r12, r4, #15
> + and r4, r12, #~15
> + sub r1, r1, r4
> + sub r3, r3, r4
> +1:
> + mov r12, r4
> +16:
> + tst r12, #16
> + beq 32f
> + subs r12, r12, #16
> + vld1.8 {q0}, [r2]!
> + vst1.8 {q0}, [r0]!
> + beq 0f
> +32:
> + subs r12, r12, #32
> + vld1.8 {q0, q1}, [r2]!
> + vst1.8 {q0, q1}, [r0]!
> + bgt 32b
> +0:
> + subs r5, r5, #1
> + add r2, r2, r3
> + add r0, r0, r1
> + bgt 1b
> +
> + pop {r4-r5}
> + bx lr
> +endfunc
> +
> function x264_plane_copy_deinterleave_neon
> push {r4-r7, lr}
> ldrd r6, r7, [sp, #28]
> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
> index 2633772..2ebf1ba 100644
> --- a/common/arm/mc-c.c
> +++ b/common/arm/mc-c.c
> @@ -47,6 +47,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
> void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
> void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
>
> +void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
> + pixel *src, intptr_t i_src, int w, int h );
> void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
> pixel *dstv, intptr_t i_dstv,
> pixel *src, intptr_t i_src, int w, int h );
> @@ -239,6 +241,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
> pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
> pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
>
> + pf->plane_copy = x264_plane_copy_neon;
> pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
> pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
> pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
ok for me either way
Janne
More information about the x264-devel
mailing list