[x264-devel] [PATCH 08/24] arm: Add neon versions of vsad, asd8 and ssd_nv12_core
Janne Grunau
janne-x264 at jannau.net
Tue Aug 18 10:58:47 CEST 2015
On 2015-08-13 23:59:29 +0300, Martin Storsjö wrote:
> These are straight translations of the aarch64 versions.
>
> checkasm timing Cortex-A7 A8 A9
> vsad_c 16234 10984 9850
> vsad_neon 2132 1020 789
>
> asd8_c 5859 3561 3543
> asd8_neon 1407 1279 1250
>
> ssd_nv12_c 608967 593057 427131
> ssd_nv12_neon 73017 34251 41577
> ---
> common/arm/pixel-a.S | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/pixel.h | 6 +++
> common/pixel.c | 3 ++
> 3 files changed, 138 insertions(+)
>
> diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
> index 36858bc..f60fdb5 100644
> --- a/common/arm/pixel-a.S
> +++ b/common/arm/pixel-a.S
> @@ -4,6 +4,7 @@
> * Copyright (C) 2009-2015 x264 project
> *
> * Authors: David Conrad <lessen42 at gmail.com>
> + * Janne Grunau <janne-x264 at jannau.net>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -388,6 +389,59 @@ SAD_X_FUNC 4, 8, 16
> SAD_X_FUNC 4, 16, 8
> SAD_X_FUNC 4, 16, 16
>
> +function x264_pixel_vsad_neon
> + subs r2, r2, #2
> + vld1.8 {q0}, [r0], r1
> + vld1.8 {q1}, [r0], r1
> + vabdl.u8 q2, d0, d2
> + vabdl.u8 q3, d1, d3
> + ble 2f
> +1:
> + subs r2, r2, #2
> + vld1.8 {q0}, [r0], r1
> + vabal.u8 q2, d2, d0
> + vabal.u8 q3, d3, d1
> + vld1.8 {q1}, [r0], r1
> + blt 2f
> + vabal.u8 q2, d0, d2
> + vabal.u8 q3, d1, d3
> + bgt 1b
> +2:
> + vadd.u16 q0, q2, q3
> + HORIZ_ADD d0, d0, d1
> + vmov.32 r0, d0[0]
> + bx lr
> +endfunc
> +
> +function x264_pixel_asd8_neon
> + ldr r12, [sp, #0]
> + sub r12, r12, #2
> + vld1.8 {d0}, [r0], r1
> + vld1.8 {d1}, [r2], r3
> + vld1.8 {d2}, [r0], r1
> + vld1.8 {d3}, [r2], r3
> + vsubl.u8 q8, d0, d1
> +1:
> + subs r12, r12, #2
> + vld1.8 {d4}, [r0], r1
> + vld1.8 {d5}, [r2], r3
> + vsubl.u8 q9, d2, d3
> + vsubl.u8 q10, d4, d5
> + vadd.s16 q8, q9
> + vld1.8 {d2}, [r0], r1
> + vld1.8 {d3}, [r2], r3
> + vadd.s16 q8, q10
> + bgt 1b
> + vsubl.u8 q9, d2, d3
> + vadd.s16 q8, q9
> + vpaddl.s16 q8, q8
> + vpadd.s32 d16, d16, d17
> + vpadd.s32 d16, d16, d17
> + vabs.s32 d16, d16
> + vmov.32 r0, d16[0]
> + bx lr
> +endfunc
> +
>
> .macro SSD_START_4
> vld1.32 {d16[]}, [r0,:32], r1
> @@ -489,6 +543,81 @@ SSD_FUNC 8, 16
> SSD_FUNC 16, 8
> SSD_FUNC 16, 16
>
> +function x264_pixel_ssd_nv12_core_neon
> + vpush {q4-q5}
Why? q12/q13 seem to be free and could be used instead of q4/q5, which would make the vpush/vpop unnecessary.
> + push {r4-r5}
> + ldrd r4, r5, [sp, #40]
> + add r12, r4, #8
> + and r12, r12, #~15
bic r12, r12, #15
would be clearer; bic (immediate) doesn't exist in aarch64
> + vmov.u64 q8, #0
> + vmov.u64 q9, #0
> + sub r1, r1, r12, lsl #1
> + sub r3, r3, r12, lsl #1
> +1:
> + subs r12, r4, #16
> + vld2.8 {d0,d1}, [r0]!
> + vld2.8 {d2,d3}, [r2]!
> + vld2.8 {d4,d5}, [r0]!
> + vld2.8 {d6,d7}, [r2]!
> +
> + vsubl.u8 q10, d0, d2
> + vsubl.u8 q11, d1, d3
> + vmull.s16 q14, d20, d20
> + vmull.s16 q15, d22, d22
> + vsubl.u8 q4, d4, d6
> + vsubl.u8 q5, d5, d7
> + vmlal.s16 q14, d21, d21
> + vmlal.s16 q15, d23, d23
> +
> + blt 4f
> + beq 3f
> +2:
> + vmlal.s16 q14, d8, d8
> + vmlal.s16 q15, d10, d10
> + vld2.8 {d0,d1}, [r0]!
> + vld2.8 {d2,d3}, [r2]!
> + vmlal.s16 q14, d9, d9
> + vmlal.s16 q15, d11, d11
> +
> + subs r12, r12, #16
> + vsubl.u8 q10, d0, d2
> + vsubl.u8 q11, d1, d3
> + vmlal.s16 q14, d20, d20
> + vmlal.s16 q15, d22, d22
> + vld2.8 {d4,d5}, [r0]!
> + vld2.8 {d6,d7}, [r2]!
> + vmlal.s16 q14, d21, d21
> + vmlal.s16 q15, d23, d23
> + blt 4f
> +
> + vsubl.u8 q4, d4, d6
> + vsubl.u8 q5, d5, d7
> + bgt 2b
> +3:
> + vmlal.s16 q14, d8, d8
> + vmlal.s16 q15, d10, d10
> + vmlal.s16 q14, d9, d9
> + vmlal.s16 q15, d11, d11
> +4:
> + subs r5, r5, #1
> + vaddw.s32 q8, q8, d28
> + vaddw.s32 q9, q9, d30
> + add r0, r0, r1
> + add r2, r2, r3
> + vaddw.s32 q8, q8, d29
> + vaddw.s32 q9, q9, d31
> + bgt 1b
> +
> + vadd.u64 d16, d16, d17
> + vadd.u64 d18, d18, d19
> + ldrd r4, r5, [sp, #48]
> + vst1.64 {d16}, [r4]
> + vst1.64 {d18}, [r5]
> +
> + pop {r4-r5}
> + vpop {q4-q5}
> + bx lr
> +endfunc
>
> .macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
> vmull.u8 \qsqr, \dsrc, \dsrc
> diff --git a/common/arm/pixel.h b/common/arm/pixel.h
> index f361b9d..81c21dc 100644
> --- a/common/arm/pixel.h
> +++ b/common/arm/pixel.h
> @@ -52,6 +52,10 @@ DECL_X4( sad, neon )
> DECL_X1( satd, neon )
> DECL_X1( ssd, neon )
>
> +void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
> +
> +int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
> +
> int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
> int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
>
> @@ -71,4 +75,6 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
> int sums[2][4] );
> float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
>
> +int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
> +
> #endif
> diff --git a/common/pixel.c b/common/pixel.c
> index e0ad76c..9904b17 100644
> --- a/common/pixel.c
> +++ b/common/pixel.c
> @@ -1380,6 +1380,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
> pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
> pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
> pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
> + pixf->vsad = x264_pixel_vsad_neon;
> + pixf->asd8 = x264_pixel_asd8_neon;
>
> pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
> pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
> @@ -1392,6 +1394,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
> pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
> pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
>
> + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon;
> pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
> pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
ok otherwise
Janne
More information about the x264-devel
mailing list