[x264-devel] [PATCH 17/24] arm: Implement x264_deblock_h_chroma_mbaff_neon
Janne Grunau
janne-x264 at jannau.net
Thu Aug 20 17:46:53 CEST 2015
On 2015-08-13 23:59:38 +0300, Martin Storsjö wrote:
> checkasm timing                  Cortex-A7    A8      A9
> deblock_chroma_420_mbaff_c       1944         1706    1526
> deblock_chroma_420_mbaff_neon    1210         873     865
> ---
> common/arm/deblock-a.S | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
> common/deblock.c | 4 ++--
> 2 files changed, 59 insertions(+), 2 deletions(-)
>
> diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
> index 26e95ed..d5210e0 100644
> --- a/common/arm/deblock-a.S
> +++ b/common/arm/deblock-a.S
> @@ -309,6 +309,63 @@ function x264_deblock_h_chroma_422_neon
> pop {pc}
> endfunc
>
> +.macro h264_loop_filter_chroma8
> + vdup.8 d22, r2 @ alpha
> + vmovl.u8 q12, d24
> + vabd.u8 d26, d16, d0 @ abs(p0 - q0)
> + vabd.u8 d28, d18, d16 @ abs(p1 - p0)
> + vsubl.u8 q2, d0, d16
> + vsli.16 d24, d24, #8
> + vshl.i16 q2, q2, #2
> + vabd.u8 d30, d2, d0 @ abs(q1 - q0)
> + vaddw.u8 q2, q2, d18
> + vclt.u8 d26, d26, d22 @ < alpha
> + vsubw.u8 q2, q2, d2
> + vdup.8 d22, r3 @ beta
> + vclt.s8 d20, d24, #0
> + vrshrn.i16 d4, q2, #3
> + vclt.u8 d28, d28, d22 @ < beta
> + vbic d26, d26, d20
> + vclt.u8 d30, d30, d22 @ < beta
> + vand d26, d26, d28
> + vneg.s8 d20, d24
> + vand d26, d26, d30
> + vmin.s8 d4, d4, d24
> + vmovl.u8 q14, d16
> + vand d4, d4, d26
> + vmax.s8 d4, d4, d20
> + vmovl.u8 q11, d0
> + vaddw.s8 q14, q14, d4
> + vsubw.s8 q11, q11, d4
> + vqmovun.s16 d16, q14
> + vqmovun.s16 d0, q11
> +.endm
> +
> +function x264_deblock_h_chroma_mbaff_neon
> + h264_loop_filter_start
> +
> + sub r0, r0, #4
> + vld1.8 {d18}, [r0], r1
> + vld1.8 {d16}, [r0], r1
> + vld1.8 {d0}, [r0], r1
> + vld1.8 {d2}, [r0], r1
> +
> + TRANSPOSE4x4_16 d18, d16, d0, d2
> +
> + h264_loop_filter_chroma8
> +
> + vtrn.16 d16, d0
> +
> + sub r0, r0, r1, lsl #2
> + add r0, r0, #2
> + vst1.32 {d16[0]}, [r0], r1
> + vst1.32 {d0[0]}, [r0], r1
> + vst1.32 {d16[1]}, [r0], r1
> + vst1.32 {d0[1]}, [r0]
> +
> + bx lr
> +endfunc
> +
> function x264_deblock_strength_neon
> ldr ip, [sp]
> vmov.i8 q8, #0
> diff --git a/common/deblock.c b/common/deblock.c
> index 83bda62..1d398ad 100644
> --- a/common/deblock.c
> +++ b/common/deblock.c
> @@ -740,8 +740,8 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
> int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
> int mvy_limit, int bframe );
> void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
> -#if ARCH_AARCH64
> void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
> +#if ARCH_AARCH64
> void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> @@ -874,8 +874,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
> pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
> pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
> pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
> -#if ARCH_AARCH64
> pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
> +#if ARCH_AARCH64
> pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
> pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
> pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
ok
Janne
More information about the x264-devel mailing list