[x264-devel] [PATCH 18/24] arm: Implement chroma intra deblock
Janne Grunau
janne-x264 at jannau.net
Sun Aug 23 14:27:53 CEST 2015
On 2015-08-13 23:59:39 +0300, Martin Storsjö wrote:
> checkasm timing Cortex-A7 A8 A9
> deblock_chroma_420_intra_mbaff_c 1486 1274 1183
> deblock_chroma_420_intra_mbaff_neon 999 726 644
> deblock_chroma_intra[1]_c 2969 2396 2324
> deblock_chroma_intra[1]_neon 949 600 575
> deblock_h_chroma_420_intra_c 2886 2535 2265
> deblock_h_chroma_420_intra_neon 1531 1146 1028
> deblock_h_chroma_422_intra_c 6205 4910 4782
> deblock_h_chroma_422_intra_neon 2974 2031 2074
> deblock_luma_intra[0]_c 6051 4695 4349
> deblock_luma_intra[0]_neon 3554 2444 2414
> deblock_luma_intra[1]_c 10381 5860 5331
> deblock_luma_intra[1]_neon 2895 1572 1683
deblock_h_chroma_intra_mbaff is missing, and deblock_luma_intra is not
affected by the patch.
All functions are missing the zero check for alpha and beta, but I'm not
actually sure if that's ever going to trigger. The encoder seems to
disable deblocking anyway if alpha or beta becomes 0.
> ---
> common/arm/deblock-a.S | 116 ++++++++++++++++++++++++++++++++++++++++++++++++
> common/deblock.c | 4 +-
> 2 files changed, 118 insertions(+), 2 deletions(-)
>
> diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
> index d5210e0..f1f6eaf 100644
> --- a/common/arm/deblock-a.S
> +++ b/common/arm/deblock-a.S
> @@ -366,6 +366,122 @@ function x264_deblock_h_chroma_mbaff_neon
> bx lr
> endfunc
>
> +.macro h264_loop_filter_chroma_intra, width=16
> + vdup.8 q11, r2 @ alpha
> + vabd.u8 q13, q8, q0 @ abs(p0 - q0)
> + vabd.u8 q14, q9, q8 @ abs(p1 - p0)
> + vabd.u8 q15, q1, q0 @ abs(q1 - q0)
> + vclt.u8 q13, q13, q11 @ < alpha
> + vdup.8 q11, r3 @ beta
> + vclt.u8 q14, q14, q11 @ < beta
> + vclt.u8 q15, q15, q11 @ < beta
> + vand q13, q13, q14
> + vand q13, q13, q15
> +
> + vshll.u8 q14, d18, #1
> + vshll.u8 q2, d2, #1
> +.ifc \width, 16
> + vshll.u8 q15, d19, #1
> + vshll.u8 q3, d3, #1
> + vaddl.u8 q12, d17, d3
> + vaddl.u8 q10, d1, d19
> +.endif
> + vaddl.u8 q11, d16, d2
> + vaddl.u8 q1, d18, d0 @ or vaddw q2, to not clobber q1
> + vadd.u16 q14, q14, q11
> + vadd.u16 q2, q2, q1
> +.ifc \width, 16
> + vadd.u16 q15, q15, q12
> + vadd.u16 q3, q3, q10
> +.endif
> + vqrshrn.u16 d28, q14, #2
> + vqrshrn.u16 d4, q2, #2
> +.ifc \width, 16
> + vqrshrn.u16 d29, q15, #2
> + vqrshrn.u16 d5, q3, #2
> +.endif
> + vbit q8, q14, q13
> + vbit q0, q2, q13
> +.endm
> +
> +function x264_deblock_v_chroma_intra_neon
> + sub r0, r0, r1, lsl #1
> + vld2.8 {d18,d19}, [r0,:128], r1
> + vld2.8 {d16,d17}, [r0,:128], r1
> + vld2.8 {d0, d1}, [r0,:128], r1
> + vld2.8 {d2, d3}, [r0,:128]
> +
> + h264_loop_filter_chroma_intra
> +
> + sub r0, r0, r1, lsl #1
> + vst2.8 {d16,d17}, [r0,:128], r1
> + vst2.8 {d0, d1}, [r0,:128], r1
> +
> + bx lr
> +endfunc
> +
> +function x264_deblock_h_chroma_intra_neon
> + sub r0, r0, #4
> + vld1.8 {d18}, [r0], r1
> + vld1.8 {d16}, [r0], r1
> + vld1.8 {d0}, [r0], r1
> + vld1.8 {d2}, [r0], r1
> + vld1.8 {d19}, [r0], r1
> + vld1.8 {d17}, [r0], r1
> + vld1.8 {d1}, [r0], r1
> + vld1.8 {d3}, [r0], r1
> +
> + TRANSPOSE4x4_16 q9, q8, q0, q1
> +
> + h264_loop_filter_chroma_intra
> +
> + vtrn.16 q8, q0
> +
> + sub r0, r0, r1, lsl #3
> + add r0, r0, #2
> + vst1.32 {d16[0]}, [r0], r1
> + vst1.32 {d0[0]}, [r0], r1
> + vst1.32 {d16[1]}, [r0], r1
> + vst1.32 {d0[1]}, [r0], r1
> + vst1.32 {d17[0]}, [r0], r1
> + vst1.32 {d1[0]}, [r0], r1
> + vst1.32 {d17[1]}, [r0], r1
> + vst1.32 {d1[1]}, [r0], r1
> +
> + bx lr
> +endfunc
> +
> +function x264_deblock_h_chroma_422_intra_neon
> + push {lr}
> + bl X(x264_deblock_h_chroma_intra_neon)
> + add r0, r0, #2
> + bl X(x264_deblock_h_chroma_intra_neon)
> + pop {pc}
Restore lr before the second call (pop {lr}, then branch with b instead of bl) and you can return directly from the tail call.
> +endfunc
> +
> +function x264_deblock_h_chroma_intra_mbaff_neon
> + sub r0, r0, #4
> + vld1.8 {d18}, [r0], r1
> + vld1.8 {d16}, [r0], r1
> + vld1.8 {d0}, [r0], r1
> + vld1.8 {d2}, [r0], r1
> +
> + TRANSPOSE4x4_16 d18, d16, d0, d2
> +
> + h264_loop_filter_chroma_intra, width=8
> +
> + vtrn.16 d16, d0
> +
> + sub r0, r0, r1, lsl #2
> + add r0, r0, #2
> + vst1.32 {d16[0]}, [r0], r1
> + vst1.32 {d0[0]}, [r0], r1
> + vst1.32 {d16[1]}, [r0], r1
> + vst1.32 {d0[1]}, [r0]
> +
> + bx lr
> +endfunc
> +
> function x264_deblock_strength_neon
> ldr ip, [sp]
> vmov.i8 q8, #0
> diff --git a/common/deblock.c b/common/deblock.c
> index 1d398ad..46379ec 100644
> --- a/common/deblock.c
> +++ b/common/deblock.c
> @@ -741,11 +741,11 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
> int mvy_limit, int bframe );
> void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
> void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
> -#if ARCH_AARCH64
> void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> +#if ARCH_AARCH64
> void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
> #endif
> @@ -875,11 +875,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
> pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
> pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
> pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
> -#if ARCH_AARCH64
> pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
> pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
> pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
> pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
> +#if ARCH_AARCH64
> pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
> pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
> #endif
otherwise ok
Janne
More information about the x264-devel
mailing list