[x264-devel] [PATCH 15/24] arm: Implement integral_init4/8h/v_neon
Janne Grunau
janne-x264 at jannau.net
Sat Aug 22 17:52:57 CEST 2015
On 2015-08-13 23:59:36 +0300, Martin Storsjö wrote:
> checkasm timing       Cortex-A7      A8      A9
> integral_init4h_c         10466    8590    6161
> integral_init4h_neon       3021    1494    1800
> integral_init4v_c         16250   13590   13628
> integral_init4v_neon       3473    2073    3291
> integral_init8h_c         10100    8275    5705
> integral_init8h_neon       4403    2344    2751
> integral_init8v_c          6403    4632    4999
> integral_init8v_neon       1184     783    1306
> ---
> common/arm/mc-a.S | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/mc-c.c | 10 +++++
> 2 files changed, 135 insertions(+)
>
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 4225c71..6538dec 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -1635,3 +1635,128 @@ function x264_store_interleave_chroma_neon
>
> pop {pc}
> endfunc
> +
> +.macro integral4h p1, p2
> + vext.8 d1, \p1, \p2, #1
> + vext.8 d2, \p1, \p2, #2
> + vext.8 d3, \p1, \p2, #3
> + vaddl.u8 q0, \p1, d1
> + vaddl.u8 q1, d2, d3
> + vadd.u16 q0, q0, q1
> + vadd.u16 q0, q0, q2
> +.endm
> +
> +function integral_init4h_neon
> + sub r3, r0, r2, lsl #1
> + vld1.8 {d6, d7}, [r1, :128]!
> +1:
> + subs r2, r2, #16
> + vld1.16 {q2}, [r3, :128]!
> + integral4h d6, d7
> + vld1.8 {d6}, [r1, :64]!
> + vld1.16 {q2}, [r3, :128]!
> + vst1.16 {q0}, [r0, :128]!
> + integral4h d7, d6
> + vld1.8 {d7}, [r1, :64]!
> + vst1.16 {q0}, [r0, :128]!
> + bgt 1b
> + bx lr
> +endfunc
> +
> +.macro integral8h p1, p2, s
> + vext.8 d1, \p1, \p2, #1
> + vext.8 d2, \p1, \p2, #2
> + vext.8 d3, \p1, \p2, #3
> + vext.8 d4, \p1, \p2, #4
> + vext.8 d5, \p1, \p2, #5
> + vext.8 d6, \p1, \p2, #6
> + vext.8 d7, \p1, \p2, #7
> + vaddl.u8 q0, \p1, d1
> + vaddl.u8 q1, d2, d3
> + vaddl.u8 q2, d4, d5
> + vaddl.u8 q3, d6, d7
> + vadd.u16 q0, q0, q1
> + vadd.u16 q2, q2, q3
> + vadd.u16 q0, q0, q2
> + vadd.u16 q0, q0, \s
> +.endm
> +
> +function integral_init8h_neon
> + sub r3, r0, r2, lsl #1
> + vld1.8 {d16, d17}, [r1, :128]!
> +1:
> + subs r2, r2, #16
> + vld1.16 {q9}, [r3, :128]!
> + integral8h d16, d17, q9
> + vld1.8 {d16}, [r1, :64]!
> + vld1.16 {q9}, [r3, :128]!
> + vst1.16 {q0}, [r0, :128]!
> + integral8h d17, d16, q9
> + vld1.8 {d17}, [r1, :64]!
> + vst1.16 {q0}, [r0, :128]!
> + bgt 1b
> + bx lr
> +endfunc
> +
> +function integral_init4v_neon
> + push {r4-r5}
> + mov r3, r0
> + add r4, r0, r2, lsl #3
> + add r5, r0, r2, lsl #4
> + sub r2, r2, #8
> + vld1.16 {q11, q12}, [r3]!
> + vld1.16 {q8, q9}, [r5]!
> + vld1.16 {q13}, [r3]!
> + vld1.16 {q10}, [r5]!
> +1:
> + subs r2, r2, #16
> + vld1.16 {q14, q15}, [r4]!
> + vext.8 q0, q11, q12, #8
> + vext.8 q1, q12, q13, #8
> + vext.8 q2, q8, q9, #8
> + vext.8 q3, q9, q10, #8
> + vsub.u16 q14, q14, q11
> + vsub.u16 q15, q15, q12
> + vadd.u16 q0, q0, q11
> + vadd.u16 q1, q1, q12
> + vadd.u16 q2, q2, q8
> + vadd.u16 q3, q3, q9
> + vst1.16 {q14}, [r1]!
> + vst1.16 {q15}, [r1]!
> + vmov q11, q13
> + vmov q8, q10
> + vsub.u16 q0, q2, q0
> + vsub.u16 q1, q3, q1
> + vld1.16 {q12, q13}, [r3]!
> + vld1.16 {q9, q10}, [r5]!
> + vst1.16 {q0}, [r0]!
> + vst1.16 {q1}, [r0]!
> + bgt 1b
> +2:
> + pop {r4-r5}
> + bx lr
> +endfunc
> +
> +function integral_init8v_neon
> + add r2, r0, r1, lsl #4
> + sub r1, r1, #8
> + ands r3, r1, #16 - 1
> + beq 1f
> + subs r1, r1, #8
> + vld1.16 {q0}, [r0]
> + vld1.16 {q2}, [r2]!
> + vsub.u16 q8, q2, q0
> + vst1.16 {q8}, [r0]!
> + ble 2f
> +1:
> + subs r1, r1, #16
> + vld1.16 {q0, q1}, [r0]
> + vld1.16 {q2, q3}, [r2]!
> + vsub.u16 q8, q2, q0
> + vsub.u16 q9, q3, q1
> + vst1.16 {q8}, [r0]!
> + vst1.16 {q9}, [r0]!
> + bgt 1b
> +2:
> + bx lr
> +endfunc
> diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
> index 2ebf1ba..dd86fb2 100644
> --- a/common/arm/mc-c.c
> +++ b/common/arm/mc-c.c
> @@ -99,6 +99,11 @@ void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
> void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
> void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
>
> +void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
> +void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
> +void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
> +void integral_init8v_neon( uint16_t *, intptr_t );
> +
> #if !HIGH_BIT_DEPTH
> static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
> {
> @@ -271,6 +276,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
> pf->get_ref = get_ref_neon;
> pf->hpel_filter = hpel_filter_neon;
> pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
> +
> + pf->integral_init4h = integral_init4h_neon;
> + pf->integral_init8h = integral_init8h_neon;
> + pf->integral_init4v = integral_init4v_neon;
> + pf->integral_init8v = integral_init8v_neon;
> #endif // !HIGH_BIT_DEPTH
>
> // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
ok
Janne
More information about the x264-devel mailing list