[x264-devel] [PATCH] Fix mc_chroma NEON code for NV12
Måns Rullgård
mans at mansr.com
Tue Feb 8 14:59:18 CET 2011
Stefan Grönroos <stefan.gronroos at gmail.com> writes:
> Hello guys!
>
> The attached patch changes the x264_mc_chroma_neon function to work with
> the NV12 format. In quick tests, the code seems to give about 15% higher
> fps on a Nokia N900 with a Foreman CIF test video on default settings.
>
> Still, the instruction ordering could probably be optimized further. I
> also have a feeling that something more could be improved, something I
> noticed while writing the code but can no longer remember. Maybe someone
> more familiar with NEON could take a look at the code?
>
> Best regards,
> Stefan Grönroos
>
> From a14ef35d96f9eb5e452405f10fcd0d2f8c48cb55 Mon Sep 17 00:00:00 2001
> From: Stefan Groenroos <stefan.gronroos at gmail.com>
> Date: Sun, 6 Feb 2011 11:56:30 +0200
> Subject: [PATCH] Fix mc_chroma NEON code for NV12.
>
> ---
> common/arm/mc-a.S | 355 +++++++++++++++++++++++++++++++---------------------
> common/arm/mc-c.c | 2 +-
> 2 files changed, 213 insertions(+), 144 deletions(-)
>
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 62e88cc..f06de85 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -806,57 +806,56 @@ copy_w16_aligned_loop:
> .endfunc
>
>
> -// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
> -// uint8_t *src, int i_src_stride,
> -// int dx, int dy, int i_width, int i_height );
> +//void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride,
> +// pixel *src, int i_src_stride,
> +// int mvx, int mvy,
> +// int i_width, int i_height )
> function x264_mc_chroma_neon
> - push {r4-r6, lr}
> - ldrd r4, [sp, #16]
> - ldr r6, [sp, #24]
> + push {r4-r7, lr}
> + vpush {d8-d15}
Only push/pop the registers you actually use. Keeping sp 8-byte
aligned before the vpush might also save a cycle. Pushing an extra
core register to maintain alignment costs nothing.
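Something along these lines (untested; exactly which d registers need
saving depends on the final register allocation, d8-d11 here is just a
guess, and the [sp, #...] offsets for the stack arguments would have to
be adjusted to match):

    push  {r4-r8, lr}   // 6 core registers = 24 bytes, sp stays 8-byte aligned
    vpush {d8-d11}      // save only the callee-saved d registers actually used
    ...
    vpop  {d8-d11}
    pop   {r4-r8, pc}

AAPCS guarantees sp is 8-byte aligned on entry, so pushing an even
number of core registers keeps it aligned for the vpush; r8 is pushed
purely as padding.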
> + ldrd r4, [sp, #84]
> + ldrd r6, [sp, #92]
>
> - asr lr, r5, #3
> - mul lr, r3, lr
> - add r2, r2, r4, asr #3
> - cmp r6, #4
> - add r2, r2, lr
> + asr lr, r6, #3
> + mul lr, r4, lr
> + add r3, r3, r5, asr #2
> + cmp r7, #4
> + add r3, r3, lr
> + bic r3, r3, #0x1
>
> - and r4, r4, #7
> and r5, r5, #7
> - pld [r2]
> - pld [r2, r3]
> + and r6, r6, #7
> + pld [r3]
> + pld [r3, r4]
>
> bgt mc_chroma_w8
> beq mc_chroma_w4
>
> -// calculate cA cB cC cD
> .macro CHROMA_MC_START r0 r1
> - muls lr, r4, r5
> - rsb r6, lr, r5, lsl #3
> - rsb ip, lr, r4, lsl #3
> - sub r4, lr, r4, lsl #3
> - sub r4, r4, r5, lsl #3
> - add r4, r4, #64
> + muls lr, r5, r6
> + rsb r7, lr, r6, lsl #3
> + rsb ip, lr, r5, lsl #3
> + sub r5, lr, r5, lsl #3
> + sub r5, r5, r6, lsl #3
> + add r5, r5, #64
>
> beq 2f
> -
> - add r5, r2, r3
> -
> - vdup.8 d0, r4
> - lsl r3, r3, #1
> + vdup.8 d0, r5
> vdup.8 d1, ip
> - vld1.64 {\r0}, [r2], r3
> - vdup.8 d2, r6
> - vld1.64 {\r1}, [r5], r3
> + vld2.8 {\r0}, [r3], r4
> + vdup.8 d2, r7
> + vld2.8 {\r1}, [r3], r4
> vdup.8 d3, lr
It is sometimes faster to use VLD1 followed by VUZP. The total
issue cycles are the same, but if interleaved with arithmetic, there
are more opportunities for dual-issue. On Cortex-A5 this split is
almost always faster due to how VLDn for n>1 is implemented there.
It could also be an idea to defer the deinterleaving until later in
the calculations.
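The idea, roughly (untested, with the registers from this loop):

    vld1.8 {d4-d5}, [r3], r4  // one linear 16-byte load
    vuzp.8 d4, d5             // even bytes (U) to d4, odd bytes (V) to d5

gives the same result as the single structured load

    vld2.8 {d4-d5}, [r3], r4

but the VUZP is then a separate permute instruction that can be moved
away from the load and scheduled against arithmetic.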
> - ldr r4, [sp, #28]
> -
> - vext.8 d5, d4, d5, #1
> - vext.8 d7, d6, d7, #1
> + ldr r5, [sp, #100]
> .endm
>
> .macro CHROMA_MC width, align
> mc_chroma_w\width:
> - CHROMA_MC_START d4, d6
> + CHROMA_MC_START d4-d5, d8-d9
> + vext.8 d6, d4, d6, #1
> + vext.8 d7, d5, d7, #1
> + vext.8 d10, d8, d10, #1
> + vext.8 d11, d9, d11, #1
> // since the element size varies, there's a different index for the 2nd store
> .if \width == 4
> .set st2, 1
> @@ -864,187 +863,257 @@ mc_chroma_w\width:
> .set st2, 2
> .endif
>
> - vtrn.32 d4, d5
> - vtrn.32 d6, d7
> + vtrn.32 d4, d6
> + vtrn.32 d5, d7
> + vtrn.32 d8, d10
> + vtrn.32 d9, d11
>
> vtrn.32 d0, d1
> vtrn.32 d2, d3
>
> 1: // height loop, interpolate xy
> - pld [r5]
> + pld [r3]
> vmull.u8 q8, d4, d0
> - vmlal.u8 q8, d6, d2
> - vld1.64 {d4}, [r2], r3
> - vext.8 d5, d4, d5, #1
> - vtrn.32 d4, d5
> - vmull.u8 q9, d6, d0
> - vmlal.u8 q9, d4, d2
> - vld1.64 {d6}, [r5], r3
> + vmlal.u8 q8, d8, d2
> + vmull.u8 q9, d5, d0
> + vmlal.u8 q9, d9, d2
> + vld2.8 {d4-d5}, [r3], r4
> + vext.8 d6, d4, d6, #1
> + vext.8 d7, d5, d7, #1
> + vtrn.32 d4, d6
> + vtrn.32 d5, d7
> +
> + vmull.u8 q10, d8, d0
> + vmlal.u8 q10, d4, d2
> + vmull.u8 q11, d9, d0
> + vmlal.u8 q11, d5, d2
> + vld2.8 {d8-d9}, [r3], r4
Here, some interleaving of arithmetic and permutation instructions
might help, since these can dual-issue (on the A8).
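For instance (untested; the point is just to pair each multiply with a
permute that touches independent registers):

    vld2.8   {d4-d5}, [r3], r4
    vmull.u8 q10, d8, d0        // arithmetic pipe, still uses the old d8/d9
    vext.8   d6,  d4, d6, #1    // permute pipe, works on the new load
    vmull.u8 q11, d9, d0
    vext.8   d7,  d5, d7, #1
    vtrn.32  d4,  d6
    vtrn.32  d5,  d7
    vmlal.u8 q10, d4, d2        // d4/d5 are permuted by now
    vmlal.u8 q11, d5, d2
    vld2.8   {d8-d9}, [r3], r4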
> vadd.i16 d16, d16, d17
> vadd.i16 d17, d18, d19
> + vadd.i16 d18, d20, d21
> + vadd.i16 d19, d22, d23
> vrshrn.u16 d16, q8, #6
> - subs r4, r4, #2
> - pld [r2]
> - vext.8 d7, d6, d7, #1
> - vtrn.32 d6, d7
> - vst1.\align {d16[0]}, [r0,:\align], r1
> - vst1.\align {d16[st2]}, [r0,:\align], r1
> + vrshrn.u16 d18, q9, #6
> + subs r5, r5, #2
> + pld [r3]
> + vext.8 d10, d8, d10, #1
> + vext.8 d11, d9, d11, #1
> + vtrn.32 d8, d10
> + vtrn.32 d9, d11
I like to keep the operands vertically aligned.
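That is, the block above re-spaced:

    vext.8  d10, d8, d10, #1
    vext.8  d11, d9, d11, #1
    vtrn.32 d8,  d10
    vtrn.32 d9,  d11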
> + vst1.\align {d16[0]}, [r0,:\align], r2
> + vst1.\align {d16[st2]}, [r1,:\align], r2
> + vst1.\align {d18[0]}, [r0,:\align], r2
> + vst1.\align {d18[st2]}, [r1,:\align], r2
> bgt 1b
The same comments apply to the remaining cases.
I would suggest trying out a few improvements if time permits,
otherwise go with this as is. It's certainly faster than C...
--
Måns Rullgård
mans at mansr.com