[x264-devel] [PATCH] Fix mc_chroma NEON code for NV12

Måns Rullgård mans at mansr.com
Tue Feb 8 14:59:18 CET 2011


Stefan Grönroos <stefan.gronroos at gmail.com> writes:

> Hello guys!
>
> The attached patch changes x264_mc_chroma_neon to work with the NV12
> format. With quick tests, the code seems to give about 15% higher fps on a
> Nokia N900 using a Foreman CIF test video on default settings.
>
> Still, the instruction ordering could probably be optimized further. I also have
> a feeling that I thought something could be done better when I wrote it, but
> I don't remember what it was. Maybe someone more into NEON could take a look
> at the code?
>
> Best regards,
> Stefan Grönroos
>
> From a14ef35d96f9eb5e452405f10fcd0d2f8c48cb55 Mon Sep 17 00:00:00 2001
> From: Stefan Groenroos <stefan.gronroos at gmail.com>
> Date: Sun, 6 Feb 2011 11:56:30 +0200
> Subject: [PATCH] Fix mc_chroma NEON code for NV12.
>
> ---
>  common/arm/mc-a.S |  355 +++++++++++++++++++++++++++++++---------------------
>  common/arm/mc-c.c |    2 +-
>  2 files changed, 213 insertions(+), 144 deletions(-)
>
> diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
> index 62e88cc..f06de85 100644
> --- a/common/arm/mc-a.S
> +++ b/common/arm/mc-a.S
> @@ -806,57 +806,56 @@ copy_w16_aligned_loop:
>  .endfunc
>  
>  
> -// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
> -//                           uint8_t *src, int i_src_stride,
> -//                           int dx, int dy, int i_width, int i_height );
> +//void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride,
> +//                       pixel *src, int i_src_stride,
> +//                       int mvx, int mvy,
> +//                       int i_width, int i_height )
>  function x264_mc_chroma_neon
> -    push            {r4-r6, lr}
> -    ldrd            r4,  [sp, #16]
> -    ldr             r6,  [sp, #24]
> +    push            {r4-r7, lr}
> +    vpush           {d8-d15}

Only push/pop the registers you actually use.  Keeping sp 8-byte
aligned before the vpush might also save a cycle.  Pushing an extra
core register to maintain alignment costs nothing.
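
Untested, and the ldrd offsets further down would of course have to be
adjusted to match, but roughly what I have in mind:

    push            {r4-r8, lr}     // six core regs keep sp 8-byte aligned;
                                    // r8 is only there as padding
    vpush           {d8-d11}        // assuming d8-d11 are the only callee-saved
                                    // d registers the function ends up touching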

> +    ldrd            r4,  [sp, #84]
> +    ldrd            r6,  [sp, #92]
>  
> -    asr             lr,  r5,  #3
> -    mul             lr,  r3,  lr
> -    add             r2,  r2,  r4,  asr #3
> -    cmp             r6, #4
> -    add             r2,  r2,  lr
> +    asr             lr,  r6,  #3
> +    mul             lr,  r4,  lr
> +    add             r3,  r3,  r5,  asr #2
> +    cmp             r7, #4
> +    add             r3,  r3,  lr
> +    bic             r3, r3, #0x1
>  
> -    and             r4, r4, #7
>      and             r5, r5, #7
> -    pld             [r2]
> -    pld             [r2, r3]
> +    and             r6, r6, #7
> +    pld             [r3]
> +    pld             [r3, r4]
>  
>      bgt             mc_chroma_w8
>      beq             mc_chroma_w4
>  
> -// calculate cA cB cC cD
>  .macro CHROMA_MC_START r0 r1
> -    muls            lr,  r4,  r5
> -    rsb             r6,  lr,  r5,  lsl #3
> -    rsb             ip,  lr,  r4,  lsl #3
> -    sub             r4,  lr,  r4,  lsl #3
> -    sub             r4,  r4,  r5,  lsl #3
> -    add             r4,  r4,  #64
> +    muls            lr,  r5,  r6
> +    rsb             r7,  lr,  r6,  lsl #3
> +    rsb             ip,  lr,  r5,  lsl #3
> +    sub             r5,  lr,  r5,  lsl #3
> +    sub             r5,  r5,  r6,  lsl #3
> +    add             r5,  r5,  #64
>  
>      beq             2f
> -
> -    add             r5,  r2,  r3
> -
> -    vdup.8          d0,  r4
> -    lsl             r3,  r3,  #1
> +    vdup.8          d0,  r5
>      vdup.8          d1,  ip
> -    vld1.64         {\r0}, [r2], r3
> -    vdup.8          d2,  r6
> -    vld1.64         {\r1}, [r5], r3
> +    vld2.8         {\r0}, [r3], r4
> +    vdup.8          d2,  r7
> +    vld2.8         {\r1}, [r3], r4
>      vdup.8          d3,  lr

It is sometimes faster to use VLD1 followed by VUZP.  The total
issue cycles are the same, but if interleaved with arithmetic, there
are more opportunities for dual-issue.  On Cortex-A5 this split is
almost always faster due to how VLDn for n>1 is implemented there.
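
Untested, but e.g. for the d4-d5 load above that would be something
like this, leaving the same values in the same registers:

    vld1.8          {d4-d5},  [r3], r4      // plain load, no deinterleave
    vuzp.8          d4,  d5                 // split U/V in a separate permute,
                                            // which can be scheduled against
                                            // nearby arithmetic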

It could also be an idea to defer the deinterleaving until later in
the calculations.

> -    ldr             r4,  [sp, #28]
> -
> -    vext.8          d5,  d4,  d5,  #1
> -    vext.8          d7,  d6,  d7,  #1
> +    ldr             r5,  [sp, #100]
>  .endm
>  
>  .macro CHROMA_MC width, align
>  mc_chroma_w\width:
> -    CHROMA_MC_START d4,  d6
> +    CHROMA_MC_START d4-d5,  d8-d9
> +    vext.8          d6,  d4,  d6,  #1
> +    vext.8          d7,  d5,  d7,  #1
> +    vext.8          d10,  d8,  d10,  #1
> +    vext.8          d11, d9, d11,  #1
>  // since the element size varies, there's a different index for the 2nd store
>  .if \width == 4
>      .set st2, 1
> @@ -864,187 +863,257 @@ mc_chroma_w\width:
>      .set st2, 2
>  .endif
>  
> -    vtrn.32         d4,  d5
> -    vtrn.32         d6,  d7
> +    vtrn.32         d4,  d6
> +    vtrn.32         d5,  d7
> +    vtrn.32         d8, d10
> +    vtrn.32         d9, d11
>  
>      vtrn.32         d0,  d1
>      vtrn.32         d2,  d3
>  
>  1:  // height loop, interpolate xy
> -    pld             [r5]
> +    pld             [r3]
>      vmull.u8        q8,  d4,  d0
> -    vmlal.u8        q8,  d6,  d2
> -    vld1.64         {d4},     [r2], r3
> -    vext.8          d5,  d4,  d5,  #1
> -    vtrn.32         d4,  d5
> -    vmull.u8        q9,  d6,  d0
> -    vmlal.u8        q9,  d4,  d2
> -    vld1.64         {d6},     [r5], r3
> +    vmlal.u8        q8,  d8,  d2
> +    vmull.u8        q9,  d5,  d0
> +    vmlal.u8        q9,  d9,  d2
> +    vld2.8          {d4-d5},  [r3], r4
> +    vext.8          d6,  d4,  d6,  #1
> +    vext.8          d7,  d5,  d7,  #1
> +    vtrn.32         d4,  d6
> +    vtrn.32         d5,  d7
> +
> +    vmull.u8        q10, d8,  d0
> +    vmlal.u8        q10, d4,  d2
> +    vmull.u8        q11, d9,  d0
> +    vmlal.u8        q11, d5,  d2
> +    vld2.8          {d8-d9},  [r3], r4

Here, some interleaving of arithmetic and permutation instructions
might help, since these can dual-issue (on A8).
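
For example (untested), pulling the multiplies that only need data
already in registers in between the permutes that prepare the row just
loaded; the result should be identical, only the ordering changes:

    vld2.8          {d4-d5},  [r3], r4
    vmull.u8        q10, d8,  d0            // MACs on already-loaded data...
    vext.8          d6,  d4,  d6,  #1       // ...paired with permutes on the
    vmull.u8        q11, d9,  d0            // newly loaded row
    vext.8          d7,  d5,  d7,  #1
    vtrn.32         d4,  d6
    vtrn.32         d5,  d7
    vmlal.u8        q10, d4,  d2
    vmlal.u8        q11, d5,  d2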

>      vadd.i16        d16, d16, d17
>      vadd.i16        d17, d18, d19
> +    vadd.i16        d18, d20, d21
> +    vadd.i16        d19, d22, d23
>      vrshrn.u16      d16, q8,  #6
> -    subs            r4,  r4,  #2
> -    pld             [r2]
> -    vext.8          d7,  d6,  d7,  #1
> -    vtrn.32         d6,  d7
> -    vst1.\align     {d16[0]},   [r0,:\align], r1
> -    vst1.\align     {d16[st2]}, [r0,:\align], r1
> +    vrshrn.u16      d18, q9,  #6
> +    subs            r5,  r5,  #2
> +    pld             [r3]
> +    vext.8          d10,  d8,  d10,  #1
> +    vext.8          d11, d9, d11,  #1
> +    vtrn.32         d8, d10
> +    vtrn.32         d9, d11

I like to keep the operands vertically aligned.
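
i.e. with the columns lined up it is easier to see which registers
belong together (same instructions as above, just reformatted):

    vext.8          d10, d8,  d10, #1
    vext.8          d11, d9,  d11, #1
    vtrn.32         d8,  d10
    vtrn.32         d9,  d11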

> +    vst1.\align     {d16[0]},   [r0,:\align], r2
> +    vst1.\align     {d16[st2]}, [r1,:\align], r2
> +    vst1.\align     {d18[0]},   [r0,:\align], r2
> +    vst1.\align     {d18[st2]}, [r1,:\align], r2
>      bgt             1b

The same comments apply to the rest of the cases.

I would suggest trying out a few of these improvements if time permits;
otherwise, go with this as is.  It's certainly faster than C...

-- 
Måns Rullgård
mans at mansr.com

