[x264-devel] [PATCH 1/1] checkasm: fix arm64 register clobber test

Mon Aug 24 11:42:47 CEST 2015

On Mon, 17 Aug 2015, Janne Grunau wrote:

> Hi Martin,
>
> the code has a couple of problems, see below in the commit message.
> feel free squash this patch. I still need to test on ios though.
> IIRC it won't work there since the ABI for stack arguments and variadic
> arguments is incompatible. Stack arguments take only there size + padding
> for natural alignment but variadic arguments are allocated to slots of
> 8 bytes. There are iirc one or two functions which require args on the
> stack. If those arguments are smaller than 8 bytes this won't work.
> I don't think this is fixable, i.e. arm64 x264_checkasm_call can't work
> on ios.
>
> Janne
>
> ---8<---
> The stackpointer SP must be 16-byte aligned if it's used for meory
> access. Solve this by creating a proper frame record.
>
> Use the frame pointer to access the original arguments on the stack.
>
> Provide a macro to test neon args.
>
> Do not write past int ok.
> ---
> tools/checkasm-aarch64.S | 58 ++++++++++++++++++++++--------------------------
> 1 file changed, 26 insertions(+), 32 deletions(-)
>
> diff --git a/tools/checkasm-aarch64.S b/tools/checkasm-aarch64.S
> index 2b86d62..ec36518 100644
> --- a/tools/checkasm-aarch64.S
> +++ b/tools/checkasm-aarch64.S
> @@ -56,11 +56,11 @@ error_message:
> // max number of args used by any x264 asm function.
> #define MAX_ARGS 15
>
> -#define ARG_STACK 8*(MAX_ARGS - 6)
> -#define PUSHED 8*8 + 8*12
> +#define ARG_STACK ((8*(MAX_ARGS - 6) + 15) & ~15)
>
> function x264_checkasm_call, export=1
> -    str         x30, [sp, #-8]!
> +    stp         x29, x30, [sp, #-16]!
> +    mov         x29, sp
>     stp         x19, x20, [sp, #-16]!
>     stp         x21, x22, [sp, #-16]!
>     stp         x23, x24, [sp, #-16]!
> @@ -82,12 +82,13 @@ function x264_checkasm_call, export=1
>     ldp         x25, x26, [x9], #16
>     ldp         x27, x28, [x9], #16
>
> -    str         x1,  [sp, #-8]!
> +    str         x1,  [sp, #-16]!
>
>     sub         sp,  sp,  #ARG_STACK
> .equ pos, 0
> +// first two stacked args are copied to x6, x7
> .rept MAX_ARGS-6
> -    ldr         x9, [sp, #ARG_STACK + PUSHED + 16 + pos]
> +    ldr         x9, [x29, #16 + 16 + pos]
>     str         x9, [sp, #pos]
> .equ pos, pos + 8
> .endr
> @@ -99,33 +100,26 @@ function x264_checkasm_call, export=1
>     mov         x3,  x5
>     mov         x4,  x6
>     mov         x5,  x7
> -    ldp         x6,  x7,  [sp, #ARG_STACK + PUSHED]
> +    ldp         x6,  x7,  [x29, #16]
>     blr         x12
>     add         sp,  sp,  #ARG_STACK
> -    ldr         x2,  [sp], #8
> -
> -    stp         x0,  x1, [sp, #-16]!
> +    ldr         x2,  [sp]
> +    stp         x0,  x1, [sp]
>     movrel      x9, register_init
> -    ldp         d0,  d1,  [x9], #16
> -    ldp         d2,  d3,  [x9], #16
> -    ldp         d4,  d5,  [x9], #16
> -    ldp         d6,  d7,  [x9], #16
> -    eor         v0.8b,  v0.8b,  v8.8b
> -    eor         v1.8b,  v1.8b,  v9.8b
> -    eor         v2.8b,  v2.8b,  v10.8b
> -    eor         v3.8b,  v3.8b,  v11.8b
> -    eor         v4.8b,  v4.8b,  v12.8b
> -    eor         v5.8b,  v5.8b,  v13.8b
> -    eor         v6.8b,  v6.8b,  v14.8b
> -    eor         v7.8b,  v7.8b,  v15.8b
> -    orr         v0.8b,  v0.8b,  v1.8b
> -    orr         v0.8b,  v0.8b,  v2.8b
> -    orr         v0.8b,  v0.8b,  v3.8b
> -    orr         v0.8b,  v0.8b,  v4.8b
> -    orr         v0.8b,  v0.8b,  v5.8b
> -    orr         v0.8b,  v0.8b,  v6.8b
> -    orr         v0.8b,  v0.8b,  v7.8b
> -    fmov        x3,  d0
> +    movi        v3.8h,  #0
> +
> +.macro check_reg_neon reg1, reg2
> +    ldr         q0,  [x9], #16
> +    uzp1        v1.2d,  v\reg1\().2d, v\reg2\().2d
> +    eor         v0.16b, v0.16b, v1.16b
> +    orr         v3.16b, v3.16b, v0.16b
> +.endm
> +    check_reg_neon  8,  9
> +    check_reg_neon  10, 11
> +    check_reg_neon  12, 13
> +    check_reg_neon  14, 15
> +    xtn         v3.8b,  v3.8h

Doesn't this drop half of the bits in the registers, i.e. a bit set in the 
upper half of the halfwords would be discarded and missed? An uqxtn would 
probably handle that, right?

The rest of this patch looks good though - thanks!

Did you ever get back to testing on iOS? I guess it'd require something 
like (ARCH_AARCH64 && !defined(__APPLE__)) in checkasm.c, to skip it 
there.

// Martin