[x264-devel] [PATCH 1/1] checkasm: fix arm64 register clobber test
Martin Storsjö
martin at martin.st
Mon Aug 24 11:42:47 CEST 2015
On Mon, 17 Aug 2015, Janne Grunau wrote:
> Hi Martin,
>
> the code has a couple of problems, see below in the commit message.
> feel free squash this patch. I still need to test on ios though.
> IIRC it won't work there since the ABI for stack arguments and variadic
> arguments is incompatible. Stack arguments take only there size + padding
> for natural alignment but variadic arguments are allocated to slots of
> 8 bytes. There are iirc one or two functions which require args on the
> stack. If those arguments are smaller than 8 bytes this won't work.
> I don't think this is fixable, i.e. arm64 x264_checkasm_call can't work
> on ios.
>
> Janne
>
> ---8<---
> The stackpointer SP must be 16-byte aligned if it's used for meory
> access. Solve this by creating a proper frame record.
>
> Use the frame pointer to access the original arguments on the stack.
>
> Provide a macro to test neon args.
>
> Do not write past int ok.
> ---
> tools/checkasm-aarch64.S | 58 ++++++++++++++++++++++--------------------------
> 1 file changed, 26 insertions(+), 32 deletions(-)
>
> diff --git a/tools/checkasm-aarch64.S b/tools/checkasm-aarch64.S
> index 2b86d62..ec36518 100644
> --- a/tools/checkasm-aarch64.S
> +++ b/tools/checkasm-aarch64.S
> @@ -56,11 +56,11 @@ error_message:
> // max number of args used by any x264 asm function.
> #define MAX_ARGS 15
>
> -#define ARG_STACK 8*(MAX_ARGS - 6)
> -#define PUSHED 8*8 + 8*12
> +#define ARG_STACK ((8*(MAX_ARGS - 6) + 15) & ~15)
>
> function x264_checkasm_call, export=1
> - str x30, [sp, #-8]!
> + stp x29, x30, [sp, #-16]!
> + mov x29, sp
> stp x19, x20, [sp, #-16]!
> stp x21, x22, [sp, #-16]!
> stp x23, x24, [sp, #-16]!
> @@ -82,12 +82,13 @@ function x264_checkasm_call, export=1
> ldp x25, x26, [x9], #16
> ldp x27, x28, [x9], #16
>
> - str x1, [sp, #-8]!
> + str x1, [sp, #-16]!
>
> sub sp, sp, #ARG_STACK
> .equ pos, 0
> +// first two stacked args are copied to x6, x7
> .rept MAX_ARGS-6
> - ldr x9, [sp, #ARG_STACK + PUSHED + 16 + pos]
> + ldr x9, [x29, #16 + 16 + pos]
> str x9, [sp, #pos]
> .equ pos, pos + 8
> .endr
> @@ -99,33 +100,26 @@ function x264_checkasm_call, export=1
> mov x3, x5
> mov x4, x6
> mov x5, x7
> - ldp x6, x7, [sp, #ARG_STACK + PUSHED]
> + ldp x6, x7, [x29, #16]
> blr x12
> add sp, sp, #ARG_STACK
> - ldr x2, [sp], #8
> -
> - stp x0, x1, [sp, #-16]!
> + ldr x2, [sp]
> + stp x0, x1, [sp]
> movrel x9, register_init
> - ldp d0, d1, [x9], #16
> - ldp d2, d3, [x9], #16
> - ldp d4, d5, [x9], #16
> - ldp d6, d7, [x9], #16
> - eor v0.8b, v0.8b, v8.8b
> - eor v1.8b, v1.8b, v9.8b
> - eor v2.8b, v2.8b, v10.8b
> - eor v3.8b, v3.8b, v11.8b
> - eor v4.8b, v4.8b, v12.8b
> - eor v5.8b, v5.8b, v13.8b
> - eor v6.8b, v6.8b, v14.8b
> - eor v7.8b, v7.8b, v15.8b
> - orr v0.8b, v0.8b, v1.8b
> - orr v0.8b, v0.8b, v2.8b
> - orr v0.8b, v0.8b, v3.8b
> - orr v0.8b, v0.8b, v4.8b
> - orr v0.8b, v0.8b, v5.8b
> - orr v0.8b, v0.8b, v6.8b
> - orr v0.8b, v0.8b, v7.8b
> - fmov x3, d0
> + movi v3.8h, #0
> +
> +.macro check_reg_neon reg1, reg2
> + ldr q0, [x9], #16
> + uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d
> + eor v0.16b, v0.16b, v1.16b
> + orr v3.16b, v3.16b, v0.16b
> +.endm
> + check_reg_neon 8, 9
> + check_reg_neon 10, 11
> + check_reg_neon 12, 13
> + check_reg_neon 14, 15
> + xtn v3.8b, v3.8h
Doesn't this drop half of the bits in the registers, i.e. a bit set in the
upper half of the halfwords would be discarded and missed? An uqxtn would
probably handle that, right?
The rest of this patch looks good though - thanks!
Did you ever get back to testing on iOS? I guess it'd require something
like (ARCH_AARCH64 && !defined(__APPLE__)) in checkasm.c, to skip it
there.
// Martin
More information about the x264-devel
mailing list