[x264-devel] [PATCH 1/1] checkasm: fix arm64 register clobber test
Janne Grunau
janne-x264 at jannau.net
Mon Aug 17 18:43:02 CEST 2015
Hi Martin,
the code has a couple of problems, see below in the commit message.
feel free squash this patch. I still need to test on ios though.
IIRC it won't work there since the ABI for stack arguments and variadic
arguments is incompatible. Stack arguments take only there size + padding
for natural alignment but variadic arguments are allocated to slots of
8 bytes. There are iirc one or two functions which require args on the
stack. If those arguments are smaller than 8 bytes this won't work.
I don't think this is fixable, i.e. arm64 x264_checkasm_call can't work
on ios.
Janne
---8<---
The stackpointer SP must be 16-byte aligned if it's used for meory
access. Solve this by creating a proper frame record.
Use the frame pointer to access the original arguments on the stack.
Provide a macro to test neon args.
Do not write past int ok.
---
tools/checkasm-aarch64.S | 58 ++++++++++++++++++++++--------------------------
1 file changed, 26 insertions(+), 32 deletions(-)
diff --git a/tools/checkasm-aarch64.S b/tools/checkasm-aarch64.S
index 2b86d62..ec36518 100644
--- a/tools/checkasm-aarch64.S
+++ b/tools/checkasm-aarch64.S
@@ -56,11 +56,11 @@ error_message:
// max number of args used by any x264 asm function.
#define MAX_ARGS 15
-#define ARG_STACK 8*(MAX_ARGS - 6)
-#define PUSHED 8*8 + 8*12
+#define ARG_STACK ((8*(MAX_ARGS - 6) + 15) & ~15)
function x264_checkasm_call, export=1
- str x30, [sp, #-8]!
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x23, x24, [sp, #-16]!
@@ -82,12 +82,13 @@ function x264_checkasm_call, export=1
ldp x25, x26, [x9], #16
ldp x27, x28, [x9], #16
- str x1, [sp, #-8]!
+ str x1, [sp, #-16]!
sub sp, sp, #ARG_STACK
.equ pos, 0
+// first two stacked args are copied to x6, x7
.rept MAX_ARGS-6
- ldr x9, [sp, #ARG_STACK + PUSHED + 16 + pos]
+ ldr x9, [x29, #16 + 16 + pos]
str x9, [sp, #pos]
.equ pos, pos + 8
.endr
@@ -99,33 +100,26 @@ function x264_checkasm_call, export=1
mov x3, x5
mov x4, x6
mov x5, x7
- ldp x6, x7, [sp, #ARG_STACK + PUSHED]
+ ldp x6, x7, [x29, #16]
blr x12
add sp, sp, #ARG_STACK
- ldr x2, [sp], #8
-
- stp x0, x1, [sp, #-16]!
+ ldr x2, [sp]
+ stp x0, x1, [sp]
movrel x9, register_init
- ldp d0, d1, [x9], #16
- ldp d2, d3, [x9], #16
- ldp d4, d5, [x9], #16
- ldp d6, d7, [x9], #16
- eor v0.8b, v0.8b, v8.8b
- eor v1.8b, v1.8b, v9.8b
- eor v2.8b, v2.8b, v10.8b
- eor v3.8b, v3.8b, v11.8b
- eor v4.8b, v4.8b, v12.8b
- eor v5.8b, v5.8b, v13.8b
- eor v6.8b, v6.8b, v14.8b
- eor v7.8b, v7.8b, v15.8b
- orr v0.8b, v0.8b, v1.8b
- orr v0.8b, v0.8b, v2.8b
- orr v0.8b, v0.8b, v3.8b
- orr v0.8b, v0.8b, v4.8b
- orr v0.8b, v0.8b, v5.8b
- orr v0.8b, v0.8b, v6.8b
- orr v0.8b, v0.8b, v7.8b
- fmov x3, d0
+ movi v3.8h, #0
+
+.macro check_reg_neon reg1, reg2
+ ldr q0, [x9], #16
+ uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d
+ eor v0.16b, v0.16b, v1.16b
+ orr v3.16b, v3.16b, v0.16b
+.endm
+ check_reg_neon 8, 9
+ check_reg_neon 10, 11
+ check_reg_neon 12, 13
+ check_reg_neon 14, 15
+ xtn v3.8b, v3.8h
+ umov x3, v3.d[0]
.macro check_reg reg1, reg2
ldp x0, x1, [x9], #16
@@ -142,8 +136,8 @@ function x264_checkasm_call, export=1
cbz x3, 0f
- mov x9, #0
- str x9, [x2]
+ mov w9, #0
+ str w9, [x2]
movrel x0, error_message
bl puts
0:
@@ -157,6 +151,6 @@ function x264_checkasm_call, export=1
ldp x23, x24, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16
- ldr x30, [sp], #8
+ ldp x29, x30, [sp], #16
ret
endfunc
--
2.5.0
More information about the x264-devel
mailing list