[x264-devel] [PATCH 1/1] checkasm: fix arm64 register clobber test

Mon Aug 17 18:43:02 CEST 2015

Hi Martin,

The code has a couple of problems; see the commit message below.
Feel free to squash this patch. I still need to test on iOS though.
IIRC it won't work there since the ABIs for stack arguments and variadic
arguments are incompatible. Stack arguments take only their size + padding
for natural alignment, but variadic arguments are allocated to slots of
8 bytes. There are IIRC one or two functions which require args on the
stack. If those arguments are smaller than 8 bytes this won't work.
I don't think this is fixable, i.e. arm64 x264_checkasm_call can't work
on iOS.

Janne

---8<---
The stack pointer SP must be 16-byte aligned if it's used for memory
access. Solve this by creating a proper frame record.

Use the frame pointer to access the original arguments on the stack.

Provide a macro to test neon args.

Do not write past 'int ok': clear it with a 32-bit store, not a 64-bit one.
---
 tools/checkasm-aarch64.S | 58 ++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/tools/checkasm-aarch64.S b/tools/checkasm-aarch64.S
index 2b86d62..ec36518 100644
--- a/tools/checkasm-aarch64.S
+++ b/tools/checkasm-aarch64.S
@@ -56,11 +56,11 @@ error_message:
 // max number of args used by any x264 asm function.
 #define MAX_ARGS 15
 
-#define ARG_STACK 8*(MAX_ARGS - 6)
-#define PUSHED 8*8 + 8*12
+#define ARG_STACK ((8*(MAX_ARGS - 6) + 15) & ~15)
 
 function x264_checkasm_call, export=1
-    str         x30, [sp, #-8]!
+    stp         x29, x30, [sp, #-16]!
+    mov         x29, sp
     stp         x19, x20, [sp, #-16]!
     stp         x21, x22, [sp, #-16]!
     stp         x23, x24, [sp, #-16]!
@@ -82,12 +82,13 @@ function x264_checkasm_call, export=1
     ldp         x25, x26, [x9], #16
     ldp         x27, x28, [x9], #16
 
-    str         x1,  [sp, #-8]!
+    str         x1,  [sp, #-16]!
 
     sub         sp,  sp,  #ARG_STACK
 .equ pos, 0
+// first two stacked args are copied to x6, x7
 .rept MAX_ARGS-6
-    ldr         x9, [sp, #ARG_STACK + PUSHED + 16 + pos]
+    ldr         x9, [x29, #16 + 16 + pos]
     str         x9, [sp, #pos]
 .equ pos, pos + 8
 .endr
@@ -99,33 +100,26 @@ function x264_checkasm_call, export=1
     mov         x3,  x5
     mov         x4,  x6
     mov         x5,  x7
-    ldp         x6,  x7,  [sp, #ARG_STACK + PUSHED]
+    ldp         x6,  x7,  [x29, #16]
     blr         x12
     add         sp,  sp,  #ARG_STACK
-    ldr         x2,  [sp], #8
-
-    stp         x0,  x1, [sp, #-16]!
+    ldr         x2,  [sp]
+    stp         x0,  x1, [sp]
     movrel      x9, register_init
-    ldp         d0,  d1,  [x9], #16
-    ldp         d2,  d3,  [x9], #16
-    ldp         d4,  d5,  [x9], #16
-    ldp         d6,  d7,  [x9], #16
-    eor         v0.8b,  v0.8b,  v8.8b
-    eor         v1.8b,  v1.8b,  v9.8b
-    eor         v2.8b,  v2.8b,  v10.8b
-    eor         v3.8b,  v3.8b,  v11.8b
-    eor         v4.8b,  v4.8b,  v12.8b
-    eor         v5.8b,  v5.8b,  v13.8b
-    eor         v6.8b,  v6.8b,  v14.8b
-    eor         v7.8b,  v7.8b,  v15.8b
-    orr         v0.8b,  v0.8b,  v1.8b
-    orr         v0.8b,  v0.8b,  v2.8b
-    orr         v0.8b,  v0.8b,  v3.8b
-    orr         v0.8b,  v0.8b,  v4.8b
-    orr         v0.8b,  v0.8b,  v5.8b
-    orr         v0.8b,  v0.8b,  v6.8b
-    orr         v0.8b,  v0.8b,  v7.8b
-    fmov        x3,  d0
+    movi        v3.8h,  #0
+
+.macro check_reg_neon reg1, reg2
+    ldr         q0,  [x9], #16
+    uzp1        v1.2d,  v\reg1\().2d, v\reg2\().2d
+    eor         v0.16b, v0.16b, v1.16b
+    orr         v3.16b, v3.16b, v0.16b
+.endm
+    check_reg_neon  8,  9
+    check_reg_neon  10, 11
+    check_reg_neon  12, 13
+    check_reg_neon  14, 15
+    xtn         v3.8b,  v3.8h
+    umov        x3,  v3.d[0]
 
 .macro check_reg reg1, reg2
     ldp         x0,  x1,  [x9], #16
@@ -142,8 +136,8 @@ function x264_checkasm_call, export=1
 
     cbz         x3,  0f
 
-    mov         x9, #0
-    str         x9, [x2]
+    mov         w9,  #0
+    str         w9,  [x2]
     movrel      x0, error_message
     bl          puts
 0:
@@ -157,6 +151,6 @@ function x264_checkasm_call, export=1
     ldp         x23, x24, [sp], #16
     ldp         x21, x22, [sp], #16
     ldp         x19, x20, [sp], #16
-    ldr         x30, [sp], #8
+    ldp         x29, x30, [sp], #16
     ret
 endfunc
-- 
2.5.0