[x264-devel] [PATCH 3/3] checkasm: arm: Check register clobbering

Thu Aug 27 23:15:03 CEST 2015

For functions that return a 64 bit value (sa8d_satd), use a separate
function prototype which returns uint64_t instead of the normal one
that returns intptr_t, since intptr_t is only 32 bits wide on ARM.

Use two separate sets of functions, depending on whether neon
is available.

---
Instead of using a separate checkasm_call function for a 64 bit
return value, one could also just declare the second one with
asm("x264_checkasm_call_neon") to have it map to the same symbol,
but I guess it's easier to duplicate than to deal with the
potential portability issues.

The same would have to be done for 32 bit x86 as well, if the
sa8d_satd function were implemented there.
---
 Makefile             |    1 +
 tools/checkasm-arm.S |  180 ++++++++++++++++++++++++++++++++++++++++++++++++++
 tools/checkasm.c     |   25 ++++++-
 3 files changed, 204 insertions(+), 2 deletions(-)
 create mode 100644 tools/checkasm-arm.S

diff --git a/Makefile b/Makefile
index 4feef33..d0b1633 100644
--- a/Makefile
+++ b/Makefile
@@ -122,6 +122,7 @@ ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
           common/arm/predict-a.S common/arm/bitstream-a.S
 SRCS   += common/arm/mc-c.c common/arm/predict-c.c
 OBJASM  = $(ASMSRC:%.S=%.o)
+OBJCHK += tools/checkasm-arm.o
 endif
 endif
 
diff --git a/tools/checkasm-arm.S b/tools/checkasm-arm.S
new file mode 100644
index 0000000..8b3baac
--- /dev/null
+++ b/tools/checkasm-arm.S
@@ -0,0 +1,180 @@
+/****************************************************************************
+ * checkasm-arm.S: assembly check tool
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Martin Storsjo <martin at martin.st>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "../common/arm/asm.S"
+
+.section .rodata
+.align 4
+register_init:                              @ 8 quads = 64 bytes of test patterns: q4-q7 take all 64, r4-r11 the first 32
+.quad 0x21f86d66c8ca00ce
+.quad 0x75b6ba21077c48ad
+.quad 0xed56bb2dcb3c7736
+.quad 0x8bda43d3fd1a7e06
+.quad 0xb64a9c9e5d318408
+.quad 0xdf9a54b303f1d3a3
+.quad 0x4a75479abd64e097
+.quad 0x249214109d5d1c88
+
+error_message:                              @ handed to puts when a checked register no longer holds its pattern
+.asciz "failed to preserve register"
+.text
+
+@ max number of args used by any x264 asm function.
+#define MAX_ARGS 15
+@ bytes reserved for the copied stack args, rounded up to a multiple of 8 so that sp stays 8 byte aligned at the call
+#define ARG_STACK ((4*(MAX_ARGS - 2) + 7) & ~7)
+#define PUSHED 16*4 + 4*10
+
+function x264_checkasm_call_neon
+    push        {r4-r11, lr}                @ save r4-r11 and lr (9 words)
+    vpush       {q4-q7}                     @ save q4-q7 (64 bytes)
+
+    movrel      r12, register_init
+    vldm        r12, {q4-q7}                @ fill q4-q7 with the known patterns
+    ldm         r12, {r4-r11}               @ fill r4-r11 with the first 32 bytes of the same table
+
+    push        {r1}                        @ keep the ok pointer on the stack across the call
+
+    sub         sp,  sp,  #ARG_STACK        @ reserve room for the stack args handed to func
+.equ pos, 0
+.rept MAX_ARGS-2
+    ldr         r12, [sp, #ARG_STACK + PUSHED + 8 + pos]    @ copy one incoming stack word, starting 8 bytes above the entry sp
+    str         r12, [sp, #pos]
+.equ pos, pos + 4
+.endr
+
+    mov         r12, r0                     @ r12 = func, the call target
+    mov         r0,  r2                     @ the two register varargs become args 1 and 2 of func
+    mov         r1,  r3
+    ldrd        r2,  r3,  [sp, #ARG_STACK + PUSHED]         @ the first two incoming stack words become args 3 and 4
+    blx         r12
+    add         sp,  sp,  #ARG_STACK
+    pop         {r2}                        @ r2 = ok pointer
+
+    push        {r0, r1}                    @ stash r0 and r1 (return value of func) while they are used as scratch
+    movrel      r12, register_init
+    vldm        r12, {q0-q3}                @ reload the expected patterns
+    veor        q0,  q0,  q4                @ xor is nonzero wherever a register differs from its pattern
+    veor        q1,  q1,  q5
+    veor        q2,  q2,  q6
+    veor        q3,  q3,  q7
+    vorr        q0,  q0,  q1                @ merge all differences into q0
+    vorr        q0,  q0,  q2
+    vorr        q0,  q0,  q3
+    vorr        d0,  d0,  d1                @ fold 128 bits down to 64
+    vrev64.32   d1,  d0                     @ swap the two 32 bit halves so the next orr folds down to 32 bits
+    vorr        d0,  d0,  d1
+    vmov.32     r3,  d0[0]                  @ r3 is nonzero if any of q4-q7 changed
+
+.macro check_reg reg1, reg2
+    ldrd        r0,  r1,  [r12], #8         @ next two expected words, then advance r12 by 8
+    eor         r0,  r0, \reg1
+    eor         r1,  r1, \reg2
+    orr         r3,  r3, r0                 @ accumulate any difference into r3
+    orr         r3,  r3, r1
+.endm
+    check_reg   r4,  r5
+    check_reg   r6,  r7
+    check_reg   r8,  r9
+    check_reg   r10, r11
+.purgem check_reg
+
+    cmp         r3,  #0
+    beq         0f                          @ every checked register intact: skip the failure path
+
+    mov         r12, #0
+    str         r12, [r2]                   @ store 0 through the ok pointer
+    movrel      r0, error_message
+    bl          puts
+0:
+    pop         {r0, r1}                    @ bring back the return value of func
+    vpop        {q4-q7}
+    pop         {r4-r11, pc}                @ restore r4-r11 and return
+endfunc
+
+function x264_checkasm_call_neon64
+    b           X(x264_checkasm_call_neon)  @ plain branch to the same code; the C side declares this symbol as returning uint64_t
+endfunc
+
+#undef PUSHED
+#define PUSHED 4*10 /* r4-r11 and lr (9 words) plus the saved ok pointer (1 word); no NEON registers are pushed here */
+
+function x264_checkasm_call_noneon
+    push        {r4-r11, lr}                @ save r4-r11 and lr (9 words)
+
+    movrel      r12, register_init
+    ldm         r12, {r4-r11}               @ fill r4-r11 with the first 32 bytes of the pattern table
+
+    push        {r1}                        @ keep the ok pointer on the stack across the call
+
+    sub         sp,  sp,  #ARG_STACK        @ reserve room for the stack args handed to func
+.equ pos, 0
+.rept MAX_ARGS-2
+    ldr         r12, [sp, #ARG_STACK + PUSHED + 8 + pos]    @ copy one incoming stack word, starting 8 bytes above the entry sp
+    str         r12, [sp, #pos]
+.equ pos, pos + 4
+.endr
+
+    mov         r12, r0                     @ r12 = func, the call target
+    mov         r0,  r2                     @ the two register varargs become args 1 and 2 of func
+    mov         r1,  r3
+    ldrd        r2,  r3,  [sp, #ARG_STACK + PUSHED]         @ the first two incoming stack words become args 3 and 4
+    blx         r12
+    add         sp,  sp,  #ARG_STACK
+    pop         {r2}                        @ r2 = ok pointer
+
+    push        {r0, r1}                    @ stash r0 and r1 (return value of func) while they are used as scratch
+    movrel      r12, register_init
+    mov         r3,  #0                     @ r3 accumulates differences; there is no NEON part to seed it
+
+.macro check_reg reg1, reg2
+    ldrd        r0,  r1,  [r12], #8         @ next two expected words, then advance r12 by 8
+    eor         r0,  r0, \reg1
+    eor         r1,  r1, \reg2
+    orr         r3,  r3, r0                 @ accumulate any difference into r3
+    orr         r3,  r3, r1
+.endm
+    check_reg   r4,  r5
+    check_reg   r6,  r7
+    check_reg   r8,  r9
+    check_reg   r10, r11
+.purgem check_reg
+
+    cmp         r3,  #0
+    beq         0f                          @ every checked register intact: skip the failure path
+
+    mov         r12, #0
+    str         r12, [r2]                   @ store 0 through the ok pointer
+    movrel      r0, error_message
+    bl          puts
+0:
+    pop         {r0, r1}                    @ bring back the return value of func
+    pop         {r4-r11, pc}                @ restore r4-r11 and return
+endfunc
+
+function x264_checkasm_call_noneon64
+    b           X(x264_checkasm_call_noneon)    @ plain branch to the same code; the C side declares this symbol as returning uint64_t
+endfunc
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 01a97c9..ca345f1 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -231,6 +231,15 @@ intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
 intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
 #endif
 
+#if ARCH_ARM
+intptr_t x264_checkasm_call_neon( intptr_t (*func)(), int *ok, ... ); /* checks r4-r11 and q4-q7 after calling func */
+uint64_t x264_checkasm_call_neon64( intptr_t (*func)(), int *ok, ... ); /* same code, declared with a 64-bit return */
+intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... ); /* checks r4-r11 only; uses no NEON instructions */
+uint64_t x264_checkasm_call_noneon64( intptr_t (*func)(), int *ok, ... ); /* same code, declared with a 64-bit return */
+intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon; /* default; check_all_flags switches to the neon variant when NEON is detected */
+uint64_t (*x264_checkasm_call64)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon64; /* 64-bit-return counterpart, switched together with the pointer above */
+#endif /* ARCH_ARM */
+
 #define call_c1(func,...) func(__VA_ARGS__)
 
 #if ARCH_X86_64
@@ -248,12 +257,18 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
     uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
     x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
     x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
-#elif ARCH_X86 || (ARCH_AARCH64 && !defined(__APPLE__))
+#elif ARCH_X86 || (ARCH_AARCH64 && !defined(__APPLE__)) || ARCH_ARM
 #define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
 #else
 #define call_a1 call_c1
 #endif
 
+#if ARCH_ARM
+#define call_a1_64(func,...) x264_checkasm_call64( (intptr_t(*)())func, &ok, __VA_ARGS__ ) /* checked call through the uint64_t-returning pointer */
+#else
+#define call_a1_64 call_a1 /* other architectures reuse the regular checked call */
+#endif /* ARCH_ARM */
+
 #define call_bench(func,cpu,...)\
     if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\
     {\
@@ -286,6 +301,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
 #define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
 #define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
 #define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })
+#define call_a64(func,...) ({ call_a2(func,__VA_ARGS__); call_a1_64(func,__VA_ARGS__); }) /* benchmark via call_a2, then the checked call via call_a1_64; evaluates to the latter's result */
 
 
 static int check_pixel( int cpu_ref, int cpu_new )
@@ -372,7 +388,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
         {
             uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
             uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
-            uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
+            uint64_t res_a = call_a64( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
             uint32_t cost8_a = res_a;
             uint32_t cost4_a = res_a >> 32;
             if( cost8_a != cost8_c || cost4_a != cost4_c )
@@ -2786,6 +2802,11 @@ static int check_all_flags( void )
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
 #elif ARCH_ARM
+    if( cpu_detect & X264_CPU_NEON )
+    {
+        x264_checkasm_call   = x264_checkasm_call_neon;
+        x264_checkasm_call64 = x264_checkasm_call_neon64;
+    }
     if( cpu_detect & X264_CPU_ARMV6 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
     if( cpu_detect & X264_CPU_NEON )
-- 
1.7.10.4