[x265] [PATCH] testbench: port x264 stack & register check code for ARM arch

Mon Feb 1 06:36:00 CET 2016

# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1454304881 -19800
#      Mon Feb 01 11:04:41 2016 +0530
# Node ID 6995de365269cf92af0d7d350557385ebff568d8
# Parent  548a45bbf2232d6321bc06ec25979499e233405d
testbench: port x264 stack & register check code for ARM arch

diff -r 548a45bbf223 -r 6995de365269 source/common/arm/asm.S

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/asm.S	Mon Feb 01 11:04:41 2016 +0530
@@ -0,0 +1,185 @@
+/*****************************************************************************
+ * asm.S: arm utility macros
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Mans Rullgard <mans at mansr.com>
+ *          David Conrad <lessen42 at gmail.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "x265_config.h"
+
+.syntax unified                 // unified ARM/Thumb assembler syntax
+// Pick the assembler target architecture from the HAVE_NEON / HAVE_ARMV6T2 / HAVE_ARMV6 feature macros (first match wins).
+#if   HAVE_NEON
+        .arch           armv7-a
+#elif HAVE_ARMV6T2
+        .arch           armv6t2
+#elif HAVE_ARMV6
+        .arch           armv6
+#endif
+// NEON instructions are accepted unconditionally, independent of the .arch selected above.
+.fpu neon
+// EXTERN_ASM: prefix put in front of exported symbol names; an underscore when PREFIX is defined, empty otherwise.
+#ifdef PREFIX
+#   define EXTERN_ASM _
+#else
+#   define EXTERN_ASM
+#endif
+// ELF: empty on ELF targets, the @ comment character elsewhere, so ELF-prefixed directives vanish on non-ELF targets.
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+// FUNC: same mechanism for .func/.endfunc, which are only emitted when HAVE_AS_FUNC is nonzero.
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+// require8 val=1: emit EABI build attribute 24 (Tag_ABI_align_needed) with the given value; ELF targets only.
+.macro require8, val=1
+ELF     .eabi_attribute 24, \val
+.endm
+// preserve8 val=1: emit EABI build attribute 25 (Tag_ABI_align_preserved) with the given value; ELF targets only.
+.macro preserve8, val=1
+ELF     .eabi_attribute 25, \val
+.endm
+// function name, export=1: open a 4-byte-aligned function label and define a one-shot endfunc macro that closes it.
+.macro function name, export=1
+    .macro endfunc                          // defined per function so it can refer to this function name
+ELF     .size   \name, . - \name            // NOTE(review): unprefixed name even when export=1; matches the label only while EXTERN_ASM is empty
+FUNC    .endfunc
+        .purgem endfunc                     // drop endfunc so the next function can define it again
+    .endm
+        .align  2
+.if \export == 1
+        .global EXTERN_ASM\name             // exported: global symbol carrying the EXTERN_ASM prefix
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF     .hidden \name                       // not exported: unprefixed label, no .global
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+// movrel rd, val: load the address of symbol val (or a constant) into register rd.
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !defined(PIC)
+        movw            \rd, #:lower16:\val     // absolute 16+16 bit immediate load; not used when PIC is defined
+        movt            \rd, #:upper16:\val
+#else
+        ldr             \rd, =\val              // pseudo-instruction; the assembler chooses the encoding
+#endif
+.endm
+// movconst rd, val: load the assemble-time constant val into register rd.
+.macro movconst rd, val
+#if HAVE_ARMV6T2
+    movw        \rd, #:lower16:\val             // low halfword
+.if \val >> 16
+    movt        \rd, #:upper16:\val             // high halfword, emitted only when it is nonzero
+.endif
+#else
+    ldr         \rd, =\val
+#endif
+.endm
+// X(s): symbol s with the EXTERN_ASM prefix pasted on; JOIN expands its arguments before GLUE pastes them.
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+// Stride constants, not referenced in this file. NOTE(review): presumably the x264-style fenc/fdec row strides; confirm.
+#define FENC_STRIDE 16
+#define FDEC_STRIDE 32
+// HORIZ_ADD dest, a, b: optionally add b into a (u16 lanes), then pairwise-widen a twice (u16 -> u32 -> u64) into dest. Clobbers a.
+.macro HORIZ_ADD dest, a, b
+.ifnb \b
+    vadd.u16    \a, \a, \b                      // only when a third operand was supplied
+.endif
+    vpaddl.u16  \a, \a                          // adjacent u16 pairs -> u32 sums
+    vpaddl.u32  \dest, \a                       // adjacent u32 pairs -> u64 sums
+.endm
+// SUMSUB_AB sum, diff, a, b: sum = a + b, diff = a - b (s16 lanes). sum must not alias a or b: vsub reads them after vadd.
+.macro SUMSUB_AB sum, diff, a, b
+    vadd.s16    \sum,  \a, \b
+    vsub.s16    \diff, \a, \b
+.endm
+// SUMSUB_ABCD: two SUMSUB_AB butterflies, (s1, d1) from (a, b) and (s2, d2) from (c, d).
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+    SUMSUB_AB   \s1, \d1, \a, \b
+    SUMSUB_AB   \s2, \d2, \c, \d
+.endm
+// ABS2 a, b: replace a and b in place with their lane-wise s16 absolute values.
+.macro ABS2 a b
+    vabs.s16 \a, \a
+    vabs.s16 \b, \b
+.endm
+// HADAMARD: interleave s1/s2 with vtrn, then combine them into d1 (and d2 for sumsub).
+// dist = distance in elements; 1 selects vtrn.16, every other value (including 0) selects vtrn.32
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers (d2 is written only when op is sumsub)
+// s1/2 = source registers (modified in place by vtrn, and by vabs when op is not sumsub)
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+    vtrn.16     \s1, \s2
+.else
+    vtrn.32     \s1, \s2
+.endif
+.ifc \op, sumsub
+    SUMSUB_AB   \d1, \d2, \s1, \s2
+.else
+    vabs.s16    \s1, \s1
+    vabs.s16    \s2, \s2
+    vmax.s16    \d1, \s1, \s2
+.endif
+.endm
+// TRANSPOSE8x8 r0..r7: in-place transpose built from three vtrn stages (32-bit, 16-bit, then 8-bit lanes) over eight registers.
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
+    vtrn.32         \r0, \r4                    // stage 1: registers four apart
+    vtrn.32         \r1, \r5
+    vtrn.32         \r2, \r6
+    vtrn.32         \r3, \r7
+    vtrn.16         \r0, \r2                    // stage 2: registers two apart
+    vtrn.16         \r1, \r3
+    vtrn.16         \r4, \r6
+    vtrn.16         \r5, \r7
+    vtrn.8          \r0, \r1                    // stage 3: neighbouring registers
+    vtrn.8          \r2, \r3
+    vtrn.8          \r4, \r5
+    vtrn.8          \r6, \r7
+.endm
+// TRANSPOSE4x4 r0..r3: in-place transpose of 8-bit elements using a vtrn.16 stage followed by a vtrn.8 stage.
+.macro TRANSPOSE4x4 r0 r1 r2 r3
+    vtrn.16         \r0, \r2                    // registers two apart
+    vtrn.16         \r1, \r3
+    vtrn.8          \r0, \r1                    // neighbouring registers
+    vtrn.8          \r2, \r3
+.endm
+// TRANSPOSE4x4_16 d0..d3: in-place transpose of 16-bit elements using a vtrn.32 stage followed by a vtrn.16 stage.
+.macro TRANSPOSE4x4_16  d0 d1 d2 d3
+    vtrn.32     \d0, \d2                        // registers two apart
+    vtrn.32     \d1, \d3
+    vtrn.16     \d0, \d1                        // neighbouring registers
+    vtrn.16     \d2, \d3
+.endm
diff -r 548a45bbf223 -r 6995de365269 source/common/arm/cpu-a.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/cpu-a.S	Mon Feb 01 11:04:41 2016 +0530
@@ -0,0 +1,109 @@
+/*****************************************************************************
+ * cpu-a.S: arm cpu detection
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.align 2
+
+// NEON probe: runs one NEON instruction and returns. Done in gas because .fpu neon overrides the
+// refusal to assemble instructions the selected -march/-mcpu does not support.
+function x265_cpu_neon_test
+    vadd.i16    q0, q0, q0                  // NOTE(review): presumably traps on cores without NEON, with the caller handling it; confirm
+    bx          lr
+endfunc
+// x265_cpu_enable_armv7_counter (file-local): enable the cycle counter at full resolution. Clobbers r0, r2 and the flags.
+// return: 0 on success
+//         1 if counters were already enabled
+//         9 if lo-res counters were already enabled
+function x265_cpu_enable_armv7_counter, export=0
+    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
+    ands        r0, r2, #1                  // r0 = enable bit; Z is set when the counters were off
+    andne       r0, r2, #9                  // already on: also report bit 3 (lo-res)
+
+    orr         r2, r2, #1                  // enable counters
+    bic         r2, r2, #8                  // full resolution
+    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC (only when the counters were off)
+    mov         r2, #1 << 31                // enable cycle counter
+    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
+    bx          lr
+endfunc
+// x265_cpu_disable_armv7_counter (file-local): clear the PMNC enable bit. Clobbers r0.
+function x265_cpu_disable_armv7_counter, export=0
+    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
+    bic         r0, r0, #1                  // disable counters
+    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
+    bx          lr
+endfunc
+
+// READ_TIME r: read the current counter value from coprocessor register c9, c13, 0 (the PMU cycle counter) into r.
+.macro READ_TIME r
+    mrc         p15, 0, \r, c9, c13, 0
+.endm
+// x265_cpu_fast_neon_mrc_test: time neon -> arm register transfers with the cycle counter.
+// return: 0 if neon -> arm transfers take more than 10 cycles, or if user mode has no access to the counters
+//         nonzero otherwise
+function x265_cpu_fast_neon_mrc_test
+    // check for user access to performance counters
+    mrc         p15, 0, r0, c9, c14, 0
+    cmp         r0, #0
+    bxeq        lr                          // no access: return with r0 == 0
+    // r3 = accepted tick total, ip = samples still needed, r6 = attempts during which slow samples are rejected, r5 = passes per sample
+    push        {r4-r6,lr}
+    bl          x265_cpu_enable_armv7_counter
+    ands        r1, r0, #8                  // were lo-res counters already enabled?
+    mov         r3, #0
+    mov         ip, #4
+    mov         r6, #4
+    moveq       r5, #1                      // full resolution: one pass of 8 transfers per sample
+    movne       r5, #64                     // lo-res: 64 passes per sample
+
+average_loop:
+    mov         r4, r5
+    READ_TIME   r1
+1:  subs        r4, r4, #1
+.rept 8
+    vmov.u32    lr, d0[0]                   // the neon -> arm transfer being timed
+    add         lr, lr, lr                  // uses the transferred value right away
+.endr
+    bgt         1b                          // flags still come from the subs at label 1
+    READ_TIME   r2
+
+    subs        r6, r6, #1
+    sub         r2, r2, r1                  // r2 = elapsed ticks for this sample
+    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles per transfer (8 transfers)
+    addle       r3, r3, r2                  // accept the sample (always accepted once r6 has run out)
+    subsle      ip, ip, #1
+    bgt         average_loop
+
+    // disable counters if we enabled them
+    ands        r0, r0, #1                  // r0 still holds the result of the enable call
+    bleq        x265_cpu_disable_armv7_counter
+
+    lsr         r0, r3, #5                  // average: 4 samples x 8 transfers = 32
+    cmp         r0, #10
+    movgt       r0, #0                      // too slow: report 0, otherwise return the average itself
+    pop         {r4-r6,pc}
+endfunc
diff -r 548a45bbf223 -r 6995de365269 source/test/CMakeLists.txt
--- a/source/test/CMakeLists.txt	Fri Jan 29 15:42:09 2016 +0530
+++ b/source/test/CMakeLists.txt	Mon Feb 01 11:04:41 2016 +0530
@@ -23,7 +23,13 @@
 
 # add ARM assembly files
 if(ARM OR CROSS_COMPILE_ARM)
-    set(YASM_SRC)
+    enable_language(ASM)
+    set(YASM_SRC checkasm-arm.S)
+    add_custom_command(
+        OUTPUT checkasm-arm.obj
+        COMMAND ${CMAKE_CXX_COMPILER}
+        ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+        DEPENDS checkasm-arm.S)
 endif(ARM OR CROSS_COMPILE_ARM)
 
 # add PowerPC assembly files
diff -r 548a45bbf223 -r 6995de365269 source/test/checkasm-arm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/test/checkasm-arm.S	Mon Feb 01 11:04:41 2016 +0530
@@ -0,0 +1,133 @@
+/****************************************************************************
+ * checkasm-arm.S: assembly check tool
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Martin Storsjo <martin at martin.st>
+ *          Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "../common/arm/asm.S"
+
+.section .rodata
+.align 4                                    @ power-of-two argument on ARM: 16-byte alignment
+register_init:                              @ 64 bytes of sentinel data: all of it for q4-q7, the first 32 bytes for r4-r11
+.quad 0x21f86d66c8ca00ce
+.quad 0x75b6ba21077c48ad
+.quad 0xed56bb2dcb3c7736
+.quad 0x8bda43d3fd1a7e06
+.quad 0xb64a9c9e5d318408
+.quad 0xdf9a54b303f1d3a3
+.quad 0x4a75479abd64e097
+.quad 0x249214109d5d1c88
+@ message passed to puts when a checked register does not match its sentinel after the call
+error_message:
+.asciz "failed to preserve register"
+
+.text
+
+@ max number of args used by any x265 asm function.
+#define MAX_ARGS 15
+@ bytes reserved for the copied stack args, rounded up to a multiple of 8 so that sp stays 8-byte aligned at the call (AAPCS)
+#define ARG_STACK ((4*(MAX_ARGS - 2) + 7) & ~7)
+@ clobbercheck variant: defines x265_checkasm_call_<variant>. In: r0 = function to call, r1 = int pointer cleared on failure, r2/r3/stack = its arguments. Out: r0/r1 = its return value.
+.macro clobbercheck variant
+.equ pushed, 4*10                           @ bytes pushed before the argument copy: r4-r11 and lr (36) plus the saved r1 (4)
+function x265_checkasm_call_\variant
+    push        {r4-r11, lr}
+.ifc \variant, neon
+    vpush       {q4-q7}
+.equ pushed, pushed + 16*4
+.endif
+    @ fill the callee-saved registers with the register_init sentinel pattern
+    movrel      r12, register_init
+.ifc \variant, neon
+    vldm        r12, {q4-q7}
+.endif
+    ldm         r12, {r4-r11}
+    @ keep the int pointer (r1) on the stack across the call
+    push        {r1}
+    @ copy the caller stack arguments, from its third stack word on, into a fresh area for the callee
+    sub         sp,  sp,  #ARG_STACK
+.equ pos, 0
+.rept MAX_ARGS-2
+    ldr         r12, [sp, #ARG_STACK + pushed + 8 + pos]
+    str         r12, [sp, #pos]
+.equ pos, pos + 4
+.endr
+    @ shift the arguments: callee r0/r1 = our r2/r3, callee r2/r3 = first two caller stack words
+    mov         r12, r0
+    mov         r0,  r2
+    mov         r1,  r3
+    ldrd        r2,  r3,  [sp, #ARG_STACK + pushed]
+    blx         r12
+    add         sp,  sp,  #ARG_STACK
+    ldr         r2,  [sp]                   @ r2 = int pointer; its slot stays allocated so that sp is 8-byte aligned at the puts call
+    @ save the callee return value, then fold every difference from the sentinel pattern into r3
+    push        {r0, r1}
+    movrel      r12, register_init
+.ifc \variant, neon
+    vldm        r12, {q0-q3}
+    veor        q0,  q0,  q4
+    veor        q1,  q1,  q5
+    veor        q2,  q2,  q6
+    veor        q3,  q3,  q7
+    vorr        q0,  q0,  q1
+    vorr        q0,  q0,  q2
+    vorr        q0,  q0,  q3
+    vorr        d0,  d0,  d1
+    vrev64.32   d1,  d0
+    vorr        d0,  d0,  d1
+    vmov.32     r3,  d0[0]
+.else
+    mov         r3,  #0
+.endif
+
+.macro check_reg reg1, reg2
+    ldrd        r0,  r1,  [r12], #8
+    eor         r0,  r0, \reg1
+    eor         r1,  r1, \reg2
+    orr         r3,  r3, r0
+    orr         r3,  r3, r1
+.endm
+    check_reg   r4,  r5
+    check_reg   r6,  r7
+    check_reg   r8,  r9
+    check_reg   r10, r11
+.purgem check_reg
+
+    cmp         r3,  #0
+    beq         0f
+    @ mismatch: store 0 through the int pointer and print the error message
+    mov         r12, #0
+    str         r12, [r2]
+    movrel      r0, error_message
+    bl          puts
+0:
+    pop         {r0, r1, r2}                @ restore the return value; r2 only releases the int pointer slot
+.ifc \variant, neon
+    vpop        {q4-q7}
+.endif
+    pop         {r4-r11, pc}
+endfunc
+.endm
+@ instantiate x265_checkasm_call_neon and x265_checkasm_call_noneon
+clobbercheck neon
+clobbercheck noneon