[x264-devel] [PATCH 23/23] aarch64: cabac_encode_{decision, bypass, terminal}_asm

Janne Grunau janne-x264 at jannau.net
Thu Nov 27 08:56:51 CET 2014


benchmarks on a Nexus 9 (nvidia denver):
101.3 cycles in x264_cabac_encode_decision_c,   67105369 runs, 3495 skips
 97.3 cycles in x264_cabac_encode_decision_asm, 67105493 runs, 3371 skips
132.8 cycles in x264_cabac_encode_terminal_c,    1046950 runs, 1626 skips
116.1 cycles in x264_cabac_encode_terminal_asm,  1048424 runs, 152 skips
 92.4 cycles in x264_cabac_encode_bypass_c,     16776192 runs, 1024 skips
 89.6 cycles in x264_cabac_encode_bypass_asm,   16776453 runs, 763 skips

Cycle counts are not as stable as one would like. The dynamic code
optimisation seems to produce different results for small changes in a
binary. Repeated runs with the same binary produce stable results
though (ignoring the first run).
---
 Makefile                     |   4 +-
 common/aarch64/asm-offsets.c |  43 +++++++++++++++
 common/aarch64/asm-offsets.h |  37 +++++++++++++
 common/aarch64/cabac-a.S     | 122 +++++++++++++++++++++++++++++++++++++++++++
 common/cabac.h               |   4 ++
 tools/checkasm.c             |   2 +
 6 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 common/aarch64/asm-offsets.c
 create mode 100644 common/aarch64/asm-offsets.h
 create mode 100644 common/aarch64/cabac-a.S

diff --git a/Makefile b/Makefile
index f293542..12c74e4 100644
--- a/Makefile
+++ b/Makefile
@@ -129,13 +129,15 @@ endif
 ifeq ($(ARCH),AARCH64)
 ifneq ($(AS),)
 ASMSRC += common/aarch64/bitstream-a.S \
+          common/aarch64/cabac-a.S     \
           common/aarch64/dct-a.S     \
           common/aarch64/deblock-a.S \
           common/aarch64/mc-a.S      \
           common/aarch64/pixel-a.S   \
           common/aarch64/predict-a.S \
           common/aarch64/quant-a.S
-SRCS   += common/aarch64/mc-c.c      \
+SRCS   += common/aarch64/asm-offsets.c \
+          common/aarch64/mc-c.c        \
           common/aarch64/predict-c.c
 OBJASM  = $(ASMSRC:%.S=%.o)
 endif
diff --git a/common/aarch64/asm-offsets.c b/common/aarch64/asm-offsets.c
new file mode 100644
index 0000000..2fcf5a4
--- /dev/null
+++ b/common/aarch64/asm-offsets.c
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * aarch64/asm-offsets.c: check asm offsets for aarch64
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include <stdint.h>
+#include "common/osdep.h"
+#include "common/common.h"
+#include "common/cabac.h"
+#include "asm-offsets.h"
+
+#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m {    /* compile-time check that offsetof(s, m) equals o; declares a struct that is never used */ \
+        int m_##m[2 * (offsetof(s, m) == o) - 1];              /* array length is 1 when the offsets match and -1 (rejected by the compiler) when they do not */ \
+    } /* no trailing ';' here: each invocation supplies its own */
+
+
+X264_CHECK_OFFSET(x264_cabac_t, i_low,               CABAC_I_LOW); /* one check per x264_cabac_t field: compilation fails if the field's offset differs from the constant in asm-offsets.h */
+X264_CHECK_OFFSET(x264_cabac_t, i_range,             CABAC_I_RANGE);
+X264_CHECK_OFFSET(x264_cabac_t, i_queue,             CABAC_I_QUEUE);
+X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
+X264_CHECK_OFFSET(x264_cabac_t, p_start,             CABAC_P_START);
+X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
+X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
+X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
+X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE); /* start of the state[] array that cabac-a.S indexes by byte */
diff --git a/common/aarch64/asm-offsets.h b/common/aarch64/asm-offsets.h
new file mode 100644
index 0000000..64d5193
--- /dev/null
+++ b/common/aarch64/asm-offsets.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * aarch64/asm-offsets.h: asm offsets for aarch64
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_ASM_OFFSETS_H
+#define X264_AARCH64_ASM_OFFSETS_H
+
+#define CABAC_I_LOW                 0x00 /* byte offset of x264_cabac_t.i_low; accessed with 32-bit ldr/str in cabac-a.S */
+#define CABAC_I_RANGE               0x04 /* byte offset of x264_cabac_t.i_range; 32-bit access */
+#define CABAC_I_QUEUE               0x08 /* byte offset of x264_cabac_t.i_queue; 32-bit access */
+#define CABAC_I_BYTES_OUTSTANDING   0x0c /* byte offset of x264_cabac_t.i_bytes_outstanding; 32-bit access */
+#define CABAC_P_START               0x10 /* byte offset of x264_cabac_t.p_start; not referenced by cabac-a.S in this patch */
+#define CABAC_P                     0x18 /* byte offset of x264_cabac_t.p; loaded/stored as a 64-bit pointer */
+#define CABAC_P_END                 0x20 /* byte offset of x264_cabac_t.p_end; not referenced by cabac-a.S in this patch */
+#define CABAC_F8_BITS_ENCODED       0x30 /* byte offset of x264_cabac_t.f8_bits_encoded; NOTE(review): 0x28..0x2f is skipped, presumably alignment padding in the struct -- confirm */
+#define CABAC_STATE                 0x34 /* byte offset of x264_cabac_t.state[]; cabac-a.S adds the context index and uses byte loads/stores */
+
+#endif
diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S
new file mode 100644
index 0000000..ffcbb92
--- /dev/null
+++ b/common/aarch64/cabac-a.S
@@ -0,0 +1,122 @@
+/*****************************************************************************
+ * cabac-a.S: aarch64 cabac
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * Authors: Janne Grunau <janne-x264 at jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "asm-offsets.h"
+
+// w11 holds x264_cabac_t.i_low
+// w12 holds x264_cabac_t.i_range
+
+function x264_cabac_encode_decision_asm, export=1  // x0 = cb, w1 = context index, w2 = b; presumably (x264_cabac_t *cb, int i_ctx, int b) -- confirm against cabac.h
+    movrel      x8,  X(x264_cabac_range_lps)    // x8 = address of x264_cabac_range_lps
+    movrel      x9,  X(x264_cabac_transition)   // x9 = address of x264_cabac_transition
+    add         w10, w1, #CABAC_STATE           // x10 = byte offset of cb->state[w1]
+    ldrb        w3,  [x0,  x10]         // i_state
+    ldr         w12, [x0,  #CABAC_I_RANGE]      // w12 = i_range
+    and         x4,  x3,  #~1           // i_state with bit 0 cleared
+    asr         w5,  w12, #6            // i_range >> 6
+    add         x8,  x8,  x4, lsl #1    // row address = table + (i_state >> 1) * 4 (four one-byte entries per row)
+    sub         w5,  w5,  #4            // column = (i_range >> 6) - 4
+    eor         w6,  w2,  w3            // b ^ i_state
+    ldrb        w4,  [x8,  x5]          // i_range_lps
+    ldr         w11, [x0, #CABAC_I_LOW] // w11 = i_low
+    sub         w12, w12, w4            // i_range -= i_range_lps
+    tbz         w6,  #0,  1f            // (b ^ i_state) & 1; when it is 0 skip the next two instructions
+    add         w11, w11, w12           // i_low += i_range (already reduced by i_range_lps)
+    mov         w12,  w4                // i_range = i_range_lps
+1:
+    orr         w4,  w2,  w3, lsl #1    // transition table index = (i_state << 1) | b
+    ldrb        w9,  [x9,  x4]          // next state
+    strb        w9,  [x0,  x10]    // i_state: write the next state back to cb->state[w1]
+
+cabac_encode_renorm:                    // shared tail: x264_cabac_encode_terminal_asm branches here with w11 = i_low, w12 = i_range
+    clz         w5,  w12                // leading zero count of i_range
+    ldr         w2,  [x0, #CABAC_I_QUEUE]       // w2 = i_queue (b is no longer needed)
+    sub         w5,  w5,  #23           // shift = clz - 23: moves the highest set bit of i_range to bit 8
+    lsl         w12, w12, w5            // i_range <<= shift
+    lsl         w11, w11, w5            // i_low   <<= shift
+2:                                      // no branch in this file targets this label (the 2b further down binds to the later 2:)
+    adds        w2,  w2,  w5            // i_queue += shift, setting the flags for b.lt below
+    str         w12, [x0, #CABAC_I_RANGE]       // store the renormalised i_range (str leaves the flags intact)
+    b.lt        0f                      // i_queue still negative: only store i_low and i_queue
+cabac_putbyte:                          // shared tail: x264_cabac_encode_bypass_asm branches here with w11 = i_low, w2 = i_queue >= 0
+    mov         w13, #0x400
+    add         w12, w2,  #10           // w12 reused as scratch: shift count = i_queue + 10
+    lsl         w13, w13, w2            // 0x400 << i_queue
+    asr         w4,  w11, w12           // out = i_low >> (i_queue + 10)
+    sub         w2,  w2,  #8            // i_queue -= 8
+    sub         w13, w13, #1            // mask = (0x400 << old i_queue) - 1
+    subs        w5,  w4,  #0xff         // Z flag set iff out == 0xff; consumed by b.ne below
+    and         w11, w11, w13           // i_low &= mask: keep only the bits below those moved into out
+    ldr         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]   // w6 = i_bytes_outstanding
+    str         w2,  [x0, #CABAC_I_QUEUE]       // store the updated i_queue
+    b.ne        1f                      // out != 0xff: go write bytes to the stream
+
+    add         w6,  w6,  #1            // out == 0xff: write nothing, count one more outstanding byte
+    str         w11, [x0, #CABAC_I_LOW]
+    str         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
+    ret
+
+1:
+    ldr         x7,  [x0, #CABAC_P]     // x7 = cb->p (output write pointer)
+    asr         w5,  w4,  #8            // carry = out >> 8
+    ldrb        w8,  [x7, #-1]          // read the byte just before the write pointer
+    add         w8,  w8,  w5            // add the carry to it
+    sub         w5,  w5,  #1            // fill value = carry - 1; only its low byte is stored below
+    strb        w8,  [x7, #-1]          // write the carry-adjusted byte back to p[-1]
+    cbz         w6,  3f                 // no outstanding bytes: skip the fill loop
+2:
+    subs        w6,  w6,  #1            // one fewer outstanding byte
+    strb        w5,  [x7],  #1          // *p++ = carry - 1
+    b.gt        2b                      // loop while outstanding bytes remain
+3:  // write the new byte after any fill bytes
+    strb        w4,  [x7],  #1          // *p++ = low 8 bits of out
+    str         wzr, [x0, #CABAC_I_BYTES_OUTSTANDING]   // i_bytes_outstanding = 0
+    str         x7,  [x0, #CABAC_P]     // store the advanced write pointer
+0:
+    str         w11, [x0, #CABAC_I_LOW] // store i_low
+    str         w2,  [x0, #CABAC_I_QUEUE]       // store i_queue
+    ret
+endfunc
+
+function x264_cabac_encode_bypass_asm, export=1    // x0 = cb, w1 = b; presumably (x264_cabac_t *cb, int b) -- confirm against cabac.h
+    ldr         w12, [x0, #CABAC_I_RANGE]       // w12 = i_range (read only; never stored back on this path)
+    ldr         w11, [x0, #CABAC_I_LOW]         // w11 = i_low
+    ldr         w2,  [x0, #CABAC_I_QUEUE]       // w2 = i_queue
+    and         w1,  w1,  w12           // b & i_range; NOTE(review): gives i_range only if b has all those bits set, so callers presumably pass 0 or -1 -- confirm
+    add         w11, w1,  w11, lsl #1   // i_low = (i_low << 1) + (b & i_range)
+    adds        w2,  w2,  #1            // i_queue += 1, setting the flags for b.ge below
+    b.ge        cabac_putbyte           // i_queue >= 0: continue in the cabac_putbyte tail of x264_cabac_encode_decision_asm, which also returns
+    str         w11, [x0, #CABAC_I_LOW] // otherwise just store i_low
+    str         w2,  [x0, #CABAC_I_QUEUE]       // and i_queue
+    ret
+endfunc
+
+function x264_cabac_encode_terminal_asm, export=1  // x0 = cb; presumably (x264_cabac_t *cb) -- confirm against cabac.h
+    ldr         w12, [x0, #CABAC_I_RANGE]       // w12 = i_range
+    ldr         w11, [x0, #CABAC_I_LOW]         // w11 = i_low
+    sub         w12, w12, #2            // i_range -= 2
+    b           cabac_encode_renorm     // continue in the renorm tail of x264_cabac_encode_decision_asm, which stores the state and returns
+endfunc
diff --git a/common/cabac.h b/common/cabac.h
index dbe6820..cc27761 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -72,6 +72,10 @@ void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
 #define x264_cabac_encode_decision x264_cabac_encode_decision_asm
 #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
 #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
+#elif defined(ARCH_AARCH64)
+#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
 #else
 #define x264_cabac_encode_decision x264_cabac_encode_decision_c
 #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
diff --git a/tools/checkasm.c b/tools/checkasm.c
index b6a1140..08c1917 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2432,6 +2432,8 @@ static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
 DECL_CABAC(c)
 #if HAVE_MMX
 DECL_CABAC(asm)
+#elif defined(ARCH_AARCH64)
+DECL_CABAC(asm)
 #else
 #define run_cabac_decision_asm run_cabac_decision_c
 #define run_cabac_bypass_asm run_cabac_bypass_c
-- 
2.1.3



More information about the x264-devel mailing list