[x264-devel] [Git][videolan/x264][master] 3 commits: aarch64/asm: optimize cabac_encode_terminal with extrinsic knowledge

Mon Dec 28 19:39:38 UTC 2020


Janne Grunau pushed to branch master at VideoLAN / x264


Commits:
8578bd9f by Janne Grunau at 2020-12-28T11:53:20+01:00
aarch64/asm: optimize cabac_encode_terminal with extrinsic knowledge

Approach taken from x86 asm. Overall speedup meaningless.
cabac_encode_terminal is on average twice as fast on cortex-a53 while
encoding with the following command:
./x264 --threads 1 --profile high --preset veryfast --crf 15 -o /dev/null park_joy_420_720p50.y4m

Less relative speedup on cortex-a72/73.

- - - - -
9981ea83 by Janne Grunau at 2020-12-28T11:53:34+01:00
aarch64/asm: support offsets in movrel macro

Imported from dav1d.

- - - - -
8bd6d280 by Janne Grunau at 2020-12-28T11:53:34+01:00
aarch64/asm: optimize cabac asm

0.5% - 2% overall speedup on
`./x264 --threads X --profile high --preset veryfast --crf 15 -o /dev/null park_joy_420_720p50.y4m`
cabac is responsible for roughly 1/6 of the CPU use.
Branch mispredictions are reduced by 15% to 20%.

cortex-a53: 0.5% faster
cortex-a72: 2%  faster
neoverse-n1: 0.9% faster

- - - - -


3 changed files:

- common/aarch64/asm-offsets.c
- common/aarch64/asm.S
- common/aarch64/cabac-a.S


Changes:

=====================================
common/aarch64/asm-offsets.c
=====================================
@@ -26,11 +26,19 @@
 #include "common/common.h"
 #include "asm-offsets.h"
 
+#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]
+
 #define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
 { \
-    int m_##m[2 * (offsetof(s, m) == o) - 1]; \
+    STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
+}
+
+#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
+{ \
+    STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
 }
 
+
 X264_CHECK_OFFSET(x264_cabac_t, i_low,               CABAC_I_LOW);
 X264_CHECK_OFFSET(x264_cabac_t, i_range,             CABAC_I_RANGE);
 X264_CHECK_OFFSET(x264_cabac_t, i_queue,             CABAC_I_QUEUE);
@@ -40,3 +48,9 @@ X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
 X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
 X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
 X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
+
+// the aarch64 asm makes following additional assumptions about the x264_cabac_t
+// memory layout
+
+X264_CHECK_REL_OFFSET(x264_cabac_t, i_low,    int, i_range);
+X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue,  int, i_bytes_outstanding);


=====================================
common/aarch64/asm.S
=====================================
@@ -101,15 +101,30 @@ MACH    .const_data
 \name:
 .endm
 
-.macro  movrel rd, val
-#if defined(PIC) && defined(__APPLE__)
+.macro  movrel rd, val, offset=0
+#if defined(__APPLE__)
+  .if \offset < 0
         adrp            \rd, \val@PAGE
         add             \rd, \rd, \val@PAGEOFF
-#elif defined(PIC)
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)@PAGE
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
+  .endif
+#elif defined(PIC) && defined(_WIN32)
+  .if \offset < 0
         adrp            \rd, \val
         add             \rd, \rd, :lo12:\val
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+  .endif
+#elif defined(PIC)
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
 #else
-        ldr             \rd, =\val
+        ldr             \rd, =\val+\offset
 #endif
 .endm
 


=====================================
common/aarch64/cabac-a.S
=====================================
@@ -30,54 +30,51 @@
 // w12 holds x264_cabac_t.i_range
 
 function cabac_encode_decision_asm, export=1
-    movrel      x8,  X264(cabac_range_lps)
-    movrel      x9,  X264(cabac_transition)
-    add         w10, w1, #CABAC_STATE
-    ldrb        w3,  [x0,  x10]         // i_state
+    add         w10, w1,  #CABAC_STATE
+    ldrb        w3,  [x0,  w10, uxtw]           // i_state
     ldr         w12, [x0,  #CABAC_I_RANGE]
-    and         x4,  x3,  #~1
+    movrel      x8,  X264(cabac_range_lps), -4
+    movrel      x9,  X264(cabac_transition)
+    ubfx        x4,  x3,  #1,  #7
     asr         w5,  w12, #6
-    add         x8,  x8,  x4, lsl #1
-    sub         w5,  w5,  #4
-    eor         w6,  w2,  w3            // b ^ i_state
-    ldrb        w4,  [x8,  x5]          // i_range_lps
-    ldr         w11, [x0, #CABAC_I_LOW]
+    add         x8,  x8,  x4, lsl #2
+    orr         w14, w2,  w3, lsl #1
+    ldrb        w4,  [x8,  w5,  uxtw]           // i_range_lps
+    ldr         w11, [x0,  #CABAC_I_LOW]
+    eor         w6,  w2,  w3               	    // b ^ i_state
+    ldrb        w9,  [x9,  w14, uxtw]
     sub         w12, w12, w4
-    tbz         w6,  #0,  1f            // (b ^ i_state) & 1
-    add         w11, w11, w12
-    mov         w12,  w4
-1:
-    orr         w4,  w2,  w3, lsl #1
-    ldrb        w9,  [x9,  x4]
-    strb        w9,  [x0,  x10]    // i_state
+    add         w7,  w11, w12
+    tst         w6,  #1                         // (b ^ i_state) & 1
+    csel        w12, w4, w12, ne
+    csel        w11, w7, w11, ne
+    strb        w9,  [x0,  w10, uxtw]           // i_state
 
 cabac_encode_renorm:
-    clz         w5,  w12
     ldr         w2,  [x0, #CABAC_I_QUEUE]
+    clz         w5,  w12
     sub         w5,  w5,  #23
-    lsl         w12, w12, w5
     lsl         w11, w11, w5
-2:
+    lsl         w12, w12, w5
     adds        w2,  w2,  w5
-    str         w12, [x0, #CABAC_I_RANGE]
-    b.lt        0f
+    b.ge        cabac_putbyte
+
+    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
+    str         w2,  [x0, #CABAC_I_QUEUE]
+    ret
+
+.align 5
 cabac_putbyte:
-    mov         w13, #0x400
-    add         w12, w2,  #10
-    lsl         w13, w13, w2
-    asr         w4,  w11, w12           // out
+    ldr         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
+    add         w14, w2,  #10
+    mov         w13, #-1
     sub         w2,  w2,  #8
-    sub         w13, w13, #1
+    asr         w4,  w11, w14           // out
+    lsl         w13, w13, w14
     subs        w5,  w4,  #0xff
-    and         w11, w11, w13
-    ldr         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
-    str         w2,  [x0, #CABAC_I_QUEUE]
-    b.ne        1f
-
-    add         w6,  w6,  #1
-    str         w11, [x0, #CABAC_I_LOW]
-    str         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
-    ret
+    bic         w11, w11, w13
+    cinc        w6,  w6,  eq
+    b.eq        0f
 
 1:
     ldr         x7,  [x0, #CABAC_P]
@@ -93,15 +90,14 @@ cabac_putbyte:
     b.gt        2b
 3:
     strb        w4,  [x7],  #1
-    str         wzr, [x0, #CABAC_I_BYTES_OUTSTANDING]
     str         x7,  [x0, #CABAC_P]
 0:
-    str         w11, [x0, #CABAC_I_LOW]
-    str         w2,  [x0, #CABAC_I_QUEUE]
+    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
+    stp         w2,  w6,  [x0, #CABAC_I_QUEUE]  // store i_queue, i_bytes_outstanding
     ret
 endfunc
 
-function cabac_encode_bypass_asm, export=1
+function cabac_encode_bypass_asm, export=1, align=5
     ldr         w12, [x0, #CABAC_I_RANGE]
     ldr         w11, [x0, #CABAC_I_LOW]
     ldr         w2,  [x0, #CABAC_I_QUEUE]
@@ -114,9 +110,22 @@ function cabac_encode_bypass_asm, export=1
     ret
 endfunc
 
-function cabac_encode_terminal_asm, export=1
+function cabac_encode_terminal_asm, export=1, align=5
     ldr         w12, [x0, #CABAC_I_RANGE]
-    ldr         w11, [x0, #CABAC_I_LOW]
     sub         w12, w12, #2
-    b           cabac_encode_renorm
+    tbz         w12, #8, 1f
+
+    str         w12, [x0, #CABAC_I_RANGE]
+    ret
+1:
+    ldr         w2,  [x0, #CABAC_I_QUEUE]
+    ldr         w11, [x0, #CABAC_I_LOW]
+    lsl         w12, w12, #1
+    adds        w2,  w2,  #1
+    lsl         w11, w11, #1
+    b.ge        cabac_putbyte
+
+    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
+    str         w2,  [x0, #CABAC_I_QUEUE]
+    ret
 endfunc



View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/4121277b40a667665d4eea1726aefdc55d12d110...8bd6d28025c9dcc101cb194f9141bcff3ea91500

-- 
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/4121277b40a667665d4eea1726aefdc55d12d110...8bd6d28025c9dcc101cb194f9141bcff3ea91500
You're receiving this email because of your account on code.videolan.org.