[x264-devel] [Git][videolan/x264][master] 3 commits: aarch64/asm: optimize cabac_encode_terminal with extrinsic knowledge
Janne Grunau
gitlab at videolan.org
Mon Dec 28 19:39:38 UTC 2020
Janne Grunau pushed to branch master at VideoLAN / x264
Commits:
8578bd9f by Janne Grunau at 2020-12-28T11:53:20+01:00
aarch64/asm: optimize cabac_encode_terminal with extrinsic knowledge
Approach taken from x86 asm. Overall speedup meaningless.
cabac_encode_terminal is on average twice as fast on cortex-a53 while
encoding with the following command:
./x264 --threads 1 --profile high --preset veryfast --crf 15 -o /dev/null park_joy_420_720p50.y4m
Less relative speedup on cortex-a72/73.
- - - - -
9981ea83 by Janne Grunau at 2020-12-28T11:53:34+01:00
aarch64/asm: support offsets in movrel macro
Imported from dav1d.
- - - - -
8bd6d280 by Janne Grunau at 2020-12-28T11:53:34+01:00
aarch64/asm: optimize cabac asm
0.5% - 2% overall speedup on
`./x264 --threads X --profile high --preset veryfast --crf 15 -o /dev/null park_joy_420_720p50.y4m`
cabac is responsible for roughly 1/6 of the CPU use.
Branch mispredictions are reduced by 15% to 20%.
cortex-a53: 0.5% faster
cortex-a72: 2% faster
neoverse-n1: 0.9% faster
- - - - -
3 changed files:
- common/aarch64/asm-offsets.c
- common/aarch64/asm.S
- common/aarch64/cabac-a.S
Changes:
=====================================
common/aarch64/asm-offsets.c
=====================================
@@ -26,11 +26,19 @@
#include "common/common.h"
#include "asm-offsets.h"
+#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]
+
#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
{ \
- int m_##m[2 * (offsetof(s, m) == o) - 1]; \
+ STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
+}
+
+#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
+{ \
+ STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
}
+
X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW);
X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE);
X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE);
@@ -40,3 +48,9 @@ X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P);
X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END);
X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED);
X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
+
+// the aarch64 asm makes following additional assumptions about the x264_cabac_t
+// memory layout
+
+X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range);
+X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding);
=====================================
common/aarch64/asm.S
=====================================
@@ -101,15 +101,30 @@ MACH .const_data
\name:
.endm
-.macro movrel rd, val
-#if defined(PIC) && defined(__APPLE__)
+.macro movrel rd, val, offset=0
+#if defined(__APPLE__)
+ .if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
-#elif defined(PIC)
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)@PAGE
+ add \rd, \rd, \val+(\offset)@PAGEOFF
+ .endif
+#elif defined(PIC) && defined(_WIN32)
+ .if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+ .endif
+#elif defined(PIC)
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
#else
- ldr \rd, =\val
+ ldr \rd, =\val+\offset
#endif
.endm
=====================================
common/aarch64/cabac-a.S
=====================================
@@ -30,54 +30,51 @@
// w12 holds x264_cabac_t.i_range
function cabac_encode_decision_asm, export=1
- movrel x8, X264(cabac_range_lps)
- movrel x9, X264(cabac_transition)
- add w10, w1, #CABAC_STATE
- ldrb w3, [x0, x10] // i_state
+ add w10, w1, #CABAC_STATE
+ ldrb w3, [x0, w10, uxtw] // i_state
ldr w12, [x0, #CABAC_I_RANGE]
- and x4, x3, #~1
+ movrel x8, X264(cabac_range_lps), -4
+ movrel x9, X264(cabac_transition)
+ ubfx x4, x3, #1, #7
asr w5, w12, #6
- add x8, x8, x4, lsl #1
- sub w5, w5, #4
- eor w6, w2, w3 // b ^ i_state
- ldrb w4, [x8, x5] // i_range_lps
- ldr w11, [x0, #CABAC_I_LOW]
+ add x8, x8, x4, lsl #2
+ orr w14, w2, w3, lsl #1
+ ldrb w4, [x8, w5, uxtw] // i_range_lps
+ ldr w11, [x0, #CABAC_I_LOW]
+ eor w6, w2, w3 // b ^ i_state
+ ldrb w9, [x9, w14, uxtw]
sub w12, w12, w4
- tbz w6, #0, 1f // (b ^ i_state) & 1
- add w11, w11, w12
- mov w12, w4
-1:
- orr w4, w2, w3, lsl #1
- ldrb w9, [x9, x4]
- strb w9, [x0, x10] // i_state
+ add w7, w11, w12
+ tst w6, #1 // (b ^ i_state) & 1
+ csel w12, w4, w12, ne
+ csel w11, w7, w11, ne
+ strb w9, [x0, w10, uxtw] // i_state
cabac_encode_renorm:
- clz w5, w12
ldr w2, [x0, #CABAC_I_QUEUE]
+ clz w5, w12
sub w5, w5, #23
- lsl w12, w12, w5
lsl w11, w11, w5
-2:
+ lsl w12, w12, w5
adds w2, w2, w5
- str w12, [x0, #CABAC_I_RANGE]
- b.lt 0f
+ b.ge cabac_putbyte
+
+ stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
+ str w2, [x0, #CABAC_I_QUEUE]
+ ret
+
+.align 5
cabac_putbyte:
- mov w13, #0x400
- add w12, w2, #10
- lsl w13, w13, w2
- asr w4, w11, w12 // out
+ ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
+ add w14, w2, #10
+ mov w13, #-1
sub w2, w2, #8
- sub w13, w13, #1
+ asr w4, w11, w14 // out
+ lsl w13, w13, w14
subs w5, w4, #0xff
- and w11, w11, w13
- ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
- str w2, [x0, #CABAC_I_QUEUE]
- b.ne 1f
-
- add w6, w6, #1
- str w11, [x0, #CABAC_I_LOW]
- str w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
- ret
+ bic w11, w11, w13
+ cinc w6, w6, eq
+ b.eq 0f
1:
ldr x7, [x0, #CABAC_P]
@@ -93,15 +90,14 @@ cabac_putbyte:
b.gt 2b
3:
strb w4, [x7], #1
- str wzr, [x0, #CABAC_I_BYTES_OUTSTANDING]
str x7, [x0, #CABAC_P]
0:
- str w11, [x0, #CABAC_I_LOW]
- str w2, [x0, #CABAC_I_QUEUE]
+ stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
+ stp w2, w6, [x0, #CABAC_I_QUEUE] // store i_queue, i_bytes_outstanding
ret
endfunc
-function cabac_encode_bypass_asm, export=1
+function cabac_encode_bypass_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
ldr w11, [x0, #CABAC_I_LOW]
ldr w2, [x0, #CABAC_I_QUEUE]
@@ -114,9 +110,22 @@ function cabac_encode_bypass_asm, export=1
ret
endfunc
-function cabac_encode_terminal_asm, export=1
+function cabac_encode_terminal_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
- ldr w11, [x0, #CABAC_I_LOW]
sub w12, w12, #2
- b cabac_encode_renorm
+ tbz w12, #8, 1f
+
+ str w12, [x0, #CABAC_I_RANGE]
+ ret
+1:
+ ldr w2, [x0, #CABAC_I_QUEUE]
+ ldr w11, [x0, #CABAC_I_LOW]
+ lsl w12, w12, #1
+ adds w2, w2, #1
+ lsl w11, w11, #1
+ b.ge cabac_putbyte
+
+ stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
+ str w2, [x0, #CABAC_I_QUEUE]
+ ret
endfunc
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/4121277b40a667665d4eea1726aefdc55d12d110...8bd6d28025c9dcc101cb194f9141bcff3ea91500
--
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/4121277b40a667665d4eea1726aefdc55d12d110...8bd6d28025c9dcc101cb194f9141bcff3ea91500
You're receiving this email because of your account on code.videolan.org.
More information about the x264-devel
mailing list