[x265] [PATCH 01/12] AArch64: Fix costCoeffNxN test on Apple Silicon
Hari Limaye
hari.limaye at arm.com
Thu May 2 21:19:36 UTC 2024
The assembly routine x265_costCoeffNxN_neon is buggy and produces an
incorrect result on Apple Silicon, causing the pixel testbench to fail
on these platforms.
x265_costCoeffNxN_neon assumes that parameter `int subPosBase`, the second
parameter of type `int` passed on the stack, is at position `sp + 8`;
this assumption is consistent with the AArch64 PCS, as arguments smaller
than 8 bytes are widened to 8 bytes (aapcs64 6.8.2 C.16).
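However, the Apple arm64 ABI (including arm64e) diverges from AAPCS64:
'Function arguments may consume slots on the stack that are not multiples
of 8 bytes'. On Apple platforms the two `int` stack arguments are therefore
packed into consecutive 4-byte slots, and `subPosBase` is at `sp + 4`.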
---
source/common/aarch64/asm.S | 12 +++++++++++-
source/common/aarch64/pixel-util.S | 4 ++--
2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
index ce0668103..742978631 100644
--- a/source/common/aarch64/asm.S
+++ b/source/common/aarch64/asm.S
@@ -72,6 +72,16 @@
#define PFX_C(name) JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
+// Stack slot size in bytes of an `int` stack argument: 8 per AAPCS64, 4 on Apple.
+#ifdef __APPLE__
+#define STACK_ARG_ALIGNMENT 4
+#else
+#define STACK_ARG_ALIGNMENT 8
+#endif
+
+// Byte offset from SP of stack argument `idx` (0-based); assumes equal-sized slots.
+#define STACK_ARG_OFFSET(idx) ((idx) * STACK_ARG_ALIGNMENT)
+
#ifdef __APPLE__
.macro endfunc
ELF .size \name, . - \name
@@ -184,4 +194,4 @@ ELF .size \name, . - \name
vtrn \t3, \t4, \s3, \s4
.endm
-#endif
\ No newline at end of file
+#endif
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 9b3c11504..378c6891c 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2311,7 +2311,7 @@ endfunc
// uint8_t *baseCtx, // x6
// int offset, // x7
// int scanPosSigOff, // sp
-// int subPosBase) // sp + 8
+// int subPosBase) // sp + 8, or sp + 4 on APPLE
function PFX(costCoeffNxN_neon)
// abs(coeff)
add x2, x2, x2
@@ -2410,7 +2410,7 @@ function PFX(costCoeffNxN_neon)
add x4, x4, x15
str h2, [x13] // absCoeff[numNonZero] = tmpCoeff[blkPos]
- ldr x9, [sp, #8] // subPosBase
+ ldr x9, [sp, #STACK_ARG_OFFSET(1)] // subPosBase
uxth w9, w9
cmp w9, #0
cset x2, eq
--
2.42.1
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
More information about the x265-devel
mailing list