[x265] [PATCH 01/12] AArch64: Fix costCoeffNxN test on Apple Silicon

Hari Limaye hari.limaye at arm.com
Thu May 2 21:19:36 UTC 2024


The assembly routine x265_costCoeffNxN_neon is buggy and produces an
incorrect result on Apple Silicon, causing the pixel testbench to fail
on these platforms.

x265_costCoeffNxN_neon assumes that parameter `int subPosBase`, the
second parameter of type `int` passed on the stack, is at position
`sp + 8`; this assumption is consistent with the AArch64 PCS, as
arguments smaller than 8 bytes are widened to 8 bytes (aapcs64 6.8.2
C.16). However, Apple platforms diverge from AAPCS64: 'Function
arguments may consume slots on the stack that are not multiples of 8
bytes'. On these platforms `subPosBase` is located at `sp + 4` instead,
so compute the offset with a new STACK_ARG_OFFSET macro that accounts
for the platform's stack argument alignment.
---
 source/common/aarch64/asm.S        | 12 +++++++++++-
 source/common/aarch64/pixel-util.S |  4 ++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
index ce0668103..742978631 100644
--- a/source/common/aarch64/asm.S
+++ b/source/common/aarch64/asm.S
@@ -72,6 +72,16 @@

 #define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)

+// Stride of stack arguments smaller than 8 bytes (4 on Apple, else 8).
+#ifdef __APPLE__
+#define STACK_ARG_ALIGNMENT 4
+#else
+#define STACK_ARG_ALIGNMENT 8
+#endif
+
+// SP offset of stack argument `idx` (0-based); assumes uniform-size slots.
+#define STACK_ARG_OFFSET(idx) ((idx) * STACK_ARG_ALIGNMENT)
+
 #ifdef __APPLE__
 .macro endfunc
 ELF .size \name, . - \name
@@ -184,4 +194,4 @@ ELF     .size   \name, . - \name
     vtrn            \t3, \t4, \s3, \s4
 .endm

-#endif
\ No newline at end of file
+#endif
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 9b3c11504..378c6891c 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2311,7 +2311,7 @@ endfunc
 //    uint8_t *baseCtx,      // x6
 //    int offset,            // x7
 //    int scanPosSigOff,     // sp
-//    int subPosBase)        // sp + 8
+//    int subPosBase)        // sp + 8, or sp + 4 on APPLE
 function PFX(costCoeffNxN_neon)
     // abs(coeff)
     add             x2, x2, x2
@@ -2410,7 +2410,7 @@ function PFX(costCoeffNxN_neon)
     add             x4, x4, x15
     str             h2, [x13]              // absCoeff[numNonZero] = tmpCoeff[blkPos]

-    ldr             x9, [sp, #8]           // subPosBase
+    ldr             x9, [sp, #STACK_ARG_OFFSET(1)]           // subPosBase
     uxth            w9, w9
     cmp             w9, #0
     cset            x2, eq
--
2.42.1

IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.


More information about the x265-devel mailing list