[x265] [PATCH] AArch64: Add SVE2_BitPerm implementation of scanPosLast

Fri Jun 20 09:04:24 UTC 2025

Use the BEXT instruction to pack bits based on a bitmask; this avoids
the need for the loop in the prior Arm implementations.

There was an existing function declaration of scanPosLast_sve2; however,
this function was never defined or used, so simply replace it with the
new scanPosLast_sve2_bitperm declaration.

Benchmarking on a Neoverse V2 machine supporting the SVE2_BitPerm
extension, this implementation improves --preset=medium encoding speed
by ~1.3%.

Also take this opportunity to reorder the ARM_ASMS extension list in
CMakeLists.txt to be in architecture order to match elsewhere.
---
 source/CMakeLists.txt                         |  12 ++
 source/common/CMakeLists.txt                  |   8 +-
 source/common/aarch64/asm-primitives.cpp      |  13 ++
 source/common/aarch64/fun-decls.h             |   3 +-
 .../common/aarch64/pixel-util-sve2-bitperm.S  | 125 ++++++++++++++++++
 5 files changed, 157 insertions(+), 4 deletions(-)
 create mode 100644 source/common/aarch64/pixel-util-sve2-bitperm.S

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index c4253b723..4160514b9 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -835,6 +835,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
+        if(CPU_HAS_SVE2_BITPERM)  # One compile rule per SVE2 BitPerm assembly source.
+            foreach(ASM ${ARM_ASMS_SVE2_BITPERM})
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+                list(APPEND ASM_SRCS ${ASM_SRC})
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+                add_custom_command(
+                    OUTPUT ${ASM}.${SUFFIX}
+                    COMMAND ${CMAKE_CXX_COMPILER}
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_BITPERM_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                    DEPENDS ${ASM_SRC})
+            endforeach()
+        endif()
     elseif(X86)
     # compile X86 arch asm files here
         foreach(ASM ${MSVC_ASMS})
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 405ec0b2d..37f3e462c 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -116,12 +116,14 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
     set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
+    set(A_SRCS_SVE2_BITPERM pixel-util-sve2-bitperm.S)
     set(VEC_PRIMITIVES)
 
-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
-    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "Arm Assembly Sources")
     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "Arm Assembly Sources that use the SVE extension")
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 extension")
+    set(ARM_ASMS_SVE2_BITPERM "${A_SRCS_SVE2_BITPERM}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 BitPerm extension")
     foreach(SRC ${C_SRCS_NEON})
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
         set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 5ce9352bd..b3c89b370 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -721,6 +721,13 @@ void setupSve2Primitives(EncoderPrimitives &p)
 }
 #endif // defined(HAVE_SVE2)
 
+#if defined(HAVE_SVE2_BITPERM)
+void setupSve2BitPermPrimitives(EncoderPrimitives &p)
+{
+    p.scanPosLast = PFX(scanPosLast_sve2_bitperm); // Point scanPosLast at the SVE2 BitPerm assembly routine.
+}
+#endif // defined(HAVE_SVE2_BITPERM)
+
 #ifdef HAVE_NEON_DOTPROD
 #if !HIGH_BIT_DEPTH
 void setupNeonDotProdPrimitives(EncoderPrimitives &p)
@@ -771,6 +778,12 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         setupSve2Primitives(p);
     }
 #endif
+#ifdef HAVE_SVE2_BITPERM
+    if (cpuMask & X265_CPU_SVE2_BITPERM)
+    {
+        setupSve2BitPermPrimitives(p);
+    }
+#endif
 }
 
 void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 12383b573..56a434d34 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -255,4 +255,5 @@ void PFX(ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel*
 
 int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
+
+int PFX(scanPosLast_sve2_bitperm)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
diff --git a/source/common/aarch64/pixel-util-sve2-bitperm.S b/source/common/aarch64/pixel-util-sve2-bitperm.S
new file mode 100644
index 000000000..5d7828317
--- /dev/null
+++ b/source/common/aarch64/pixel-util-sve2-bitperm.S
@@ -0,0 +1,125 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: George Steed <george.steed at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve2+sve2-bitperm
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// int scanPosLast(
+//     const uint16_t *scan,      // x0
+//     const coeff_t *coeff,      // x1
+//     uint16_t *coeffSign,       // x2
+//     uint16_t *coeffFlag,       // x3
+//     uint8_t *coeffNum,         // x4
+//     int numSig,                // x5
+//     const uint16_t* scanCG4x4, // x6
+//     const int trSize)          // x7
+function PFX(scanPosLast_sve2_bitperm)
+    // Convert the trSize stride from int16 elements to bytes (x7 *= 2).
+    add             x7, x7, x7
+
+    // Load the 16 uint16 entries of scanCG4x4 and narrow them to bytes.
+    ldp             q0, q1, [x6]
+    uzp1            v0.16b, v0.16b, v1.16b  // v0 - Zigzag scan table.
+
+    movrel          x10, g_SPL_and_mask
+    ldr             q28, [x10]              // v28 = per-byte bit masks used to emulate pmovmskb.
+    add             x10, x7, x7             // x10 = 2 * row stride in bytes.
+    add             x11, x7, x7, lsl #1     // x11 = 3 * row stride in bytes.
+    add             x9, x4, #1              // x9 = initial coeffNum + 1, used at exit to derive the CG count.
+
+1:
+    // x6 = address of the current CG (coeff + scan[0]); scan advances by 16 entries.
+    ldrh            w6, [x0], #32
+    add             x6, x1, x6, lsl #1
+
+    // Load the four 4-coefficient rows of the current CG and saturate to bytes.
+    ldr             d2, [x6]
+    ldr             d3, [x6, x7]
+    ldr             d4, [x6, x10]
+    ldr             d5, [x6, x11]
+    mov             v2.d[1], v3.d[0]
+    mov             v4.d[1], v5.d[0]
+    sqxtn           v2.8b, v2.8h
+    sqxtn2          v2.16b, v4.8h
+
+    // Reorder the 16 bytes into scan order using the table in v0.
+    tbl             v3.16b, {v2.16b}, v0.16b
+
+    // Get zero/sign byte masks (0xff where the condition holds).
+    cmeq            v5.16b, v3.16b, #0   // v5 = zero
+    cmlt            v3.16b, v3.16b, #0   // v3 = negative
+
+    //  val: v3.h[0] = pmovmskb(v3), one sign bit per coefficient.
+    // mask: v3.h[1] = pmovmskb(v4), one non-zero bit per coefficient.
+    and             v3.16b, v3.16b, v28.16b
+    bic             v4.16b, v28.16b, v5.16b
+    addp            v3.16b, v3.16b, v4.16b
+    addp            v3.16b, v3.16b, v3.16b
+    addp            v3.16b, v3.16b, v3.16b
+    fmov            w15, s3
+
+    // coeffNum = count of non-zero coeffs = 16 + addv(v5), as zero lanes of v5 are -1.
+    addv            b5, v5.16b
+    smov            w6, v5.b[0]
+    add             w6, w6, #16
+    sub             x5, x5, x6
+    strb            w6, [x4], #1
+
+    // coeffFlag = bit-reversed non-zero mask (w15's upper half lands in the low 16 bits).
+    rbit            w12, w15
+    strh            w12, [x3], #2
+
+    // Pack bits from z3.h[0] into z30.h[0], based on z3.h[1] mask.
+    mov             h31, v3.h[1]
+    bext            z30.h, z3.h, z31.h
+    str             h30, [x2], #2
+
+    cbnz            x5, 1b               // Loop until the remaining numSig count reaches zero.
+
+    // Count trailing zeros in the last CG's (reversed) coeffFlag and shift them out.
+    clz             w13, w15
+    lsr             w12, w12, w13
+    strh            w12, [x3, #-2]
+
+    // Last pos = 16 * (CG count - 1) + (w13 ^ 15), i.e. 15 - w13 for w13 in [0, 15].
+    sub             x9, x4, x9
+    eor             w13, w13, #15
+    add             x0, x13, x9, lsl #4
+    ret
+endfunc
+
+const g_SPL_and_mask, align=8
+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+endconst
-- 
2.43.0

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Add-SVE2_BitPerm-implementation-of-scanPosLa.patch
Type: text/x-diff
Size: 10160 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250620/edba76d9/attachment.patch>