[x265] [PATCH] AArch64: Add SVE2_BitPerm implementation of scanPosLast

chen chenm003 at 163.com
Sat Jun 21 04:27:57 UTC 2025


Thanks for the patch, it looks good to me.

At 2025-06-20 17:04:24, "George Steed" <george.steed at arm.com> wrote:
>Use the BEXT instruction to pack bits based on a bitmask; this avoids
>the need for the loop in the prior Arm implementations.
>
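For readers unfamiliar with BEXT: per element, it gathers the bits of the
source that are selected by a mask into contiguous low-order bits of the
result, much like x86 PEXT. A minimal scalar sketch of the 16-bit
per-element semantics, with illustrative names that are not from the patch:

    #include <stdint.h>

    /* Scalar sketch of SVE2 BEXT per-element semantics (similar to x86
     * PEXT): gather the bits of `val` selected by `mask` into contiguous
     * low-order bits, zeroing the rest. The vector instruction does this
     * for every element in parallel. */
    static inline uint16_t bext16(uint16_t val, uint16_t mask)
    {
        uint16_t result = 0;
        unsigned out = 0;

        for (unsigned bit = 0; bit < 16; bit++)
        {
            if (mask & (1u << bit))
            {
                result |= (uint16_t)(((val >> bit) & 1u) << out);
                out++;
            }
        }
        return result;
    }

In this kernel the value operand is the per-CG sign movemask and the mask
operand is the significance movemask, so a single BEXT yields the packed
sign bits of just the significant coefficients.
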
>There was an existing declaration of scanPosLast_sve2, but the function
>was never defined or used, so simply replace it with the new
>scanPosLast_sve2_bitperm declaration.
>
>Benchmarked on a Neoverse V2 machine supporting the SVE2_BitPerm
>extension, this implementation improves --preset=medium encoding speed
>by ~1.3%.
>
>Also take this opportunity to reorder the ARM_ASMS extension list in
>CMakeLists.txt to be in architecture order to match elsewhere.
>---
> source/CMakeLists.txt                         |  12 ++
> source/common/CMakeLists.txt                  |   8 +-
> source/common/aarch64/asm-primitives.cpp      |  13 ++
> source/common/aarch64/fun-decls.h             |   3 +-
> .../common/aarch64/pixel-util-sve2-bitperm.S  | 125 ++++++++++++++++++
> 5 files changed, 157 insertions(+), 4 deletions(-)
> create mode 100644 source/common/aarch64/pixel-util-sve2-bitperm.S
>
>diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>index c4253b723..4160514b9 100755
>--- a/source/CMakeLists.txt
>+++ b/source/CMakeLists.txt
>@@ -835,6 +835,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                     DEPENDS ${ASM_SRC})
>             endforeach()
>         endif()
>+        if(CPU_HAS_SVE2_BITPERM)
>+            foreach(ASM ${ARM_ASMS_SVE2_BITPERM})
>+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
>+                list(APPEND ASM_SRCS ${ASM_SRC})
>+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
>+                add_custom_command(
>+                    OUTPUT ${ASM}.${SUFFIX}
>+                    COMMAND ${CMAKE_CXX_COMPILER}
>+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_BITPERM_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>+                    DEPENDS ${ASM_SRC})
>+            endforeach()
>+        endif()
>     elseif(X86)
>     # compile X86 arch asm files here
>         foreach(ASM ${MSVC_ASMS})
>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>index 405ec0b2d..37f3e462c 100644
>--- a/source/common/CMakeLists.txt
>+++ b/source/common/CMakeLists.txt
>@@ -116,12 +116,14 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
>     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
>     set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
>     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
>+    set(A_SRCS_SVE2_BITPERM pixel-util-sve2-bitperm.S)
>     set(VEC_PRIMITIVES)
> 
>-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
>-    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
>-    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
>+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "Arm Assembly Sources")
>     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
>+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "Arm Assembly Sources that use the SVE extension")
>+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 extension")
>+    set(ARM_ASMS_SVE2_BITPERM "${A_SRCS_SVE2_BITPERM}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 BitPerm extension")
>     foreach(SRC ${C_SRCS_NEON})
>         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>         set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
>diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
>index 5ce9352bd..b3c89b370 100644
>--- a/source/common/aarch64/asm-primitives.cpp
>+++ b/source/common/aarch64/asm-primitives.cpp
>@@ -721,6 +721,13 @@ void setupSve2Primitives(EncoderPrimitives &p)
> }
> #endif // defined(HAVE_SVE2)
> 
>+#if defined(HAVE_SVE2_BITPERM)
>+void setupSve2BitPermPrimitives(EncoderPrimitives &p)
>+{
>+    p.scanPosLast = PFX(scanPosLast_sve2_bitperm);
>+}
>+#endif // defined(HAVE_SVE2_BITPERM)
>+
> #ifdef HAVE_NEON_DOTPROD
> #if !HIGH_BIT_DEPTH
> void setupNeonDotProdPrimitives(EncoderPrimitives &p)
>@@ -771,6 +778,12 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
>         setupSve2Primitives(p);
>     }
> #endif
>+#ifdef HAVE_SVE2_BITPERM
>+    if (cpuMask & X265_CPU_SVE2_BITPERM)
>+    {
>+        setupSve2BitPermPrimitives(p);
>+    }
>+#endif
> }
> 
> void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
>diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
>index 12383b573..56a434d34 100644
>--- a/source/common/aarch64/fun-decls.h
>+++ b/source/common/aarch64/fun-decls.h
>@@ -255,4 +255,5 @@ void PFX(ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel*
> 
> int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
> void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>-int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
>+
>+int PFX(scanPosLast_sve2_bitperm)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
>diff --git a/source/common/aarch64/pixel-util-sve2-bitperm.S b/source/common/aarch64/pixel-util-sve2-bitperm.S
>new file mode 100644
>index 000000000..5d7828317
>--- /dev/null
>+++ b/source/common/aarch64/pixel-util-sve2-bitperm.S
>@@ -0,0 +1,125 @@
>+/*****************************************************************************
>+ * Copyright (C) 2025 MulticoreWare, Inc
>+ *
>+ * Authors: George Steed <george.steed at arm.com>
>+ *
>+ * This program is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>+ * GNU General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public License
>+ * along with this program; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
>+ *
>+ * This program is also available under a commercial proprietary license.
>+ * For more information, contact us at license @ x265.com.
>+ *****************************************************************************/
>+
>+#include "asm-sve.S"
>+#include "pixel-util-common.S"
>+
>+.arch armv8-a+sve2+sve2-bitperm
>+
>+#ifdef __APPLE__
>+.section __RODATA,__rodata
>+#else
>+.section .rodata
>+#endif
>+
>+.align 4
>+
>+.text
>+
>+// int scanPosLast(
>+//     const uint16_t *scan,      // x0
>+//     const coeff_t *coeff,      // x1
>+//     uint16_t *coeffSign,       // x2
>+//     uint16_t *coeffFlag,       // x3
>+//     uint8_t *coeffNum,         // x4
>+//     int numSig,                // x5
>+//     const uint16_t* scanCG4x4, // x6
>+//     const int trSize)          // x7
>+function PFX(scanPosLast_sve2_bitperm)
>+    // Convert unit of trSize stride from elements (int16) to bytes.
>+    add             x7, x7, x7
>+
>+    // Load scan table and convert to bytes.
>+    ldp             q0, q1, [x6]
>+    uzp1            v0.16b, v0.16b, v1.16b  // v0 - Zigzag scan table.
>+
>+    movrel          x10, g_SPL_and_mask
>+    ldr             q28, [x10]              // v28 = mask for pmovmskb.
>+    add             x10, x7, x7             // 2*x7
>+    add             x11, x7, x7, lsl #1     // 3*x7
>+    add             x9, x4, #1              // CG count
>+
>+1:
>+    // Position of current CG.
>+    ldrh            w6, [x0], #32
>+    add             x6, x1, x6, lsl #1
>+
>+    // Load the current CG and saturate to bytes.
>+    ldr             d2, [x6]
>+    ldr             d3, [x6, x7]
>+    ldr             d4, [x6, x10]
>+    ldr             d5, [x6, x11]
>+    mov             v2.d[1], v3.d[0]
>+    mov             v4.d[1], v5.d[0]
>+    sqxtn           v2.8b, v2.8h
>+    sqxtn2          v2.16b, v4.8h
>+
>+    // Apply zigzag.
>+    tbl             v3.16b, {v2.16b}, v0.16b
>+
>+    // Get zero/sign.
>+    cmeq            v5.16b, v3.16b, #0   // v5 = zero
>+    cmlt            v3.16b, v3.16b, #0   // v3 = negative
>+
>+    //  val: v3.h[0] = pmovmskb(v3).
>+    // mask: v3.h[1] = pmovmskb(v4).
>+    and             v3.16b, v3.16b, v28.16b
>+    bic             v4.16b, v28.16b, v5.16b
>+    addp            v3.16b, v3.16b, v4.16b
>+    addp            v3.16b, v3.16b, v3.16b
>+    addp            v3.16b, v3.16b, v3.16b
>+    fmov            w15, s3
>+
>+    // coeffNum = addv(v3 != 0) = 16 - addv(v5).
>+    addv            b5, v5.16b
>+    smov            w6, v5.b[0]
>+    add             w6, w6, #16
>+    sub             x5, x5, x6
>+    strb            w6, [x4], #1
>+
>+    // coeffFlag = reverse_bit(w15) in 16-bit.
>+    rbit            w12, w15
>+    strh            w12, [x3], #2
>+
>+    // Pack bits from z3.h[0] into z30.h[0], based on z3.h[1] mask.
>+    mov             h31, v3.h[1]
>+    bext            z30.h, z3.h, z31.h
>+    str             h30, [x2], #2
>+
>+    cbnz            x5, 1b
>+
>+    // Count trailing zeros in (reversed) coeffFlag.
>+    clz             w13, w15
>+    lsr             w12, w12, w13
>+    strh            w12, [x3, #-2]
>+
>+    // Get last pos.
>+    sub             x9, x4, x9
>+    eor             w13, w13, #15
>+    add             x0, x13, x9, lsl #4
>+    ret
>+endfunc
>+
>+const g_SPL_and_mask, align=8
>+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
>+endconst
>-- 
>2.43.0
>
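For anyone comparing the assembly against the scalar code, here is a rough
C paraphrase of the scanPosLast reference primitive it implements. It
assumes 16 coefficients per coefficient group (CG) and output arrays zeroed
by the caller, and it omits scanCG4x4/trSize, which the vector code needs
only for its strided loads; the function name is illustrative:

    #include <stdint.h>

    typedef int16_t coeff_t; /* coefficient type, as in x265 */

    static int scanPosLast_ref(const uint16_t *scan, const coeff_t *coeff,
                               uint16_t *coeffSign, uint16_t *coeffFlag,
                               uint8_t *coeffNum, int numSig)
    {
        int scanPosLast = 0;
        do
        {
            const uint32_t cgIdx = (uint32_t)scanPosLast >> 4; /* 16 coeffs per CG */
            const uint32_t posLast = scan[scanPosLast++];
            const coeff_t c = coeff[posLast];
            const uint32_t isNZ = (c != 0);

            numSig -= (int)isNZ;
            /* Pack one sign bit per *significant* coeff; this is what the
             * single BEXT computes per CG in the assembly. */
            coeffSign[cgIdx] = (uint16_t)(coeffSign[cgIdx] +
                                          (((uint32_t)(c < 0)) << coeffNum[cgIdx]));
            /* One significance flag bit per scan position. */
            coeffFlag[cgIdx] = (uint16_t)((coeffFlag[cgIdx] << 1) + isNZ);
            coeffNum[cgIdx] = (uint8_t)(coeffNum[cgIdx] + isNZ);
        }
        while (numSig > 0);

        return scanPosLast - 1; /* scan index of the last significant coeff */
    }

Unlike this scalar loop, the assembly always consumes full groups of 16
coefficients, which is why it ends with the clz/lsr fixup on the final
coeffFlag value before computing the return value.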