[x265] [PATCH] AArch64: Add SVE2_BitPerm implementation of scanPosLast
chen
chenm003 at 163.com
Sat Jun 21 04:27:57 UTC 2025
Thanks for the patch. It looks good to me.
At 2025-06-20 17:04:24, "George Steed" <george.steed at arm.com> wrote:
>Use the BEXT instruction to pack bits based on a bitmask; this avoids
>the need for the loop in the prior Arm implementations.
>
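
For readers unfamiliar with the extension: BEXT acts as a per-element
parallel bit extract, gathering the bits of the source element selected by
a mask and packing them at the bottom of the result, much like x86 PEXT.
A rough scalar model of a single 16-bit lane, for illustration only (the
name bext16 is invented here, not part of the patch):

    #include <stdint.h>

    static inline uint16_t bext16(uint16_t val, uint16_t mask)
    {
        uint16_t out = 0;
        for (unsigned i = 0, j = 0; i < 16; i++)
        {
            if (mask & (1u << i))
            {
                // Copy bit i of val to bit j of the packed result.
                if (val & (1u << i))
                    out |= (uint16_t)(1u << j);
                j++;
            }
        }
        return out;
    }
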
>There was an existing function declaration of scanPosLast_sve2; however,
>this function was never defined or used, so simply replace it with the
>new scanPosLast_sve2_bitperm declaration.
>
>Benchmarking on a Neoverse V2 machine supporting the SVE2_BitPerm
>extension, this implementation improves --preset=medium encoding speed
>by ~1.3%.
>
>Also take this opportunity to reorder the ARM_ASMS extension list in
>CMakeLists.txt to be in architecture order to match elsewhere.
>---
> source/CMakeLists.txt | 12 ++
> source/common/CMakeLists.txt | 8 +-
> source/common/aarch64/asm-primitives.cpp | 13 ++
> source/common/aarch64/fun-decls.h | 3 +-
> .../common/aarch64/pixel-util-sve2-bitperm.S | 125 ++++++++++++++++++
> 5 files changed, 157 insertions(+), 4 deletions(-)
> create mode 100644 source/common/aarch64/pixel-util-sve2-bitperm.S
>
>diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>index c4253b723..4160514b9 100755
>--- a/source/CMakeLists.txt
>+++ b/source/CMakeLists.txt
>@@ -835,6 +835,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
>+ if(CPU_HAS_SVE2_BITPERM)
>+ foreach(ASM ${ARM_ASMS_SVE2_BITPERM})
>+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
>+ list(APPEND ASM_SRCS ${ASM_SRC})
>+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
>+ add_custom_command(
>+ OUTPUT ${ASM}.${SUFFIX}
>+ COMMAND ${CMAKE_CXX_COMPILER}
>+ ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_BITPERM_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>+ DEPENDS ${ASM_SRC})
>+ endforeach()
>+ endif()
> elseif(X86)
> # compile X86 arch asm files here
> foreach(ASM ${MSVC_ASMS})
>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>index 405ec0b2d..37f3e462c 100644
>--- a/source/common/CMakeLists.txt
>+++ b/source/common/CMakeLists.txt
>@@ -116,12 +116,14 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
> set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
> set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
> set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
>+ set(A_SRCS_SVE2_BITPERM pixel-util-sve2-bitperm.S)
> set(VEC_PRIMITIVES)
>
>- set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
>- set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
>- set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
>+ set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "Arm Assembly Sources")
> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
>+ set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "Arm Assembly Sources that use the SVE extension")
>+ set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 extension")
>+ set(ARM_ASMS_SVE2_BITPERM "${A_SRCS_SVE2_BITPERM}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 BitPerm extension")
> foreach(SRC ${C_SRCS_NEON})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
>diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
>index 5ce9352bd..b3c89b370 100644
>--- a/source/common/aarch64/asm-primitives.cpp
>+++ b/source/common/aarch64/asm-primitives.cpp
>@@ -721,6 +721,13 @@ void setupSve2Primitives(EncoderPrimitives &p)
> }
> #endif // defined(HAVE_SVE2)
>
>+#if defined(HAVE_SVE2_BITPERM)
>+void setupSve2BitPermPrimitives(EncoderPrimitives &p)
>+{
>+ p.scanPosLast = PFX(scanPosLast_sve2_bitperm);
>+}
>+#endif // defined(HAVE_SVE2_BITPERM)
>+
> #ifdef HAVE_NEON_DOTPROD
> #if !HIGH_BIT_DEPTH
> void setupNeonDotProdPrimitives(EncoderPrimitives &p)
>@@ -771,6 +778,12 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
> setupSve2Primitives(p);
> }
> #endif
>+#ifdef HAVE_SVE2_BITPERM
>+ if (cpuMask & X265_CPU_SVE2_BITPERM)
>+ {
>+ setupSve2BitPermPrimitives(p);
>+ }
>+#endif
> }
>
> void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
>diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
>index 12383b573..56a434d34 100644
>--- a/source/common/aarch64/fun-decls.h
>+++ b/source/common/aarch64/fun-decls.h
>@@ -255,4 +255,5 @@ void PFX(ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel*
>
> int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
> void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>-int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
>+
>+int PFX(scanPosLast_sve2_bitperm)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
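
For context on what this routine must produce: per 4x4 coefficient group
(CG), scanPosLast packs a sign mask and a significance mask, counts the
nonzero coefficients, and returns the scan position of the last significant
coefficient. A simplified C++ sketch of the scalar logic, paraphrased from
the C reference rather than copied (the unused scanCG4x4/trSize parameters
are dropped, and coeff_t is assumed to be int16_t as in x265):

    #include <stdint.h>

    typedef int16_t coeff_t; // assumed to match x265's definition

    static int scanPosLast_ref(const uint16_t* scan, const coeff_t* coeff,
                               uint16_t* coeffSign, uint16_t* coeffFlag,
                               uint8_t* coeffNum, int numSig)
    {
        int pos = 0;
        do
        {
            const uint32_t cgIdx = (uint32_t)pos >> 4; // 16 coefficients per CG
            const uint32_t blkPos = scan[pos++];
            const int c = coeff[blkPos];
            const uint32_t isNZ = (c != 0);
            numSig -= (int)isNZ;
            // Pack the sign bit of each nonzero coefficient, LSB first.
            coeffSign[cgIdx] += (uint16_t)(((uint32_t)c >> 31) << coeffNum[cgIdx]);
            // Shift in one significance flag per scanned coefficient.
            coeffFlag[cgIdx] = (uint16_t)((coeffFlag[cgIdx] << 1) | isNZ);
            coeffNum[cgIdx] += (uint8_t)isNZ;
        }
        while (numSig > 0);
        return pos - 1;
    }
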
>diff --git a/source/common/aarch64/pixel-util-sve2-bitperm.S b/source/common/aarch64/pixel-util-sve2-bitperm.S
>new file mode 100644
>index 000000000..5d7828317
>--- /dev/null
>+++ b/source/common/aarch64/pixel-util-sve2-bitperm.S
>@@ -0,0 +1,125 @@
>+/*****************************************************************************
>+ * Copyright (C) 2025 MulticoreWare, Inc
>+ *
>+ * Authors: George Steed <george.steed at arm.com>
>+ *
>+ * This program is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>+ * GNU General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public License
>+ * along with this program; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
>+ *
>+ * This program is also available under a commercial proprietary license.
>+ * For more information, contact us at license @ x265.com.
>+ *****************************************************************************/
>+
>+#include "asm-sve.S"
>+#include "pixel-util-common.S"
>+
>+.arch armv8-a+sve2+sve2-bitperm
>+
>+#ifdef __APPLE__
>+.section __RODATA,__rodata
>+#else
>+.section .rodata
>+#endif
>+
>+.align 4
>+
>+.text
>+
>+// int scanPosLast(
>+// const uint16_t *scan, // x0
>+// const coeff_t *coeff, // x1
>+// uint16_t *coeffSign, // x2
>+// uint16_t *coeffFlag, // x3
>+// uint8_t *coeffNum, // x4
>+// int numSig, // x5
>+// const uint16_t* scanCG4x4, // x6
>+// const int trSize) // x7
>+function PFX(scanPosLast_sve2_bitperm)
>+ // Convert the trSize stride unit from elements (int16) to bytes.
>+ add x7, x7, x7
>+
>+ // Load scan table and convert to bytes.
>+ ldp q0, q1, [x6]
>+ uzp1 v0.16b, v0.16b, v1.16b // v0 - Zigzag scan table.
>+
>+ movrel x10, g_SPL_and_mask
>+ ldr q28, [x10] // v28 = mask for pmovmskb.
>+ add x10, x7, x7 // 2*x7
>+ add x11, x7, x7, lsl #1 // 3*x7
>+ add x9, x4, #1 // CG count
>+
>+1:
>+ // Position of current CG.
>+ ldrh w6, [x0], #32
>+ add x6, x1, x6, lsl #1
>+
>+ // Load the current CG and saturate to bytes.
>+ ldr d2, [x6]
>+ ldr d3, [x6, x7]
>+ ldr d4, [x6, x10]
>+ ldr d5, [x6, x11]
>+ mov v2.d[1], v3.d[0]
>+ mov v4.d[1], v5.d[0]
>+ sqxtn v2.8b, v2.8h
>+ sqxtn2 v2.16b, v4.8h
>+
>+ // Apply zigzag.
>+ tbl v3.16b, {v2.16b}, v0.16b
>+
>+ // Get zero/sign.
>+ cmeq v5.16b, v3.16b, #0 // v5 = zero
>+ cmlt v3.16b, v3.16b, #0 // v3 = negative
>+
>+ // val: v3.h[0] = pmovmskb(v3).
>+ // mask: v3.h[1] = pmovmskb(v4).
>+ and v3.16b, v3.16b, v28.16b
>+ bic v4.16b, v28.16b, v5.16b
>+ addp v3.16b, v3.16b, v4.16b
>+ addp v3.16b, v3.16b, v3.16b
>+ addp v3.16b, v3.16b, v3.16b
>+ fmov w15, s3
>+
>+ // coeffNum = addv(v3 != 0) = 16 - addv(v5).
>+ addv b5, v5.16b
>+ smov w6, v5.b[0]
>+ add w6, w6, #16
>+ sub x5, x5, x6
>+ strb w6, [x4], #1
>+
>+ // coeffFlag = reverse_bit(w15) in 16-bit.
>+ rbit w12, w15
>+ strh w12, [x3], #2
>+
>+ // Pack bits from z3.h[0] into z30.h[0], based on z3.h[1] mask.
>+ mov h31, v3.h[1]
>+ bext z30.h, z3.h, z31.h
>+ str h30, [x2], #2
>+
>+ cbnz x5, 1b
>+
>+ // Count trailing zeros in (reversed) coeffFlag.
>+ clz w13, w15
>+ lsr w12, w12, w13
>+ strh w12, [x3, #-2]
>+
>+ // Get last pos.
>+ sub x9, x4, x9
>+ eor w13, w13, #15
>+ add x0, x13, x9, lsl #4
>+ ret
>+endfunc
>+
>+const g_SPL_and_mask, align=8
>+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
>+endconst
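
One implementation note on the pmovmskb emulation above: AArch64 Neon has
no direct equivalent of x86 pmovmskb, so the code ANDs each 0x00/0xFF lane
with the repeating {0x01, 0x02, ..., 0x80} pattern from g_SPL_and_mask and
collapses the lanes with three pairwise adds; pairing v3 and v4 in the
first ADDP yields both 16-bit masks in a single register. A scalar model of
the single-mask case, again only for illustration:

    #include <stdint.h>

    static inline uint16_t movemask16(const uint8_t flags[16])
    {
        // flags[i] is 0x00 or 0xFF (a byte-wise comparison result).
        uint16_t mask = 0;
        for (unsigned i = 0; i < 16; i++)
            if (flags[i])
                mask |= (uint16_t)(1u << i); // set bit i when lane i is set
        return mask;
    }
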
>--
>2.43.0
>