[x265] [PATCH] AArch64: Add SVE2_BitPerm implementation of scanPosLast
George Steed
george.steed at arm.com
Fri Jun 20 09:04:24 UTC 2025
Use the BEXT instruction to pack bits based on a bitmask; this avoids
the need for the loop in the prior Arm implementations.
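For context, BEXT is the SVE2 bit-permute analogue of a parallel bit
extract (PEXT): within each element it gathers the bits of the first
source operand selected by the mask in the second source operand and
packs them contiguously into the low bits of the result. A rough scalar
sketch of the per-element semantics, for illustration only (the helper
name is made up and is not part of the patch):

    #include <stdint.h>

    // Pack the bits of 'val' selected by 'mask' into the low bits of
    // the result, one 16-bit element at a time (same idea as x86 PEXT).
    static uint16_t bext16(uint16_t val, uint16_t mask)
    {
        uint16_t res = 0;
        for (unsigned i = 0, j = 0; i < 16; i++)
            if (mask & (1u << i))
                res |= (uint16_t)(((val >> i) & 1u) << j++);
        return res;
    }

In this kernel the value operand holds the per-coefficient sign bits and
the mask holds the non-zero flags, so a single BEXT produces the packed
coeffSign word for the coefficient group.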
There was an existing declaration of scanPosLast_sve2; however, this
function was never defined or used, so simply replace it with the new
scanPosLast_sve2_bitperm declaration.
When benchmarked on a Neoverse V2 machine supporting the SVE2_BitPerm
extension, this implementation improves --preset=medium encoding speed
by ~1.3%.
Also take this opportunity to reorder the ARM_ASMS extension lists in
common/CMakeLists.txt into architecture-extension order, matching the
ordering used elsewhere.
---
source/CMakeLists.txt | 12 ++
source/common/CMakeLists.txt | 8 +-
source/common/aarch64/asm-primitives.cpp | 13 ++
source/common/aarch64/fun-decls.h | 3 +-
.../common/aarch64/pixel-util-sve2-bitperm.S | 125 ++++++++++++++++++
5 files changed, 157 insertions(+), 4 deletions(-)
create mode 100644 source/common/aarch64/pixel-util-sve2-bitperm.S
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index c4253b723..4160514b9 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -835,6 +835,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
DEPENDS ${ASM_SRC})
endforeach()
endif()
+ if(CPU_HAS_SVE2_BITPERM)
+ foreach(ASM ${ARM_ASMS_SVE2_BITPERM})
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+ list(APPEND ASM_SRCS ${ASM_SRC})
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+ add_custom_command(
+ OUTPUT ${ASM}.${SUFFIX}
+ COMMAND ${CMAKE_CXX_COMPILER}
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_BITPERM_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+ DEPENDS ${ASM_SRC})
+ endforeach()
+ endif()
elseif(X86)
# compile X86 arch asm files here
foreach(ASM ${MSVC_ASMS})
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 405ec0b2d..37f3e462c 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -116,12 +116,14 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
+ set(A_SRCS_SVE2_BITPERM pixel-util-sve2-bitperm.S)
set(VEC_PRIMITIVES)
- set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
- set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
- set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+ set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "Arm Assembly Sources")
set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
+ set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "Arm Assembly Sources that use the SVE extension")
+ set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 extension")
+ set(ARM_ASMS_SVE2_BITPERM "${A_SRCS_SVE2_BITPERM}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 BitPerm extension")
foreach(SRC ${C_SRCS_NEON})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 5ce9352bd..b3c89b370 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -721,6 +721,13 @@ void setupSve2Primitives(EncoderPrimitives &p)
}
#endif // defined(HAVE_SVE2)
+#if defined(HAVE_SVE2_BITPERM)
+void setupSve2BitPermPrimitives(EncoderPrimitives &p)
+{
+ p.scanPosLast = PFX(scanPosLast_sve2_bitperm);
+}
+#endif // defined(HAVE_SVE2_BITPERM)
+
#ifdef HAVE_NEON_DOTPROD
#if !HIGH_BIT_DEPTH
void setupNeonDotProdPrimitives(EncoderPrimitives &p)
@@ -771,6 +778,12 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
setupSve2Primitives(p);
}
#endif
+#ifdef HAVE_SVE2_BITPERM
+ if (cpuMask & X265_CPU_SVE2_BITPERM)
+ {
+ setupSve2BitPermPrimitives(p);
+ }
+#endif
}
void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 12383b573..56a434d34 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -255,4 +255,5 @@ void PFX(ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel*
int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
+
+int PFX(scanPosLast_sve2_bitperm)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
diff --git a/source/common/aarch64/pixel-util-sve2-bitperm.S b/source/common/aarch64/pixel-util-sve2-bitperm.S
new file mode 100644
index 000000000..5d7828317
--- /dev/null
+++ b/source/common/aarch64/pixel-util-sve2-bitperm.S
@@ -0,0 +1,125 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: George Steed <george.steed at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "pixel-util-common.S"
+
+.arch armv8-a+sve2+sve2-bitperm
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// int scanPosLast(
+// const uint16_t *scan, // x0
+// const coeff_t *coeff, // x1
+// uint16_t *coeffSign, // x2
+// uint16_t *coeffFlag, // x3
+// uint8_t *coeffNum, // x4
+// int numSig, // x5
+// const uint16_t* scanCG4x4, // x6
+// const int trSize) // x7
+function PFX(scanPosLast_sve2_bitperm)
+ // Convert unit of trSize stride from elements (int16) to bytes.
+ add x7, x7, x7
+
+ // Load scan table and convert to bytes.
+ ldp q0, q1, [x6]
+ uzp1 v0.16b, v0.16b, v1.16b // v0 - Zigzag scan table.
+
+ movrel x10, g_SPL_and_mask
+ ldr q28, [x10] // v28 = mask for pmovmskb.
+ add x10, x7, x7 // 2*x7
+ add x11, x7, x7, lsl #1 // 3*x7
+ add x9, x4, #1 // CG count
+
+1:
+ // Position of current CG.
+ ldrh w6, [x0], #32
+ add x6, x1, x6, lsl #1
+
+ // Loading current CG and saturate to bytes.
+ ldr d2, [x6]
+ ldr d3, [x6, x7]
+ ldr d4, [x6, x10]
+ ldr d5, [x6, x11]
+ mov v2.d[1], v3.d[0]
+ mov v4.d[1], v5.d[0]
+ sqxtn v2.8b, v2.8h
+ sqxtn2 v2.16b, v4.8h
+
+ // Apply zigzag.
+ tbl v3.16b, {v2.16b}, v0.16b
+
+ // Get zero/sign.
+ cmeq v5.16b, v3.16b, #0 // v5 = zero
+ cmlt v3.16b, v3.16b, #0 // v3 = negative
+
+ // val: v3.h[0] = pmovmskb(v3).
+ // mask: v3.h[1] = pmovmskb(v4).
+ and v3.16b, v3.16b, v28.16b
+ bic v4.16b, v28.16b, v5.16b
+ addp v3.16b, v3.16b, v4.16b
+ addp v3.16b, v3.16b, v3.16b
+ addp v3.16b, v3.16b, v3.16b
+ fmov w15, s3
+
+ // coeffNum = addv(v3 != 0) = 16 - addv(v5).
+ addv b5, v5.16b
+ smov w6, v5.b[0]
+ add w6, w6, #16
+ sub x5, x5, x6
+ strb w6, [x4], #1
+
+ // coeffFlag = reverse_bit(w15) in 16-bit.
+ rbit w12, w15
+ strh w12, [x3], #2
+
+ // Pack bits from z3.h[0] into z30.h[0], based on z3.h[1] mask.
+ mov h31, v3.h[1]
+ bext z30.h, z3.h, z31.h
+ str h30, [x2], #2
+
+ cbnz x5, 1b
+
+ // Count trailing zeros in (reversed) coeffFlag.
+ clz w13, w15
+ lsr w12, w12, w13
+ strh w12, [x3, #-2]
+
+ // Get last pos.
+ sub x9, x4, x9
+ eor w13, w13, #15
+ add x0, x13, x9, lsl #4
+ ret
+endfunc
+
+const g_SPL_and_mask, align=8
+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+endconst
--
2.43.0