[x265] [PATCH v2 2/6] AArch64: Add run-time CPU feature detection
Hari Limaye
hari.limaye at arm.com
Wed Nov 6 15:35:36 UTC 2024
Add run-time CPU feature detection for AArch64 ISA extensions on Linux,
enabled by the CMake option `AARCH64_RUNTIME_CPU_DETECT`. This option is
enabled by default - for platforms with no detection method implemented
we will fall back to compile-time CPU feature detection.
Also add logic to the testbench to handle the case where the --cpuid
parameter requests features that feature detection did not report, so
that it fails gracefully instead of crashing with SIGILL.
---
source/CMakeLists.txt | 55 ++++++++++------
source/common/CMakeLists.txt | 18 ++++--
source/common/aarch64/cpu.h | 120 +++++++++++++++++++++++++++++++++++
source/common/cpu.cpp | 17 +----
source/common/param.cpp | 2 +
source/test/testbench.cpp | 6 ++
6 files changed, 177 insertions(+), 41 deletions(-)
create mode 100644 source/common/aarch64/cpu.h
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 908980675..f177c4522 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -87,12 +87,28 @@ elseif(ARM64MATCH GREATER "-1")
option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF)
+ option(AARCH64_RUNTIME_CPU_DETECT "Enable AArch64 run-time CPU feature detection" ON)
+ if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+ set(AARCH64_RUNTIME_CPU_DETECT OFF CACHE BOOL "" FORCE)
+ message(STATUS "Run-time CPU feature detection unsupported on this platform")
+ endif()
+
# Options for manually enabling/disabling AArch64 SIMD extensions.
option(ENABLE_NEON "Enable Neon" ON)
option(ENABLE_NEON_DOTPROD "Enable Neon DotProd" ON)
option(ENABLE_NEON_I8MM "Enable Neon I8MM" ON)
option(ENABLE_SVE "Enable SVE" ON)
option(ENABLE_SVE2 "Enable SVE2" ON)
+
+ # Compiler flags for AArch64 extensions.
+ set(AARCH64_NEON_FLAG "-march=armv8-a")
+ # Neon DotProd is mandatory from Armv8.4.
+ set(AARCH64_NEON_DOTPROD_FLAG "-march=armv8.2-a+dotprod")
+ # Neon I8MM is mandatory from Armv8.6.
+ set(AARCH64_NEON_I8MM_FLAG "-march=armv8.2-a+dotprod+i8mm")
+ set(AARCH64_SVE_FLAG "-march=armv8.2-a+dotprod+i8mm+sve")
+ # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod and +sve.
+ set(AARCH64_SVE2_FLAG "-march=armv9-a+i8mm+sve2")
else()
message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -268,7 +284,13 @@ if(GCC)
set(CPU_HAS_NEON 1)
add_definitions(-DX265_ARCH_ARM64=1)
- if(CROSS_COMPILE_ARM64)
+ if (AARCH64_RUNTIME_CPU_DETECT)
+ add_definitions(-DAARCH64_RUNTIME_CPU_DETECT=1)
+ message(STATUS "Configuring build for run-time CPU feature detection")
+ endif()
+
+ if(AARCH64_RUNTIME_CPU_DETECT OR CROSS_COMPILE_ARM64)
+ # Add all extensions when compiling for run-time CPU feature detection or cross compiling.
set(CPU_HAS_NEON_DOTPROD 1)
set(CPU_HAS_NEON_I8MM 1)
set(CPU_HAS_SVE 1)
@@ -280,7 +302,7 @@ if(GCC)
find_package(SVE)
find_package(SVE2)
else()
- message(STATUS "Compile time feature detection unsupported on this platform")
+ message(STATUS "Compile-time CPU feature detection unsupported on this platform")
endif()
endif()
@@ -312,33 +334,25 @@ if(GCC)
if(CPU_HAS_NEON)
message(STATUS "Found Neon")
- set(ARM_ARGS -O3 -march=armv8-a)
add_definitions(-DHAVE_NEON=1)
endif()
if(CPU_HAS_NEON_DOTPROD)
- # Neon DotProd is mandatory from Armv8.4.
message(STATUS "Found Neon DotProd")
- set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
add_definitions(-DHAVE_NEON_DOTPROD=1)
endif()
if(CPU_HAS_NEON_I8MM)
- # Neon I8MM is mandatory from Armv8.6.
message(STATUS "Found Neon I8MM")
- set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
add_definitions(-DHAVE_NEON_I8MM=1)
endif()
if(CPU_HAS_SVE)
message(STATUS "Found SVE")
- set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
add_definitions(-DHAVE_SVE=1)
endif()
if(CPU_HAS_SVE2)
message(STATUS "Found SVE2")
- # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
- set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
add_definitions(-DHAVE_SVE2=1)
endif()
- set(ARM_ARGS ${ARM_ARGS} -fPIC)
+ set(ARM_ARGS -O3 -fPIC)
# Do not allow implicit vector type conversions in Clang builds (this
# is already the default in GCC builds).
check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE)
@@ -356,7 +370,8 @@ int main() { return 0; }")
set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
# CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas
# ARM_ARGS is defined and used elsewhere as a ;-list.
- foreach(ARM_ARG ${ARM_ARGS})
+ # Add `-march=...+sve` so the test functions correctly with Clang.
+ foreach(ARM_ARG ${ARM_ARGS} ${AARCH64_SVE_FLAG})
string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}")
endforeach()
check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED)
@@ -705,18 +720,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
- if(CPU_HAS_SVE2)
- foreach(ASM ${ARM_ASMS_SVE2})
+ if(CPU_HAS_NEON_DOTPROD)
+ foreach(ASM ${ARM_ASMS_NEON_DOTPROD})
set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
list(APPEND ASM_SRCS ${ASM_SRC})
list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_DOTPROD_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
endif()
@@ -728,19 +743,19 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
endif()
- if(CPU_HAS_NEON_DOTPROD)
- foreach(ASM ${ARM_ASMS_NEON_DOTPROD})
+ if(CPU_HAS_SVE2)
+ foreach(ASM ${ARM_ASMS_SVE2})
set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
list(APPEND ASM_SRCS ${ASM_SRC})
list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
endif()
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index dc4a74107..aacc0ef62 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -103,6 +103,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
add_definitions(-DAUTO_VECTORIZE=1)
endif()
+ # Add Arm intrinsics files here.
set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
@@ -110,11 +111,11 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(C_SRCS_SVE2 sao-prim-sve2.cpp)
enable_language(ASM)
- # add ARM assembly/intrinsic files here
+ # Add Arm assembly files here.
set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
+ set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
- set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
@@ -123,29 +124,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
foreach(SRC ${C_SRCS_NEON})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
endforeach()
- if(CPU_HAS_NEON_I8MM)
- foreach(SRC ${C_SRCS_NEON_I8MM})
+ if(CPU_HAS_NEON_DOTPROD)
+ foreach(SRC ${C_SRCS_NEON_DOTPROD})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_DOTPROD_FLAG})
endforeach()
endif()
- if(CPU_HAS_NEON_DOTPROD)
- foreach(SRC ${C_SRCS_NEON_DOTPROD})
+ if(CPU_HAS_NEON_I8MM)
+ foreach(SRC ${C_SRCS_NEON_I8MM})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_I8MM_FLAG})
endforeach()
endif()
if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
foreach(SRC ${C_SRCS_SVE})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE_FLAG})
endforeach()
endif()
if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
foreach(SRC ${C_SRCS_SVE2})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE2_FLAG})
endforeach()
endif()
diff --git a/source/common/aarch64/cpu.h b/source/common/aarch64/cpu.h
new file mode 100644
index 000000000..88ce2e310
--- /dev/null
+++ b/source/common/aarch64/cpu.h
@@ -0,0 +1,120 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_COMMON_AARCH64_CPU_H
+#define X265_COMMON_AARCH64_CPU_H
+
+#include "x265.h"
+
+#if AARCH64_RUNTIME_CPU_DETECT
+
+#if defined(__linux__)
+
+#include <sys/auxv.h>
+
+#define X265_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define X265_AARCH64_HWCAP_SVE (1 << 22)
+#define X265_AARCH64_HWCAP2_SVE2 (1 << 1)
+#define X265_AARCH64_HWCAP2_I8MM (1 << 13)
+
+static inline int aarch64_get_cpu_flags() // Linux path: builds an X265_CPU_* bitmask from the AT_HWCAP / AT_HWCAP2 words returned by getauxval().
+{
+ int flags = 0;
+// Each HWCAP word is read only when a feature enabled in this build (HAVE_*) tests a bit in it.
+#if HAVE_NEON_DOTPROD || HAVE_SVE
+ unsigned long hwcap = getauxval(AT_HWCAP);
+#endif
+#if HAVE_NEON_I8MM || HAVE_SVE2
+ unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#endif
+// Neon is reported whenever HAVE_NEON is set; no HWCAP bit is tested for it.
+#if HAVE_NEON
+ flags |= X265_CPU_NEON;
+#endif
+#if HAVE_NEON_DOTPROD
+ if (hwcap & X265_AARCH64_HWCAP_ASIMDDP) flags |= X265_CPU_NEON_DOTPROD;
+#endif
+#if HAVE_NEON_I8MM
+ if (hwcap2 & X265_AARCH64_HWCAP2_I8MM) flags |= X265_CPU_NEON_I8MM;
+#endif
+#if HAVE_SVE
+ if (hwcap & X265_AARCH64_HWCAP_SVE) flags |= X265_CPU_SVE;
+#endif
+#if HAVE_SVE2
+ if (hwcap2 & X265_AARCH64_HWCAP2_SVE2) flags |= X265_CPU_SVE2;
+#endif
+// A feature whose HAVE_* macro is unset is never reported, whatever the HWCAP words contain.
+ return flags;
+}
+
+#else // defined(__linux__)
+#error \
+ "Run-time CPU feature detection selected, but no detection method" \
+ "available for your platform. Rerun cmake configure with" \
+ "-DAARCH64_RUNTIME_CPU_DETECT=OFF."
+#endif // defined(__linux__)
+
+static inline int aarch64_cpu_detect() // Run-time variant: returns aarch64_get_cpu_flags() with every flag cleared whose prerequisite flag is absent.
+{
+ int flags = aarch64_get_cpu_flags();
+// Order matters: each check below reads flags that the earlier checks may already have cleared.
+ // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
+ if (!(flags & X265_CPU_NEON_DOTPROD)) flags &= ~X265_CPU_NEON_I8MM;
+
+ // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available.
+ if (!(flags & X265_CPU_NEON_DOTPROD)) flags &= ~X265_CPU_SVE;
+ if (!(flags & X265_CPU_NEON_I8MM)) flags &= ~X265_CPU_SVE;
+
+ // Restrict flags: SVE2 assumes that FEAT_SVE is available.
+ if (!(flags & X265_CPU_SVE)) flags &= ~X265_CPU_SVE2;
+
+ return flags;
+}
+
+#else // if AARCH64_RUNTIME_CPU_DETECT
+
+static inline int aarch64_cpu_detect() // Compile-time variant, used when AARCH64_RUNTIME_CPU_DETECT is not set.
+{
+ int flags = 0;
+// Each flag is set purely from its HAVE_* macro; the running CPU is not queried.
+#if HAVE_NEON
+ flags |= X265_CPU_NEON;
+#endif
+#if HAVE_NEON_DOTPROD
+ flags |= X265_CPU_NEON_DOTPROD;
+#endif
+#if HAVE_NEON_I8MM
+ flags |= X265_CPU_NEON_I8MM;
+#endif
+#if HAVE_SVE
+ flags |= X265_CPU_SVE;
+#endif
+#if HAVE_SVE2
+ flags |= X265_CPU_SVE2;
+#endif
+ return flags;
+}
+
+#endif // if AARCH64_RUNTIME_CPU_DETECT
+
+#endif // ifndef X265_COMMON_AARCH64_CPU_H
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index 485aa681f..ae0907890 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -390,6 +390,7 @@ uint32_t cpu_detect(bool benableavx512)
}
#elif X265_ARCH_ARM64
+#include "aarch64/cpu.h"
uint32_t cpu_detect(bool benableavx512)
{
@@ -397,21 +398,7 @@ uint32_t cpu_detect(bool benableavx512)
int flags = 0;
#ifdef ENABLE_ASSEMBLY
- #if HAVE_NEON
- flags |= X265_CPU_NEON;
- #endif
- #if HAVE_NEON_DOTPROD
- flags |= X265_CPU_NEON_DOTPROD;
- #endif
- #if HAVE_NEON_I8MM
- flags |= X265_CPU_NEON_I8MM;
- #endif
- #if HAVE_SVE
- flags |= X265_CPU_SVE;
- #endif
- #if HAVE_SVE2
- flags |= X265_CPU_SVE2;
- #endif
+ flags = aarch64_cpu_detect();
#endif
return flags;
diff --git a/source/common/param.cpp b/source/common/param.cpp
index da039d914..1beb3c056 100755
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -1587,8 +1587,10 @@ int parseCpuName(const char* value, bool& bError, bool bEnableavx512)
}
free(buf);
+#if X265_ARCH_X86
if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE2_IS_SLOW))
cpu |= X265_CPU_SSE2_IS_FAST;
+#endif
}
return cpu;
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index ac93e37b3..b8ef760f2 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -120,6 +120,7 @@ int main(int argc, char *argv[])
}
else if (!strncmp(name, "cpuid", strlen(name)))
{
+ int cpu_detect_cpuid = cpuid;
bool bError = false;
cpuid = parseCpuName(value, bError, enableavx512);
if (bError)
@@ -127,6 +128,11 @@ int main(int argc, char *argv[])
printf("Invalid CPU name: %s\n", value);
return 1;
}
+ else if ((cpuid & cpu_detect_cpuid) != cpuid)
+ {
+ printf("Feature detection conflicts with provided --cpuid: %s\n", value);
+ return 1;
+ }
i += 2;
}
else if (!strncmp(name, "testbench", strlen(name)))
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0002-AArch64-Add-run-time-CPU-feature-detection.patch
Type: text/x-patch
Size: 17493 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241106/d992799d/attachment-0001.bin>
More information about the x265-devel
mailing list