[x265] [PATCH 2/6] AArch64: Add run-time CPU feature detection

Hari Limaye hari.limaye at arm.com
Thu Oct 24 23:27:49 UTC 2024


Add run-time CPU feature detection for AArch64 ISA extensions on Linux,
enabled by the CMake option `AARCH64_RUNTIME_CPU_DETECT`. This option is
enabled by default - for platforms with no detection method implemented
we will fall back to compile-time CPU feature detection.

Also add logic to the testbench to handle the case where the --cpuid
parameter requests features that feature detection did not report, so
that it fails gracefully instead of crashing with SIGILL.
---
 source/CMakeLists.txt        |  55 ++++++++++------
 source/common/CMakeLists.txt |  18 ++++--
 source/common/aarch64/cpu.h  | 120 +++++++++++++++++++++++++++++++++++
 source/common/cpu.cpp        |  19 +-----
 source/common/param.cpp      |   2 +
 source/test/testbench.cpp    |   6 ++
 6 files changed, 178 insertions(+), 42 deletions(-)
 create mode 100644 source/common/aarch64/cpu.h

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 182b57634..cc114bc95 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -87,12 +87,28 @@ elseif(ARM64MATCH GREATER "-1")
 
     option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF)
 
+    option(AARCH64_RUNTIME_CPU_DETECT "Enable AArch64 run-time CPU feature detection" ON)
+    if(AARCH64_RUNTIME_CPU_DETECT AND NOT CMAKE_SYSTEM_NAME MATCHES "Linux") # Only override when actually enabled.
+        set(AARCH64_RUNTIME_CPU_DETECT OFF CACHE BOOL "Enable AArch64 run-time CPU feature detection" FORCE) # FORCE rewrites the help text too, so repeat it.
+        message(STATUS "Run-time CPU feature detection unsupported on this platform")
+    endif()
+
     # Options for manually enabling/disabling AArch64 SIMD extensions.
     option(ENABLE_NEON "Enable Neon" ON)
     option(ENABLE_NEON_DOTPROD "Enable Neon DotProd" ON)
     option(ENABLE_NEON_I8MM "Enable Neon I8MM" ON)
     option(ENABLE_SVE "Enable SVE" ON)
     option(ENABLE_SVE2 "Enable SVE2" ON)
+
+    # Compiler flags for AArch64 extensions.
+    set(AARCH64_NEON_FLAG "-march=armv8-a")
+    # Neon DotProd is mandatory from Armv8.4.
+    set(AARCH64_NEON_DOTPROD_FLAG "-march=armv8.2-a+dotprod")
+    # Neon I8MM is mandatory from Armv8.6.
+    set(AARCH64_NEON_I8MM_FLAG "-march=armv8.2-a+dotprod+i8mm")
+    set(AARCH64_SVE_FLAG "-march=armv8.2-a+dotprod+i8mm+sve")
+    # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod and +sve.
+    set(AARCH64_SVE2_FLAG "-march=armv9-a+i8mm+sve2")
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -268,7 +284,13 @@ if(GCC)
         set(CPU_HAS_NEON 1)
         add_definitions(-DX265_ARCH_ARM64=1)
 
-        if(CROSS_COMPILE_ARM64)
+        if (AARCH64_RUNTIME_CPU_DETECT)
+            add_definitions(-DAARCH64_RUNTIME_CPU_DETECT=1)
+            message(STATUS "Configuring build for run-time CPU feature detection")
+        endif()
+
+        if(AARCH64_RUNTIME_CPU_DETECT OR CROSS_COMPILE_ARM64)
+            # Add all extensions when compiling for run-time CPU feature detection or cross compiling.
             set(CPU_HAS_NEON_DOTPROD 1)
             set(CPU_HAS_NEON_I8MM 1)
             set(CPU_HAS_SVE 1)
@@ -280,7 +302,7 @@ if(GCC)
                 find_package(SVE)
                 find_package(SVE2)
             else()
-                message(STATUS "Compile time feature detection unsupported on this platform")
+                message(STATUS "Compile-time CPU feature detection unsupported on this platform")
             endif()
         endif()
 
@@ -312,33 +334,25 @@ if(GCC)
 
         if(CPU_HAS_NEON)
             message(STATUS "Found Neon")
-            set(ARM_ARGS -O3 -march=armv8-a)
             add_definitions(-DHAVE_NEON=1)
         endif()
         if(CPU_HAS_NEON_DOTPROD)
-            # Neon DotProd is mandatory from Armv8.4.
             message(STATUS "Found Neon DotProd")
-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
             add_definitions(-DHAVE_NEON_DOTPROD=1)
         endif()
         if(CPU_HAS_NEON_I8MM)
-            # Neon I8MM is mandatory from Armv8.6.
             message(STATUS "Found Neon I8MM")
-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
             add_definitions(-DHAVE_NEON_I8MM=1)
         endif()
         if(CPU_HAS_SVE)
             message(STATUS "Found SVE")
-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
             add_definitions(-DHAVE_SVE=1)
         endif()
         if(CPU_HAS_SVE2)
             message(STATUS "Found SVE2")
-            # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
-            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
             add_definitions(-DHAVE_SVE2=1)
         endif()
-        set(ARM_ARGS ${ARM_ARGS} -fPIC)
+        set(ARM_ARGS -O3 -fPIC)
         # Do not allow implicit vector type conversions in Clang builds (this
         # is already the default in GCC builds).
         check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE)
@@ -356,7 +370,8 @@ int main() { return 0; }")
             set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
             # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas
             # ARM_ARGS is defined and used elsewhere as a ;-list.
-            foreach(ARM_ARG ${ARM_ARGS})
+            # Add `-march=...+sve` so the test functions correctly with Clang.
+            foreach(ARM_ARG ${ARM_ARGS} ${AARCH64_SVE_FLAG})
                 string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}")
             endforeach()
             check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED)
@@ -704,18 +719,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
             add_custom_command(
                 OUTPUT ${ASM}.${SUFFIX}
                 COMMAND ${CMAKE_CXX_COMPILER}
-                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
                 DEPENDS ${ASM_SRC})
         endforeach()
-        if(CPU_HAS_SVE2)
-            foreach(ASM ${ARM_ASMS_SVE2})
+        if(CPU_HAS_NEON_DOTPROD)
+            foreach(ASM ${ARM_ASMS_NEON_DOTPROD})
                 set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
                 list(APPEND ASM_SRCS ${ASM_SRC})
                 list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
                 add_custom_command(
                     OUTPUT ${ASM}.${SUFFIX}
                     COMMAND ${CMAKE_CXX_COMPILER}
-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_DOTPROD_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
@@ -727,19 +742,19 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
                 add_custom_command(
                     OUTPUT ${ASM}.${SUFFIX}
                     COMMAND ${CMAKE_CXX_COMPILER}
-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
-        if(CPU_HAS_NEON_DOTPROD)
-            foreach(ASM ${ARM_ASMS_NEON_DOTPROD})
+        if(CPU_HAS_SVE2)
+            foreach(ASM ${ARM_ASMS_SVE2})
                 set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
                 list(APPEND ASM_SRCS ${ASM_SRC})
                 list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
                 add_custom_command(
                     OUTPUT ${ASM}.${SUFFIX}
                     COMMAND ${CMAKE_CXX_COMPILER}
-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index dc4a74107..aacc0ef62 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -103,6 +103,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
         add_definitions(-DAUTO_VECTORIZE=1)
     endif()
 
+    # Add Arm intrinsics files here.
     set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp  mem-neon.h)
     set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
     set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
@@ -110,11 +111,11 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(C_SRCS_SVE2 sao-prim-sve2.cpp)
     enable_language(ASM)
 
-    # add ARM assembly/intrinsic files here
+    # Add Arm assembly files here.
     set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
+    set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
     set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
-    set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
     set(VEC_PRIMITIVES)
 
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
@@ -123,29 +124,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
     foreach(SRC ${C_SRCS_NEON})
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
     endforeach()
 
-    if(CPU_HAS_NEON_I8MM)
-        foreach(SRC ${C_SRCS_NEON_I8MM})
+    if(CPU_HAS_NEON_DOTPROD)
+        foreach(SRC ${C_SRCS_NEON_DOTPROD})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_DOTPROD_FLAG})
         endforeach()
     endif()
 
-    if(CPU_HAS_NEON_DOTPROD)
-        foreach(SRC ${C_SRCS_NEON_DOTPROD})
+    if(CPU_HAS_NEON_I8MM)
+        foreach(SRC ${C_SRCS_NEON_I8MM})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_I8MM_FLAG})
         endforeach()
     endif()
 
     if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
         foreach(SRC ${C_SRCS_SVE})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE_FLAG})
         endforeach()
     endif()
 
     if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
         foreach(SRC ${C_SRCS_SVE2})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE2_FLAG})
         endforeach()
     endif()
 
diff --git a/source/common/aarch64/cpu.h b/source/common/aarch64/cpu.h
new file mode 100644
index 000000000..88ce2e310
--- /dev/null
+++ b/source/common/aarch64/cpu.h
@@ -0,0 +1,120 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_COMMON_AARCH64_CPU_H
+#define X265_COMMON_AARCH64_CPU_H
+
+#include "x265.h"
+
+#if AARCH64_RUNTIME_CPU_DETECT
+
+#if defined(__linux__)
+
+#include <sys/auxv.h>
+
+#define X265_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define X265_AARCH64_HWCAP_SVE (1 << 22)
+#define X265_AARCH64_HWCAP2_SVE2 (1 << 1)
+#define X265_AARCH64_HWCAP2_I8MM (1 << 13)
+
+static inline int aarch64_get_cpu_flags()
+{
+    // Neon is reported whenever it is compiled in; it is not probed.
+#if HAVE_NEON
+    int cpuFlags = X265_CPU_NEON;
+#else
+    int cpuFlags = 0;
+#endif
+    // Each auxiliary-vector word is read only if a compiled-in feature needs it.
+#if HAVE_NEON_DOTPROD || HAVE_SVE
+    const unsigned long hwcapBits = getauxval(AT_HWCAP);
+#if HAVE_NEON_DOTPROD
+    if (hwcapBits & X265_AARCH64_HWCAP_ASIMDDP) cpuFlags |= X265_CPU_NEON_DOTPROD;
+#endif
+#if HAVE_SVE
+    if (hwcapBits & X265_AARCH64_HWCAP_SVE) cpuFlags |= X265_CPU_SVE;
+#endif
+#endif // HAVE_NEON_DOTPROD || HAVE_SVE
+#if HAVE_NEON_I8MM || HAVE_SVE2
+    const unsigned long hwcap2Bits = getauxval(AT_HWCAP2);
+#if HAVE_NEON_I8MM
+    if (hwcap2Bits & X265_AARCH64_HWCAP2_I8MM) cpuFlags |= X265_CPU_NEON_I8MM;
+#endif
+#if HAVE_SVE2
+    if (hwcap2Bits & X265_AARCH64_HWCAP2_SVE2) cpuFlags |= X265_CPU_SVE2;
+#endif
+#endif // HAVE_NEON_I8MM || HAVE_SVE2
+    return cpuFlags;
+}
+
+#else // defined(__linux__)
+#error                                                                 \
+    "Run-time CPU feature detection selected, but no detection method" \
+    "available for your platform. Rerun cmake configure with"          \
+    "-DAARCH64_RUNTIME_CPU_DETECT=OFF."
+#endif // defined(__linux__)
+
+static inline int aarch64_cpu_detect()
+{
+    int detected = aarch64_get_cpu_flags();
+
+    // FEAT_I8MM and SVE are only kept when FEAT_DotProd was detected.
+    if ((detected & X265_CPU_NEON_DOTPROD) == 0)
+        detected &= ~(X265_CPU_NEON_I8MM | X265_CPU_SVE);
+    // SVE is additionally only kept when FEAT_I8MM was detected.
+    if ((detected & X265_CPU_NEON_I8MM) == 0)
+        detected &= ~X265_CPU_SVE;
+    // SVE2 is only kept when SVE survived the checks above.
+    if ((detected & X265_CPU_SVE) == 0)
+        detected &= ~X265_CPU_SVE2;
+
+    return detected;
+}
+
+#else // if AARCH64_RUNTIME_CPU_DETECT
+
+static inline int aarch64_cpu_detect()
+{
+    // No probing in this configuration: OR together every feature compiled in.
+    return 0
+#if HAVE_NEON
+        | X265_CPU_NEON
+#endif
+#if HAVE_NEON_DOTPROD
+        | X265_CPU_NEON_DOTPROD
+#endif
+#if HAVE_NEON_I8MM
+        | X265_CPU_NEON_I8MM
+#endif
+#if HAVE_SVE
+        | X265_CPU_SVE
+#endif
+#if HAVE_SVE2
+        | X265_CPU_SVE2
+#endif
+        ;
+}
+
+#endif // if AARCH64_RUNTIME_CPU_DETECT
+
+#endif // ifndef X265_COMMON_AARCH64_CPU_H
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index d18aeb8d2..11b1ab34a 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -390,27 +390,14 @@ uint32_t cpu_detect(bool benableavx512)
 }
 
 #elif X265_ARCH_ARM64
+#include "aarch64/cpu.h"
 
-uint32_t cpu_detect(bool benableavx512)
+uint32_t cpu_detect(bool /*benableavx512*/)
 {
     int flags = 0;
 
 #ifdef ENABLE_ASSEMBLY
-    #if HAVE_NEON
-         flags |= X265_CPU_NEON;
-    #endif
-    #if HAVE_NEON_DOTPROD
-         flags |= X265_CPU_NEON_DOTPROD;
-    #endif
-    #if HAVE_NEON_I8MM
-         flags |= X265_CPU_NEON_I8MM;
-    #endif
-    #if HAVE_SVE
-         flags |= X265_CPU_SVE;
-    #endif
-    #if HAVE_SVE2
-         flags |= X265_CPU_SVE2;
-    #endif
+    flags = aarch64_cpu_detect();
 #endif
 
     return flags;
diff --git a/source/common/param.cpp b/source/common/param.cpp
index 71437aa98..fc15a147f 100755
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -1587,8 +1587,10 @@ int parseCpuName(const char* value, bool& bError, bool bEnableavx512)
         }
 
         free(buf);
+#if X265_ARCH_X86
         if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE2_IS_SLOW))
             cpu |= X265_CPU_SSE2_IS_FAST;
+#endif
     }
 
     return cpu;
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index ac93e37b3..b8ef760f2 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -120,6 +120,7 @@ int main(int argc, char *argv[])
         }
         else if (!strncmp(name, "cpuid", strlen(name)))
         {
+            int cpu_detect_cpuid = cpuid;
             bool bError = false;
             cpuid = parseCpuName(value, bError, enableavx512);
             if (bError)
@@ -127,6 +128,11 @@ int main(int argc, char *argv[])
                 printf("Invalid CPU name: %s\n", value);
                 return 1;
             }
+            else if ((cpuid & cpu_detect_cpuid) != cpuid)
+            {
+                printf("Feature detection conflicts with provided --cpuid: %s\n", value);
+                return 1;
+            }
             i += 2;
         }
         else if (!strncmp(name, "testbench", strlen(name)))
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AArch64-Add-run-time-CPU-feature-detection.patch
Type: text/x-patch
Size: 17533 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241025/22c7dd81/attachment-0001.bin>


More information about the x265-devel mailing list