[x265] [PATCH] AArch64: Runtime CPU feature detection

Dash Santosh dash.sathyanarayanan at multicorewareinc.com
Thu Oct 3 14:03:47 UTC 2024


Fixed typos and updated TODO. Please find the updated patch below:

>From c116db02bd50faa59c3d2b1c63bd6816d6dec2a0 Mon Sep 17 00:00:00 2001
From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
Date: Thu, 3 Oct 2024 05:24:16 -0700
Subject: [PATCH] AArch64: Runtime CPU feature detection

---
 .../make-aarch64-w64-mingw32-Makefiles.sh     |  8 ++
 .../msys/toolchain-aarch64-w64-mingw32.cmake  |  8 ++
 source/CMakeLists.txt                         | 21 +++--
 source/common/CMakeLists.txt                  |  5 ++
 source/common/cpu.cpp                         | 81 ++++++++++++++++++-
 5 files changed, 114 insertions(+), 9 deletions(-)
 create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
 create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake

diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
new file mode 100644
index 000000000..eceffa4a9
--- /dev/null
+++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# This will generate a cross-compile environment, compiling an AArch64
+# Windows target with the aarch64-w64-mingw32 toolchain from an MSYS/MinGW
+# host.  If your MinGW install already targets AArch64 natively, you can
+# use the native script instead: make-Makefiles.sh
+
+cmake -G "MSYS Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source && cmake-gui ../../source
diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
b/build/msys/toolchain-aarch64-w64-mingw32.cmake
new file mode 100644
index 000000000..6607bdf64
--- /dev/null
+++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
@@ -0,0 +1,8 @@
+set(CMAKE_SYSTEM_NAME Windows)                      # target operating system
+set(CMAKE_SYSTEM_PROCESSOR aarch64)                 # target CPU architecture
+set(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)       # cross C compiler
+set(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)     # cross C++ compiler
+set(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)  # Windows resource compiler
+set(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)        # archive indexer
+set(CMAKE_ASM_YASM_COMPILER yasm)                   # NOTE(review): yasm is an x86 assembler; confirm this is needed for AArch64
+set(CROSS_COMPILE_ARM64 1)                          # tested by source/common/CMakeLists.txt (ARM64 OR CROSS_COMPILE_ARM64)
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 13bc8ccfe..d1fe38559 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -303,10 +303,12 @@ if(GCC)
             endif()
         endif()

+        set(ARM64_ARCH_ARGS "-O3")
         if(CPU_HAS_NEON_DOTPROD)
             # Neon DotProd is mandatory from Armv8.4.
             message(STATUS "Found Neon DotProd")
-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
+            set(ARM_ARGS -O3)
             add_definitions(-DHAVE_NEON_DOTPROD=1)
         endif()
         if(CPU_HAS_NEON_I8MM)
@@ -316,7 +318,8 @@ if(GCC)
             if(NOT CPU_HAS_NEON_DOTPROD)
                 message(FATAL_ERROR "Unsupported AArch64 feature
combination (Neon I8MM without Neon DotProd)")
             endif()
-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
+            set(ARM_ARGS -O3)
             add_definitions(-DHAVE_NEON_I8MM=1)
         endif()
         if(CPU_HAS_SVE)
@@ -325,13 +328,15 @@ if(GCC)
             if(NOT CPU_HAS_NEON_I8MM)
                 message(FATAL_ERROR "Unsupported AArch64 feature
combination (SVE without Neon I8MM)")
             endif()
-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
+            set(ARM_ARGS -O3)
             add_definitions(-DHAVE_SVE=1)
         endif()
         if(CPU_HAS_SVE2)
             message(STATUS "Found SVE2")
             # SVE2 is only available from Armv9.0, and armv9-a implies
+dotprod
-            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
+            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
+            set(ARM_ARGS -O3)
             add_definitions(-DHAVE_SVE2=1)
         endif()
         set(ARM_ARGS ${ARM_ARGS} -fPIC)
@@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
             add_custom_command(
                 OUTPUT ${ASM}.${SUFFIX}
                 COMMAND ${CMAKE_CXX_COMPILER}
-                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
                 DEPENDS ${ASM_SRC})
         endforeach()
         if(CPU_HAS_SVE2)
@@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
                 add_custom_command(
                     OUTPUT ${ASM}.${SUFFIX}
                     COMMAND ${CMAKE_CXX_COMPILER}
-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
@@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
                 add_custom_command(
                     OUTPUT ${ASM}.${SUFFIX}
                     COMMAND ${CMAKE_CXX_COMPILER}
-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
@@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
                 add_custom_command(
                     OUTPUT ${ASM}.${SUFFIX}
                     COMMAND ${CMAKE_CXX_COMPILER}
-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
                     DEPENDS ${ASM_SRC})
             endforeach()
         endif()
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index dc4a74107..33025cada 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm
Assembly Sources that use the Neon DotProd extension")
     foreach(SRC ${C_SRCS_NEON})
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
${ARM64_ARCH_ARGS} )
     endforeach()

     if(CPU_HAS_NEON_I8MM)
         foreach(SRC ${C_SRCS_NEON_I8MM})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
         endforeach()
     endif()

     if(CPU_HAS_NEON_DOTPROD)
         foreach(SRC ${C_SRCS_NEON_DOTPROD})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
         endforeach()
     endif()

     if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
         foreach(SRC ${C_SRCS_SVE})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
         endforeach()
     endif()

     if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
         foreach(SRC ${C_SRCS_SVE2})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+            set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
         endforeach()
     endif()

diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index 61cdaadfb..2d4b15dc9 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)

 #elif X265_ARCH_ARM64

-uint32_t cpu_detect(bool benableavx512)
+#if defined(_MSC_VER) || defined(__APPLE__)
+uint32_t cpu_detect(bool /*benableavx512*/)
 {
     int flags = 0;

@@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
     return flags;
 }

+// TODO: Remove isOryonCPU() once Windows defines PF_ flag for I8MM on
supported ARM64 devices
+#elif defined(__MINGW64__) // Windows+Aarch64
+
+#include <windows.h>
+#include <processthreadsapi.h>
+
+// Returns true when CPU 0's registry ProcessorNameString contains "Oryon".
+bool isOryonCPU()
+{
+    char processorName[128] = { 0 };
+    DWORD bufferSize = sizeof(processorName);
+
+    // RRF_RT_REG_SZ: only accept string data, which RegGetValueA NUL-terminates.
+    LONG result = RegGetValueA(HKEY_LOCAL_MACHINE,
+                               "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                               "ProcessorNameString", RRF_RT_REG_SZ, NULL,
+                               processorName, &bufferSize);
+    // On failure the buffer contents are unspecified, so never search it.
+    if (result != ERROR_SUCCESS)
+        return false;
+    return strstr(processorName, "Oryon") != NULL;
+}
+uint32_t cpu_detect(bool /*benableavx512*/) // MinGW (Windows+AArch64) variant; the AVX-512 switch is unused here
+{
+
+    int flags = 0; // accumulates X265_CPU_* feature bits
+
+    #if HAVE_NEON
+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
+    #endif
+    #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) // branch is compiled out when the headers lack this PF_ constant
+         flags |=
IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
X265_CPU_NEON_DOTPROD : 0;
+    #endif
+    #if HAVE_NEON_I8MM // detected through the CPU name check in isOryonCPU(), not through a PF_ query
+         flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
+    #endif
+    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) // same header guard as the DotProd branch
+         flags |=
IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
: 0;
+    #endif
+    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) // same header guard as the DotProd branch
+         flags |=
IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
X265_CPU_SVE2 : 0;
+    #endif
+
+    return flags; // int is converted to the uint32_t return type
+} // end of Windows+Aarch64
+
+#else // Linux+Aarch64
+
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+
+uint32_t cpu_detect(bool /*benableavx512*/) // Linux+AArch64 variant; the AVX-512 switch is unused here
+{
+    unsigned long hwcaps = getauxval(AT_HWCAP);   // tested below for HWCAP_ASIMDDP and HWCAP_SVE
+    unsigned long hwcaps2 = getauxval(AT_HWCAP2); // tested below for HWCAP2_I8MM and HWCAP2_SVE2
+
+    int flags = 0; // accumulates X265_CPU_* feature bits
+
+    #if HAVE_NEON
+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
+    #endif
+    #if HAVE_NEON_DOTPROD // each optional feature needs both the build-time HAVE_* define and the runtime hwcap bit
+         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
+    #endif
+    #if HAVE_NEON_I8MM
+         flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
+    #endif
+    #if HAVE_SVE
+         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
+    #endif
+    #if HAVE_SVE2
+         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
+    #endif
+
+    return flags; // int is converted to the uint32_t return type
+}
+#endif // end of Linux+AArch64
+
 #elif X265_ARCH_POWER8

 uint32_t cpu_detect(bool benableavx512)
-- 
2.45.2


On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
dash.sathyanarayanan at multicorewareinc.com> wrote:

> From 7d2353aaf7509721461c141f2800962c15ff440c Mon Sep 17 00:00:00 2001
> From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
> Date: Wed, 2 Oct 2024 21:59:59 -0700
> Subject: [PATCH] AArch64: Runtime CPU feature detection
>
> ---
>  .../make-aarch64-w64-mingw32-Makefiles.sh     |  8 ++
>  .../msys/toolchain-aarch64-w64-mingw32.cmake  |  8 ++
>  source/CMakeLists.txt                         | 21 +++--
>  source/common/CMakeLists.txt                  |  5 ++
>  source/common/cpu.cpp                         | 81 ++++++++++++++++++-
>  5 files changed, 114 insertions(+), 9 deletions(-)
>  create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>  create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
>
> diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> new file mode 100644
> index 000000000..eceffa4a9
> --- /dev/null
> +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> @@ -0,0 +1,8 @@
> +#!/bin/sh
> +
> +# This will generate a cross-compile environment, compiling an aarch64
> +# Win64 target from a 32bit MinGW32 host environment.  If your MinGW
> +# install is 64bit, you can use the native compiler batch file:
> +# make-Makefiles.sh
> +
> +cmake -G "MSYS Makefiles"
> -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source &&
> cmake-gui ../../source
> diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
> b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> new file mode 100644
> index 000000000..6607bdf64
> --- /dev/null
> +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> @@ -0,0 +1,8 @@
> +SET(CMAKE_SYSTEM_NAME Windows)
> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
> +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)
> +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)
> +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)
> +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)
> +SET(CMAKE_ASM_YASM_COMPILER yasm)
> +SET(CROSS_COMPILE_ARM64 1)
> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
> index 13bc8ccfe..d1fe38559 100755
> --- a/source/CMakeLists.txt
> +++ b/source/CMakeLists.txt
> @@ -303,10 +303,12 @@ if(GCC)
>              endif()
>          endif()
>
> +        set(ARM64_ARCH_ARGS "-O3")
>          if(CPU_HAS_NEON_DOTPROD)
>              # Neon DotProd is mandatory from Armv8.4.
>              message(STATUS "Found Neon DotProd")
> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_NEON_DOTPROD=1)
>          endif()
>          if(CPU_HAS_NEON_I8MM)
> @@ -316,7 +318,8 @@ if(GCC)
>              if(NOT CPU_HAS_NEON_DOTPROD)
>                  message(FATAL_ERROR "Unsupported AArch64 feature
> combination (Neon I8MM without Neon DotProd)")
>              endif()
> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_NEON_I8MM=1)
>          endif()
>          if(CPU_HAS_SVE)
> @@ -325,13 +328,15 @@ if(GCC)
>              if(NOT CPU_HAS_NEON_I8MM)
>                  message(FATAL_ERROR "Unsupported AArch64 feature
> combination (SVE without Neon I8MM)")
>              endif()
> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_SVE=1)
>          endif()
>          if(CPU_HAS_SVE2)
>              message(STATUS "Found SVE2")
>              # SVE2 is only available from Armv9.0, and armv9-a implies
> +dotprod
> -            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
> +            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_SVE2=1)
>          endif()
>          set(ARM_ARGS ${ARM_ARGS} -fPIC)
> @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>              add_custom_command(
>                  OUTPUT ${ASM}.${SUFFIX}
>                  COMMAND ${CMAKE_CXX_COMPILER}
> -                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                  DEPENDS ${ASM_SRC})
>          endforeach()
>          if(CPU_HAS_SVE2)
> @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                  add_custom_command(
>                      OUTPUT ${ASM}.${SUFFIX}
>                      COMMAND ${CMAKE_CXX_COMPILER}
> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                      DEPENDS ${ASM_SRC})
>              endforeach()
>          endif()
> @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                  add_custom_command(
>                      OUTPUT ${ASM}.${SUFFIX}
>                      COMMAND ${CMAKE_CXX_COMPILER}
> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                      DEPENDS ${ASM_SRC})
>              endforeach()
>          endif()
> @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                  add_custom_command(
>                      OUTPUT ${ASM}.${SUFFIX}
>                      COMMAND ${CMAKE_CXX_COMPILER}
> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                      DEPENDS ${ASM_SRC})
>              endforeach()
>          endif()
> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
> index dc4a74107..33025cada 100644
> --- a/source/common/CMakeLists.txt
> +++ b/source/common/CMakeLists.txt
> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
> CROSS_COMPILE_ARM64))
>      set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
> "Arm Assembly Sources that use the Neon DotProd extension")
>      foreach(SRC ${C_SRCS_NEON})
>          set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
> ${ARM64_ARCH_ARGS} )
>      endforeach()
>
>      if(CPU_HAS_NEON_I8MM)
>          foreach(SRC ${C_SRCS_NEON_I8MM})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
>      if(CPU_HAS_NEON_DOTPROD)
>          foreach(SRC ${C_SRCS_NEON_DOTPROD})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
>      if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>          foreach(SRC ${C_SRCS_SVE})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
>      if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>          foreach(SRC ${C_SRCS_SVE2})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
> index 61cdaadfb..a2b0ac081 100644
> --- a/source/common/cpu.cpp
> +++ b/source/common/cpu.cpp
> @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>
>  #elif X265_ARCH_ARM64
>
> -uint32_t cpu_detect(bool benableavx512)
> +#if defined(_MSC_VER) || defined(__APPLE__)
> +uint32_t cpu_detect(bool /*benableavx512*/)
>  {
>      int flags = 0;
>
> @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
>      return flags;
>  }
>
> +// TODO: Support ARM on Windows
> +#elif defined(__MINGW64__)
> +
> +#include <windows.h>
> +#include <processthreadsapi.h>
> +
> +bool isOryonCPU()
> +{
> +
> +    char processorName[128];
> +    DWORD bufferSize = 128;
> +
> +    LONG result = RegGetValue(HKEY_LOCAL_MACHINE,
> "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
> "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName,
> &bufferSize);
> +    if (strstr(processorName, "Oryon") != NULL)
> +    {
> +        return true;
> +    }
> +    else
> +    {
> +        return false;
> +    }
> +}
> +uint32_t cpu_detect(bool /*benableavx512*/)
> +{
> +
> +    int flags = 0;
> +
> +    #if HAVE_NEON
> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
> +    #endif
> +    #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
> +         flags |=
> IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_NEON_DOTPROD : 0;
> +    #endif
> +    #if HAVE_NEON_I8MM
> +         flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
> +    #endif
> +    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
> +         flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
> : 0;
> +    #endif
> +    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
> +         flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_SVE2 : 0;
> +    #endif
> +
> +    return flags;
> +}
> +
> +#else // Linux+Aarch64
> +
> +#include <asm/hwcap.h>
> +#include <sys/auxv.h>
> +
> +uint32_t cpu_detect(bool /*benableavx5128*/)
> +{
> +    unsigned long hwcaps = getauxval(AT_HWCAP);
> +    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
> +
> +    int flags = 0;
> +
> +    #if HAVE_NEON
> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
> +    #endif
> +    #if HAVE_NEON_DOTPROD
> +         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
> +    #endif
> +    #if HAVE_NEON_I8MM
> +         flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
> +    #endif
> +    #if HAVE_SVE
> +         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
> +    #endif
> +    #if HAVE_SVE2
> +         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
> +    #endif
> +
> +    return flags;
> +}
> +#endif // end of Linux+AArch64
> +
>  #elif X265_ARCH_POWER8
>
>  uint32_t cpu_detect(bool benableavx512)
> --
> 2.45.2
>
>
> On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
> dash.sathyanarayanan at multicorewareinc.com> wrote:
>
>> Hi Hari,
>> Thanks for spotting this. Also added support for Windows on ARM. Please
>> find below the updated patch:
>>
>>
>> On Thu, Sep 26, 2024 at 11:43 AM Dash Santosh <
>> dash.sathyanarayanan at multicorewareinc.com> wrote:
>>
>>> From e9614d170f93f3ad4f01e95abfed0a260f218bd5 Mon Sep 17 00:00:00 2001
>>> From: Min Chen <chenm003 at 163.com>
>>> Date: Sat, 14 Sep 2024 14:25:28 -0700
>>> Subject: [PATCH] AArch64: Runtime CPU feature detection
>>>
>>> ---
>>>  source/CMakeLists.txt        | 20 ++++++++++++--------
>>>  source/common/CMakeLists.txt |  5 +++++
>>>  source/common/cpu.cpp        | 33 +++++++++++++++++++++++++++++++++
>>>  3 files changed, 50 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>>> index 37b83f959..32a99206f 100755
>>> --- a/source/CMakeLists.txt
>>> +++ b/source/CMakeLists.txt
>>> @@ -306,7 +306,8 @@ if(GCC)
>>>          if(CPU_HAS_NEON_DOTPROD)
>>>              # Neon DotProd is mandatory from Armv8.4.
>>>              message(STATUS "Found Neon DotProd")
>>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
>>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
>>> +            set(ARM_ARGS -O3)
>>>              add_definitions(-DHAVE_NEON_DOTPROD=1)
>>>          endif()
>>>          if(CPU_HAS_NEON_I8MM)
>>> @@ -316,7 +317,8 @@ if(GCC)
>>>              if(NOT CPU_HAS_NEON_DOTPROD)
>>>                  message(FATAL_ERROR "Unsupported AArch64 feature
>>> combination (Neon I8MM without Neon DotProd)")
>>>              endif()
>>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
>>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
>>> +            set(ARM_ARGS -O3)
>>>              add_definitions(-DHAVE_NEON_I8MM=1)
>>>          endif()
>>>          if(CPU_HAS_SVE)
>>> @@ -325,13 +327,15 @@ if(GCC)
>>>              if(NOT CPU_HAS_NEON_I8MM)
>>>                  message(FATAL_ERROR "Unsupported AArch64 feature
>>> combination (SVE without Neon I8MM)")
>>>              endif()
>>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
>>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
>>> +            set(ARM_ARGS -O3)
>>>              add_definitions(-DHAVE_SVE=1)
>>>          endif()
>>>          if(CPU_HAS_SVE2)
>>>              message(STATUS "Found SVE2")
>>>              # SVE2 is only available from Armv9.0, and armv9-a implies
>>> +dotprod
>>> -            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
>>> +            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
>>> +            set(ARM_ARGS -O3)
>>>              add_definitions(-DHAVE_SVE2=1)
>>>          endif()
>>>          set(ARM_ARGS ${ARM_ARGS} -fPIC)
>>> @@ -692,7 +696,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>              add_custom_command(
>>>                  OUTPUT ${ASM}.${SUFFIX}
>>>                  COMMAND ${CMAKE_CXX_COMPILER}
>>> -                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> +                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>                  DEPENDS ${ASM_SRC})
>>>          endforeach()
>>>          if(CPU_HAS_SVE2)
>>> @@ -703,7 +707,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>                  add_custom_command(
>>>                      OUTPUT ${ASM}.${SUFFIX}
>>>                      COMMAND ${CMAKE_CXX_COMPILER}
>>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>                      DEPENDS ${ASM_SRC})
>>>              endforeach()
>>>          endif()
>>> @@ -715,7 +719,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>                  add_custom_command(
>>>                      OUTPUT ${ASM}.${SUFFIX}
>>>                      COMMAND ${CMAKE_CXX_COMPILER}
>>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>                      DEPENDS ${ASM_SRC})
>>>              endforeach()
>>>          endif()
>>> @@ -727,7 +731,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>                  add_custom_command(
>>>                      OUTPUT ${ASM}.${SUFFIX}
>>>                      COMMAND ${CMAKE_CXX_COMPILER}
>>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>                      DEPENDS ${ASM_SRC})
>>>              endforeach()
>>>          endif()
>>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>>> index dc4a74107..33025cada 100644
>>> --- a/source/common/CMakeLists.txt
>>> +++ b/source/common/CMakeLists.txt
>>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
>>> CROSS_COMPILE_ARM64))
>>>      set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
>>> "Arm Assembly Sources that use the Neon DotProd extension")
>>>      foreach(SRC ${C_SRCS_NEON})
>>>          set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> +        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
>>> ${ARM64_ARCH_ARGS} )
>>>      endforeach()
>>>
>>>      if(CPU_HAS_NEON_I8MM)
>>>          foreach(SRC ${C_SRCS_NEON_I8MM})
>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>          endforeach()
>>>      endif()
>>>
>>>      if(CPU_HAS_NEON_DOTPROD)
>>>          foreach(SRC ${C_SRCS_NEON_DOTPROD})
>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>          endforeach()
>>>      endif()
>>>
>>>      if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>>>          foreach(SRC ${C_SRCS_SVE})
>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>          endforeach()
>>>      endif()
>>>
>>>      if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>>>          foreach(SRC ${C_SRCS_SVE2})
>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>          endforeach()
>>>      endif()
>>>
>>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
>>> index 61cdaadfb..24c60ff0e 100644
>>> --- a/source/common/cpu.cpp
>>> +++ b/source/common/cpu.cpp
>>> @@ -391,6 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>>>
>>>  #elif X265_ARCH_ARM64
>>>
>>> +// TODO: Support ARM on Windows
>>> +#if _MSC_VER
>>>  uint32_t cpu_detect(bool benableavx512)
>>>  {
>>>      int flags = 0;
>>> @@ -413,6 +415,37 @@ uint32_t cpu_detect(bool benableavx512)
>>>
>>>      return flags;
>>>  }
>>> +#else // Linux+Aarch64
>>> +
>>> +#include <asm/hwcap.h>
>>> +#include <sys/auxv.h>
>>> +
>>> +uint32_t cpu_detect(bool benableavx512)
>>> +{
>>> +    unsigned long hwcaps = getauxval(AT_HWCAP);
>>> +    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
>>> +
>>> +    int flags = 0;
>>> +
>>> +    #if HAVE_NEON
>>> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
>>> +    #endif
>>> +    #if HAVE_NEON_DOTPROD
>>> +         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
>>> +    #endif
>>> +    #if HAVE_NEON_I8MM
>>> +         flags |= (hwcaps2 & HWCAP2_SVEI8MM ? X265_CPU_NEON_I8MM : 0);
>>> +    #endif
>>> +    #if HAVE_SVE
>>> +         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
>>> +    #endif
>>> +    #if HAVE_SVE2
>>> +         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
>>> +    #endif
>>> +
>>> +    return flags;
>>> +}
>>> +#endif // end of Linux+AArch64
>>>
>>>  #elif X265_ARCH_POWER8
>>>
>>> --
>>> 2.43.0.windows.1
>>>
>>>
>>> --
>>>
>>> * <https://multicorewareinc.com/>*
>>>   <https://www.linkedin.com/company/multicoreware-inc/>
>>> <https://twitter.com/MulticoreWare>
>>> <https://www.facebook.com/multicoreware>
>>> <https://www.youtube.com/channel/UCXZ1A1MzS5JwBqwBkNfsBBw?sub_confirmation=1>
>>>    <https://www.instagram.com/multicoreware.inc/>
>>>
>>> *Dash Santosh*
>>>
>>> *Research Engineer, Video Engineering*
>>>
>>> Mobile: +91 78679 43737
>>>
>>> IndiQube Echo Point, Avinashi Road
>>>
>>> Coimbatore - 641 014
>>>
>>>
>>>
>>>
>>>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241003/21c2223d/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v3-0001-AArch64-Runtime-CPU-feature-detection.patch
Type: application/octet-stream
Size: 10247 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241003/21c2223d/attachment-0001.obj>


More information about the x265-devel mailing list