[x265] [PATCH] AArch64: Runtime CPU feature detection
Dash Santosh
dash.sathyanarayanan at multicorewareinc.com
Thu Oct 3 14:03:47 UTC 2024
Fixed typos and updated TODO. Please find the updated patch below:
From c116db02bd50faa59c3d2b1c63bd6816d6dec2a0 Mon Sep 17 00:00:00 2001
From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
Date: Thu, 3 Oct 2024 05:24:16 -0700
Subject: [PATCH] AArch64: Runtime CPU feature detection
---
.../make-aarch64-w64-mingw32-Makefiles.sh | 8 ++
.../msys/toolchain-aarch64-w64-mingw32.cmake | 8 ++
source/CMakeLists.txt | 21 +++--
source/common/CMakeLists.txt | 5 ++
source/common/cpu.cpp | 81 ++++++++++++++++++-
5 files changed, 114 insertions(+), 9 deletions(-)
create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
new file mode 100644
index 000000000..eceffa4a9
--- /dev/null
+++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# This will generate a cross-compile environment, compiling an AArch64
+# Windows target with the aarch64-w64-mingw32 toolchain from an MSYS
+# shell. If your MinGW install already targets AArch64 natively, you
+# can use the native compiler batch file: make-Makefiles.sh
+
+cmake -G "MSYS Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source && cmake-gui ../../source
diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake b/build/msys/toolchain-aarch64-w64-mingw32.cmake
new file mode 100644
index 000000000..6607bdf64
--- /dev/null
+++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
@@ -0,0 +1,8 @@
+set(CMAKE_SYSTEM_NAME Windows)                      # target operating system of the cross build
+set(CMAKE_SYSTEM_PROCESSOR aarch64)                 # target CPU architecture
+set(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)       # C cross compiler
+set(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)     # C++ cross compiler
+set(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)  # Windows resource compiler
+set(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)        # static-archive indexer
+set(CMAKE_ASM_YASM_COMPILER yasm)                   # NOTE(review): yasm is an x86 assembler; presumably unused for AArch64 - confirm
+set(CROSS_COMPILE_ARM64 1)                          # tested next to ARM64 by the x265 CMakeLists to enable the AArch64 paths
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 13bc8ccfe..d1fe38559 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -303,10 +303,12 @@ if(GCC)
endif()
endif()
+ set(ARM64_ARCH_ARGS "-O3")
if(CPU_HAS_NEON_DOTPROD)
# Neon DotProd is mandatory from Armv8.4.
message(STATUS "Found Neon DotProd")
- set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
+ set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
+ set(ARM_ARGS -O3)
add_definitions(-DHAVE_NEON_DOTPROD=1)
endif()
if(CPU_HAS_NEON_I8MM)
@@ -316,7 +318,8 @@ if(GCC)
if(NOT CPU_HAS_NEON_DOTPROD)
message(FATAL_ERROR "Unsupported AArch64 feature
combination (Neon I8MM without Neon DotProd)")
endif()
- set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
+ set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
+ set(ARM_ARGS -O3)
add_definitions(-DHAVE_NEON_I8MM=1)
endif()
if(CPU_HAS_SVE)
@@ -325,13 +328,15 @@ if(GCC)
if(NOT CPU_HAS_NEON_I8MM)
message(FATAL_ERROR "Unsupported AArch64 feature
combination (SVE without Neon I8MM)")
endif()
- set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
+ set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
+ set(ARM_ARGS -O3)
add_definitions(-DHAVE_SVE=1)
endif()
if(CPU_HAS_SVE2)
message(STATUS "Found SVE2")
# SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
- set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
+ set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
+ set(ARM_ARGS -O3)
add_definitions(-DHAVE_SVE2=1)
endif()
set(ARM_ARGS ${ARM_ARGS} -fPIC)
@@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
if(CPU_HAS_SVE2)
@@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
endif()
@@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
endif()
@@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
add_custom_command(
OUTPUT ${ASM}.${SUFFIX}
COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
${ASM}.${SUFFIX}
+ ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
${ASM_SRC} -o ${ASM}.${SUFFIX}
DEPENDS ${ASM_SRC})
endforeach()
endif()
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index dc4a74107..33025cada 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm
Assembly Sources that use the Neon DotProd extension")
foreach(SRC ${C_SRCS_NEON})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
${ARM64_ARCH_ARGS} )
endforeach()
if(CPU_HAS_NEON_I8MM)
foreach(SRC ${C_SRCS_NEON_I8MM})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
endforeach()
endif()
if(CPU_HAS_NEON_DOTPROD)
foreach(SRC ${C_SRCS_NEON_DOTPROD})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
endforeach()
endif()
if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
foreach(SRC ${C_SRCS_SVE})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
endforeach()
endif()
if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
foreach(SRC ${C_SRCS_SVE2})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ set_source_files_properties( aarch64/${SRC} PROPERTIES
COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
endforeach()
endif()
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index 61cdaadfb..2d4b15dc9 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
#elif X265_ARCH_ARM64
-uint32_t cpu_detect(bool benableavx512)
+#if defined(_MSC_VER) || defined(__APPLE__)
+uint32_t cpu_detect(bool /*benableavx512*/)
{
int flags = 0;
@@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
return flags;
}
+// TODO: Remove isOryonCPU() once Windows defines PF_ flag for I8MM on supported ARM64 devices
+#elif defined(__MINGW64__) // Windows+Aarch64
+
+#include <windows.h>
+#include <processthreadsapi.h>
+
+bool isOryonCPU()
+{
+    // Returns true when CPU 0's registry name contains "Oryon"; false on any lookup failure.
+    char processorName[128] = { 0 }; // zero-filled so the buffer is always a valid C string
+    DWORD bufferSize = sizeof(processorName); // in: capacity in bytes, out: bytes written
+    // RegGetValueA + RRF_RT_REG_SZ: ANSI string data only, NUL-terminated by the API.
+    LONG result = RegGetValueA(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "ProcessorNameString", RRF_RT_REG_SZ, NULL, processorName, &bufferSize);
+    if (result != ERROR_SUCCESS)
+    {
+        return false; // value missing, not a string, or too long: never inspect the buffer
+    }
+    else
+    {
+        return strstr(processorName, "Oryon") != NULL;
+    }
+}
+uint32_t cpu_detect(bool /*benableavx512*/)
+{
+    // MinGW AArch64 build: OR together the X265_CPU_* bits that are compiled in and reported at run time.
+    int cpuFlags = 0;
+
+    #if HAVE_NEON
+    cpuFlags |= X265_CPU_NEON; // All of ARM64 has NEON
+    #endif
+    #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+    if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) cpuFlags |= X265_CPU_NEON_DOTPROD;
+    #endif
+    #if HAVE_NEON_I8MM
+    if (isOryonCPU()) cpuFlags |= X265_CPU_NEON_I8MM; // decided by the processor-name check, not a PF_ query
+    #endif
+    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
+    if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) cpuFlags |= X265_CPU_SVE;
+    #endif
+    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
+    if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) cpuFlags |= X265_CPU_SVE2;
+    #endif
+
+    return cpuFlags;
+} // end of Windows+Aarch64
+
+#else // Linux+Aarch64
+
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+
+uint32_t cpu_detect(bool /*benableavx512*/)
+{
+    // Linux AArch64 build: map the AT_HWCAP / AT_HWCAP2 auxiliary-vector bits onto X265_CPU_* flags.
+    const unsigned long hwcapWord1 = getauxval(AT_HWCAP);
+    const unsigned long hwcapWord2 = getauxval(AT_HWCAP2);
+    int cpuFlags = 0;
+
+    #if HAVE_NEON
+    cpuFlags |= X265_CPU_NEON; // All of ARM64 has NEON
+    #endif
+    #if HAVE_NEON_DOTPROD
+    if (hwcapWord1 & HWCAP_ASIMDDP) cpuFlags |= X265_CPU_NEON_DOTPROD;
+    #endif
+    #if HAVE_NEON_I8MM
+    if (hwcapWord2 & HWCAP2_I8MM) cpuFlags |= X265_CPU_NEON_I8MM;
+    #endif
+    #if HAVE_SVE
+    if (hwcapWord1 & HWCAP_SVE) cpuFlags |= X265_CPU_SVE;
+    #endif
+    #if HAVE_SVE2
+    if (hwcapWord2 & HWCAP2_SVE2) cpuFlags |= X265_CPU_SVE2;
+    #endif
+
+    return cpuFlags;
+}
+#endif // end of Linux+AArch64
+
#elif X265_ARCH_POWER8
uint32_t cpu_detect(bool benableavx512)
--
2.45.2
On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
dash.sathyanarayanan at multicorewareinc.com> wrote:
> From 7d2353aaf7509721461c141f2800962c15ff440c Mon Sep 17 00:00:00 2001
> From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
> Date: Wed, 2 Oct 2024 21:59:59 -0700
> Subject: [PATCH] AArch64: Runtime CPU feature detection
>
> ---
> .../make-aarch64-w64-mingw32-Makefiles.sh | 8 ++
> .../msys/toolchain-aarch64-w64-mingw32.cmake | 8 ++
> source/CMakeLists.txt | 21 +++--
> source/common/CMakeLists.txt | 5 ++
> source/common/cpu.cpp | 81 ++++++++++++++++++-
> 5 files changed, 114 insertions(+), 9 deletions(-)
> create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
>
> diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> new file mode 100644
> index 000000000..eceffa4a9
> --- /dev/null
> +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> @@ -0,0 +1,8 @@
> +#!/bin/sh
> +
> +# This will generate a cross-compile environment, compiling an aarch64
> +# Win64 target from a 32bit MinGW32 host environment. If your MinGW
> +# install is 64bit, you can use the native compiler batch file:
> +# make-Makefiles.sh
> +
> +cmake -G "MSYS Makefiles"
> -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source &&
> cmake-gui ../../source
> diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
> b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> new file mode 100644
> index 000000000..6607bdf64
> --- /dev/null
> +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> @@ -0,0 +1,8 @@
> +SET(CMAKE_SYSTEM_NAME Windows)
> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
> +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)
> +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)
> +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)
> +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)
> +SET(CMAKE_ASM_YASM_COMPILER yasm)
> +SET(CROSS_COMPILE_ARM64 1)
> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
> index 13bc8ccfe..d1fe38559 100755
> --- a/source/CMakeLists.txt
> +++ b/source/CMakeLists.txt
> @@ -303,10 +303,12 @@ if(GCC)
> endif()
> endif()
>
> + set(ARM64_ARCH_ARGS "-O3")
> if(CPU_HAS_NEON_DOTPROD)
> # Neon DotProd is mandatory from Armv8.4.
> message(STATUS "Found Neon DotProd")
> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_NEON_DOTPROD=1)
> endif()
> if(CPU_HAS_NEON_I8MM)
> @@ -316,7 +318,8 @@ if(GCC)
> if(NOT CPU_HAS_NEON_DOTPROD)
> message(FATAL_ERROR "Unsupported AArch64 feature
> combination (Neon I8MM without Neon DotProd)")
> endif()
> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_NEON_I8MM=1)
> endif()
> if(CPU_HAS_SVE)
> @@ -325,13 +328,15 @@ if(GCC)
> if(NOT CPU_HAS_NEON_I8MM)
> message(FATAL_ERROR "Unsupported AArch64 feature
> combination (SVE without Neon I8MM)")
> endif()
> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_SVE=1)
> endif()
> if(CPU_HAS_SVE2)
> message(STATUS "Found SVE2")
> # SVE2 is only available from Armv9.0, and armv9-a implies
> +dotprod
> - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
> + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_SVE2=1)
> endif()
> set(ARM_ARGS ${ARM_ARGS} -fPIC)
> @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> if(CPU_HAS_SVE2)
> @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
> @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
> @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
> index dc4a74107..33025cada 100644
> --- a/source/common/CMakeLists.txt
> +++ b/source/common/CMakeLists.txt
> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
> CROSS_COMPILE_ARM64))
> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
> "Arm Assembly Sources that use the Neon DotProd extension")
> foreach(SRC ${C_SRCS_NEON})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
> ${ARM64_ARCH_ARGS} )
> endforeach()
>
> if(CPU_HAS_NEON_I8MM)
> foreach(SRC ${C_SRCS_NEON_I8MM})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> if(CPU_HAS_NEON_DOTPROD)
> foreach(SRC ${C_SRCS_NEON_DOTPROD})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
> foreach(SRC ${C_SRCS_SVE})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
> foreach(SRC ${C_SRCS_SVE2})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
> index 61cdaadfb..a2b0ac081 100644
> --- a/source/common/cpu.cpp
> +++ b/source/common/cpu.cpp
> @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>
> #elif X265_ARCH_ARM64
>
> -uint32_t cpu_detect(bool benableavx512)
> +#if defined(_MSC_VER) || defined(__APPLE__)
> +uint32_t cpu_detect(bool /*benableavx512*/)
> {
> int flags = 0;
>
> @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
> return flags;
> }
>
> +// TODO: Support ARM on Windows
> +#elif defined(__MINGW64__)
> +
> +#include <windows.h>
> +#include <processthreadsapi.h>
> +
> +bool isOryonCPU()
> +{
> +
> + char processorName[128];
> + DWORD bufferSize = 128;
> +
> + LONG result = RegGetValue(HKEY_LOCAL_MACHINE,
> "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
> "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName,
> &bufferSize);
> + if (strstr(processorName, "Oryon") != NULL)
> + {
> + return true;
> + }
> + else
> + {
> + return false;
> + }
> +}
> +uint32_t cpu_detect(bool /*benableavx512*/)
> +{
> +
> + int flags = 0;
> +
> + #if HAVE_NEON
> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
> + #endif
> + #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
> + flags |=
> IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_NEON_DOTPROD : 0;
> + #endif
> + #if HAVE_NEON_I8MM
> + flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
> + #endif
> + #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
> + flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
> : 0;
> + #endif
> + #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
> + flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_SVE2 : 0;
> + #endif
> +
> + return flags;
> +}
> +
> +#else // Linux+Aarch64
> +
> +#include <asm/hwcap.h>
> +#include <sys/auxv.h>
> +
> +uint32_t cpu_detect(bool /*benableavx5128*/)
> +{
> + unsigned long hwcaps = getauxval(AT_HWCAP);
> + unsigned long hwcaps2 = getauxval(AT_HWCAP2);
> +
> + int flags = 0;
> +
> + #if HAVE_NEON
> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
> + #endif
> + #if HAVE_NEON_DOTPROD
> + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
> + #endif
> + #if HAVE_NEON_I8MM
> + flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
> + #endif
> + #if HAVE_SVE
> + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
> + #endif
> + #if HAVE_SVE2
> + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
> + #endif
> +
> + return flags;
> +}
> +#endif // end of Linux+AArch64
> +
> #elif X265_ARCH_POWER8
>
> uint32_t cpu_detect(bool benableavx512)
> --
> 2.45.2
>
>
> On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
> dash.sathyanarayanan at multicorewareinc.com> wrote:
>
>> Hi Hari,
>> Thanks for spotting this. Also added support for Windows on ARM. Please
>> find below the updated patch:
>>
>>
>> On Thu, Sep 26, 2024 at 11:43 AM Dash Santosh <
>> dash.sathyanarayanan at multicorewareinc.com> wrote:
>>
>>> From e9614d170f93f3ad4f01e95abfed0a260f218bd5 Mon Sep 17 00:00:00 2001
>>> From: Min Chen <chenm003 at 163.com>
>>> Date: Sat, 14 Sep 2024 14:25:28 -0700
>>> Subject: [PATCH] AArch64: Runtime CPU feature detection
>>>
>>> ---
>>> source/CMakeLists.txt | 20 ++++++++++++--------
>>> source/common/CMakeLists.txt | 5 +++++
>>> source/common/cpu.cpp | 33 +++++++++++++++++++++++++++++++++
>>> 3 files changed, 50 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>>> index 37b83f959..32a99206f 100755
>>> --- a/source/CMakeLists.txt
>>> +++ b/source/CMakeLists.txt
>>> @@ -306,7 +306,8 @@ if(GCC)
>>> if(CPU_HAS_NEON_DOTPROD)
>>> # Neon DotProd is mandatory from Armv8.4.
>>> message(STATUS "Found Neon DotProd")
>>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
>>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
>>> + set(ARM_ARGS -O3)
>>> add_definitions(-DHAVE_NEON_DOTPROD=1)
>>> endif()
>>> if(CPU_HAS_NEON_I8MM)
>>> @@ -316,7 +317,8 @@ if(GCC)
>>> if(NOT CPU_HAS_NEON_DOTPROD)
>>> message(FATAL_ERROR "Unsupported AArch64 feature
>>> combination (Neon I8MM without Neon DotProd)")
>>> endif()
>>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
>>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
>>> + set(ARM_ARGS -O3)
>>> add_definitions(-DHAVE_NEON_I8MM=1)
>>> endif()
>>> if(CPU_HAS_SVE)
>>> @@ -325,13 +327,15 @@ if(GCC)
>>> if(NOT CPU_HAS_NEON_I8MM)
>>> message(FATAL_ERROR "Unsupported AArch64 feature
>>> combination (SVE without Neon I8MM)")
>>> endif()
>>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
>>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
>>> + set(ARM_ARGS -O3)
>>> add_definitions(-DHAVE_SVE=1)
>>> endif()
>>> if(CPU_HAS_SVE2)
>>> message(STATUS "Found SVE2")
>>> # SVE2 is only available from Armv9.0, and armv9-a implies
>>> +dotprod
>>> - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
>>> + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
>>> + set(ARM_ARGS -O3)
>>> add_definitions(-DHAVE_SVE2=1)
>>> endif()
>>> set(ARM_ARGS ${ARM_ARGS} -fPIC)
>>> @@ -692,7 +696,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>> add_custom_command(
>>> OUTPUT ${ASM}.${SUFFIX}
>>> COMMAND ${CMAKE_CXX_COMPILER}
>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>> DEPENDS ${ASM_SRC})
>>> endforeach()
>>> if(CPU_HAS_SVE2)
>>> @@ -703,7 +707,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>> add_custom_command(
>>> OUTPUT ${ASM}.${SUFFIX}
>>> COMMAND ${CMAKE_CXX_COMPILER}
>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>> DEPENDS ${ASM_SRC})
>>> endforeach()
>>> endif()
>>> @@ -715,7 +719,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>> add_custom_command(
>>> OUTPUT ${ASM}.${SUFFIX}
>>> COMMAND ${CMAKE_CXX_COMPILER}
>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>> DEPENDS ${ASM_SRC})
>>> endforeach()
>>> endif()
>>> @@ -727,7 +731,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>> add_custom_command(
>>> OUTPUT ${ASM}.${SUFFIX}
>>> COMMAND ${CMAKE_CXX_COMPILER}
>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>> ${ASM}.${SUFFIX}
>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>> DEPENDS ${ASM_SRC})
>>> endforeach()
>>> endif()
>>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>>> index dc4a74107..33025cada 100644
>>> --- a/source/common/CMakeLists.txt
>>> +++ b/source/common/CMakeLists.txt
>>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
>>> CROSS_COMPILE_ARM64))
>>> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
>>> "Arm Assembly Sources that use the Neon DotProd extension")
>>> foreach(SRC ${C_SRCS_NEON})
>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
>>> ${ARM64_ARCH_ARGS} )
>>> endforeach()
>>>
>>> if(CPU_HAS_NEON_I8MM)
>>> foreach(SRC ${C_SRCS_NEON_I8MM})
>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>> endforeach()
>>> endif()
>>>
>>> if(CPU_HAS_NEON_DOTPROD)
>>> foreach(SRC ${C_SRCS_NEON_DOTPROD})
>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>> endforeach()
>>> endif()
>>>
>>> if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>>> foreach(SRC ${C_SRCS_SVE})
>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>> endforeach()
>>> endif()
>>>
>>> if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>>> foreach(SRC ${C_SRCS_SVE2})
>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>> endforeach()
>>> endif()
>>>
>>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
>>> index 61cdaadfb..24c60ff0e 100644
>>> --- a/source/common/cpu.cpp
>>> +++ b/source/common/cpu.cpp
>>> @@ -391,6 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>>>
>>> #elif X265_ARCH_ARM64
>>>
>>> +// TODO: Support ARM on Windows
>>> +#if _MSC_VER
>>> uint32_t cpu_detect(bool benableavx512)
>>> {
>>> int flags = 0;
>>> @@ -413,6 +415,37 @@ uint32_t cpu_detect(bool benableavx512)
>>>
>>> return flags;
>>> }
>>> +#else // Linux+Aarch64
>>> +
>>> +#include <asm/hwcap.h>
>>> +#include <sys/auxv.h>
>>> +
>>> +uint32_t cpu_detect(bool benableavx512)
>>> +{
>>> + unsigned long hwcaps = getauxval(AT_HWCAP);
>>> + unsigned long hwcaps2 = getauxval(AT_HWCAP2);
>>> +
>>> + int flags = 0;
>>> +
>>> + #if HAVE_NEON
>>> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
>>> + #endif
>>> + #if HAVE_NEON_DOTPROD
>>> + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
>>> + #endif
>>> + #if HAVE_NEON_I8MM
>>> + flags |= (hwcaps2 & HWCAP2_SVEI8MM ? X265_CPU_NEON_I8MM : 0);
>>> + #endif
>>> + #if HAVE_SVE
>>> + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
>>> + #endif
>>> + #if HAVE_SVE2
>>> + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
>>> + #endif
>>> +
>>> + return flags;
>>> +}
>>> +#endif // end of Linux+AArch64
>>>
>>> #elif X265_ARCH_POWER8
>>>
>>> --
>>> 2.43.0.windows.1
>>>
>>>
>>> --
>>>
>>> * <https://multicorewareinc.com/>*
>>> <https://www.linkedin.com/company/multicoreware-inc/>
>>> <https://twitter.com/MulticoreWare>
>>> <https://www.facebook.com/multicoreware>
>>> <https://www.youtube.com/channel/UCXZ1A1MzS5JwBqwBkNfsBBw?sub_confirmation=1>
>>> <https://www.instagram.com/multicoreware.inc/>
>>>
>>> *Dash Santosh*
>>>
>>> *Research Engineer, Video Engineering*
>>>
>>> Mobile: +91 78679 43737
>>>
>>> IndiQube Echo Point, Avinashi Road
>>>
>>> Coimbatore - 641 014
>>>
>>>
>>>
>>>
>>>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241003/21c2223d/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v3-0001-AArch64-Runtime-CPU-feature-detection.patch
Type: application/octet-stream
Size: 10247 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241003/21c2223d/attachment-0001.obj>
More information about the x265-devel
mailing list