[x265] [PATCH] AArch64: Runtime CPU feature detection
Karam Singh
karam.singh at multicorewareinc.com
Fri Oct 4 08:27:09 UTC 2024
This patch has been pushed to the master branch.
*__________________________*
*Karam Singh*
*Ph.D. IIT Guwahati*
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089
On Thu, Oct 3, 2024 at 7:34 PM Dash Santosh <
dash.sathyanarayanan at multicorewareinc.com> wrote:
> Fixed the typos and updated the TODO comment. Please find the updated patch below:
>
> From c116db02bd50faa59c3d2b1c63bd6816d6dec2a0 Mon Sep 17 00:00:00 2001
> From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
> Date: Thu, 3 Oct 2024 05:24:16 -0700
> Subject: [PATCH] AArch64: Runtime CPU feature detection
>
> ---
> .../make-aarch64-w64-mingw32-Makefiles.sh | 8 ++
> .../msys/toolchain-aarch64-w64-mingw32.cmake | 8 ++
> source/CMakeLists.txt | 21 +++--
> source/common/CMakeLists.txt | 5 ++
> source/common/cpu.cpp | 81 ++++++++++++++++++-
> 5 files changed, 114 insertions(+), 9 deletions(-)
> create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
>
> diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> new file mode 100644
> index 000000000..eceffa4a9
> --- /dev/null
> +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> @@ -0,0 +1,8 @@
> +#!/bin/sh
> +
> +# This will generate a cross-compile environment, compiling an aarch64
> +# Win64 target from a 32bit MinGW32 host environment. If your MinGW
> +# install is 64bit, you can use the native compiler batch file:
> +# make-Makefiles.sh
> +
> +cmake -G "MSYS Makefiles"
> -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source &&
> cmake-gui ../../source
> diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
> b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> new file mode 100644
> index 000000000..6607bdf64
> --- /dev/null
> +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> @@ -0,0 +1,8 @@
> +SET(CMAKE_SYSTEM_NAME Windows)
> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
> +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)
> +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)
> +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)
> +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)
> +SET(CMAKE_ASM_YASM_COMPILER yasm)
> +SET(CROSS_COMPILE_ARM64 1)
> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
> index 13bc8ccfe..d1fe38559 100755
> --- a/source/CMakeLists.txt
> +++ b/source/CMakeLists.txt
> @@ -303,10 +303,12 @@ if(GCC)
> endif()
> endif()
>
> + set(ARM64_ARCH_ARGS "-O3")
> if(CPU_HAS_NEON_DOTPROD)
> # Neon DotProd is mandatory from Armv8.4.
> message(STATUS "Found Neon DotProd")
> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_NEON_DOTPROD=1)
> endif()
> if(CPU_HAS_NEON_I8MM)
> @@ -316,7 +318,8 @@ if(GCC)
> if(NOT CPU_HAS_NEON_DOTPROD)
> message(FATAL_ERROR "Unsupported AArch64 feature
> combination (Neon I8MM without Neon DotProd)")
> endif()
> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_NEON_I8MM=1)
> endif()
> if(CPU_HAS_SVE)
> @@ -325,13 +328,15 @@ if(GCC)
> if(NOT CPU_HAS_NEON_I8MM)
> message(FATAL_ERROR "Unsupported AArch64 feature
> combination (SVE without Neon I8MM)")
> endif()
> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_SVE=1)
> endif()
> if(CPU_HAS_SVE2)
> message(STATUS "Found SVE2")
> # SVE2 is only available from Armv9.0, and armv9-a implies
> +dotprod
> - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
> + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
> + set(ARM_ARGS -O3)
> add_definitions(-DHAVE_SVE2=1)
> endif()
> set(ARM_ARGS ${ARM_ARGS} -fPIC)
> @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> if(CPU_HAS_SVE2)
> @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
> @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
> @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
> add_custom_command(
> OUTPUT ${ASM}.${SUFFIX}
> COMMAND ${CMAKE_CXX_COMPILER}
> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
> DEPENDS ${ASM_SRC})
> endforeach()
> endif()
> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
> index dc4a74107..33025cada 100644
> --- a/source/common/CMakeLists.txt
> +++ b/source/common/CMakeLists.txt
> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
> CROSS_COMPILE_ARM64))
> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
> "Arm Assembly Sources that use the Neon DotProd extension")
> foreach(SRC ${C_SRCS_NEON})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
> ${ARM64_ARCH_ARGS} )
> endforeach()
>
> if(CPU_HAS_NEON_I8MM)
> foreach(SRC ${C_SRCS_NEON_I8MM})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> if(CPU_HAS_NEON_DOTPROD)
> foreach(SRC ${C_SRCS_NEON_DOTPROD})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
> foreach(SRC ${C_SRCS_SVE})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
> foreach(SRC ${C_SRCS_SVE2})
> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> + set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
> endforeach()
> endif()
>
> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
> index 61cdaadfb..2d4b15dc9 100644
> --- a/source/common/cpu.cpp
> +++ b/source/common/cpu.cpp
> @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>
> #elif X265_ARCH_ARM64
>
> -uint32_t cpu_detect(bool benableavx512)
> +#if defined(_MSC_VER) || defined(__APPLE__)
> +uint32_t cpu_detect(bool /*benableavx512*/)
> {
> int flags = 0;
>
> @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
> return flags;
> }
>
> +// TODO: Remove isOryonCPU() once Windows defines PF_ flag for I8MM on
> supported ARM64 devices
> +#elif defined(__MINGW64__) // Windows+Aarch64
> +
> +#include <windows.h>
> +#include <processthreadsapi.h>
> +
> +bool isOryonCPU()
> +{
> +
> + char processorName[128];
> + DWORD bufferSize = 128;
> +
> + LONG result = RegGetValue(HKEY_LOCAL_MACHINE,
> "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
> "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName,
> &bufferSize);
> + if (strstr(processorName, "Oryon") != NULL)
> + {
> + return true;
> + }
> + else
> + {
> + return false;
> + }
> +}
> +uint32_t cpu_detect(bool /*benableavx512*/)
> +{
> +
> + int flags = 0;
> +
> + #if HAVE_NEON
> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
> + #endif
> + #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
> + flags |=
> IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_NEON_DOTPROD : 0;
> + #endif
> + #if HAVE_NEON_I8MM
> + flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
> + #endif
> + #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
> + flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
> : 0;
> + #endif
> + #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
> + flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_SVE2 : 0;
> + #endif
> +
> + return flags;
> +} // end of Windows+Aarch64
> +
> +#else // Linux+Aarch64
> +
> +#include <asm/hwcap.h>
> +#include <sys/auxv.h>
> +
> +uint32_t cpu_detect(bool /*benableavx512*/)
> +{
> + unsigned long hwcaps = getauxval(AT_HWCAP);
> + unsigned long hwcaps2 = getauxval(AT_HWCAP2);
> +
> + int flags = 0;
> +
> + #if HAVE_NEON
> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
> + #endif
> + #if HAVE_NEON_DOTPROD
> + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
> + #endif
> + #if HAVE_NEON_I8MM
> + flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
> + #endif
> + #if HAVE_SVE
> + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
> + #endif
> + #if HAVE_SVE2
> + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
> + #endif
> +
> + return flags;
> +}
> +#endif // end of Linux+AArch64
> +
> #elif X265_ARCH_POWER8
>
> uint32_t cpu_detect(bool benableavx512)
> --
> 2.45.2
>
>
> On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
> dash.sathyanarayanan at multicorewareinc.com> wrote:
>
>> From 7d2353aaf7509721461c141f2800962c15ff440c Mon Sep 17 00:00:00 2001
>> From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
>> Date: Wed, 2 Oct 2024 21:59:59 -0700
>> Subject: [PATCH] AArch64: Runtime CPU feature detection
>>
>> ---
>> .../make-aarch64-w64-mingw32-Makefiles.sh | 8 ++
>> .../msys/toolchain-aarch64-w64-mingw32.cmake | 8 ++
>> source/CMakeLists.txt | 21 +++--
>> source/common/CMakeLists.txt | 5 ++
>> source/common/cpu.cpp | 81 ++++++++++++++++++-
>> 5 files changed, 114 insertions(+), 9 deletions(-)
>> create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
>>
>> diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> new file mode 100644
>> index 000000000..eceffa4a9
>> --- /dev/null
>> +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> @@ -0,0 +1,8 @@
>> +#!/bin/sh
>> +
>> +# This will generate a cross-compile environment, compiling an aarch64
>> +# Win64 target from a 32bit MinGW32 host environment. If your MinGW
>> +# install is 64bit, you can use the native compiler batch file:
>> +# make-Makefiles.sh
>> +
>> +cmake -G "MSYS Makefiles"
>> -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source &&
>> cmake-gui ../../source
>> diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
>> b/build/msys/toolchain-aarch64-w64-mingw32.cmake
>> new file mode 100644
>> index 000000000..6607bdf64
>> --- /dev/null
>> +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
>> @@ -0,0 +1,8 @@
>> +SET(CMAKE_SYSTEM_NAME Windows)
>> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
>> +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)
>> +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)
>> +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)
>> +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)
>> +SET(CMAKE_ASM_YASM_COMPILER yasm)
>> +SET(CROSS_COMPILE_ARM64 1)
>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>> index 13bc8ccfe..d1fe38559 100755
>> --- a/source/CMakeLists.txt
>> +++ b/source/CMakeLists.txt
>> @@ -303,10 +303,12 @@ if(GCC)
>> endif()
>> endif()
>>
>> + set(ARM64_ARCH_ARGS "-O3")
>> if(CPU_HAS_NEON_DOTPROD)
>> # Neon DotProd is mandatory from Armv8.4.
>> message(STATUS "Found Neon DotProd")
>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
>> + set(ARM_ARGS -O3)
>> add_definitions(-DHAVE_NEON_DOTPROD=1)
>> endif()
>> if(CPU_HAS_NEON_I8MM)
>> @@ -316,7 +318,8 @@ if(GCC)
>> if(NOT CPU_HAS_NEON_DOTPROD)
>> message(FATAL_ERROR "Unsupported AArch64 feature
>> combination (Neon I8MM without Neon DotProd)")
>> endif()
>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
>> + set(ARM_ARGS -O3)
>> add_definitions(-DHAVE_NEON_I8MM=1)
>> endif()
>> if(CPU_HAS_SVE)
>> @@ -325,13 +328,15 @@ if(GCC)
>> if(NOT CPU_HAS_NEON_I8MM)
>> message(FATAL_ERROR "Unsupported AArch64 feature
>> combination (SVE without Neon I8MM)")
>> endif()
>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
>> + set(ARM_ARGS -O3)
>> add_definitions(-DHAVE_SVE=1)
>> endif()
>> if(CPU_HAS_SVE2)
>> message(STATUS "Found SVE2")
>> # SVE2 is only available from Armv9.0, and armv9-a implies
>> +dotprod
>> - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
>> + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
>> + set(ARM_ARGS -O3)
>> add_definitions(-DHAVE_SVE2=1)
>> endif()
>> set(ARM_ARGS ${ARM_ARGS} -fPIC)
>> @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>> add_custom_command(
>> OUTPUT ${ASM}.${SUFFIX}
>> COMMAND ${CMAKE_CXX_COMPILER}
>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>> DEPENDS ${ASM_SRC})
>> endforeach()
>> if(CPU_HAS_SVE2)
>> @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>> add_custom_command(
>> OUTPUT ${ASM}.${SUFFIX}
>> COMMAND ${CMAKE_CXX_COMPILER}
>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>> DEPENDS ${ASM_SRC})
>> endforeach()
>> endif()
>> @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>> add_custom_command(
>> OUTPUT ${ASM}.${SUFFIX}
>> COMMAND ${CMAKE_CXX_COMPILER}
>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>> DEPENDS ${ASM_SRC})
>> endforeach()
>> endif()
>> @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>> add_custom_command(
>> OUTPUT ${ASM}.${SUFFIX}
>> COMMAND ${CMAKE_CXX_COMPILER}
>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>> DEPENDS ${ASM_SRC})
>> endforeach()
>> endif()
>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>> index dc4a74107..33025cada 100644
>> --- a/source/common/CMakeLists.txt
>> +++ b/source/common/CMakeLists.txt
>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
>> CROSS_COMPILE_ARM64))
>> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
>> "Arm Assembly Sources that use the Neon DotProd extension")
>> foreach(SRC ${C_SRCS_NEON})
>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
>> ${ARM64_ARCH_ARGS} )
>> endforeach()
>>
>> if(CPU_HAS_NEON_I8MM)
>> foreach(SRC ${C_SRCS_NEON_I8MM})
>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>> endforeach()
>> endif()
>>
>> if(CPU_HAS_NEON_DOTPROD)
>> foreach(SRC ${C_SRCS_NEON_DOTPROD})
>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>> endforeach()
>> endif()
>>
>> if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>> foreach(SRC ${C_SRCS_SVE})
>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>> endforeach()
>> endif()
>>
>> if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>> foreach(SRC ${C_SRCS_SVE2})
>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>> endforeach()
>> endif()
>>
>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
>> index 61cdaadfb..a2b0ac081 100644
>> --- a/source/common/cpu.cpp
>> +++ b/source/common/cpu.cpp
>> @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>>
>> #elif X265_ARCH_ARM64
>>
>> -uint32_t cpu_detect(bool benableavx512)
>> +#if defined(_MSC_VER) || defined(__APPLE__)
>> +uint32_t cpu_detect(bool /*benableavx512*/)
>> {
>> int flags = 0;
>>
>> @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
>> return flags;
>> }
>>
>> +// TODO: Support ARM on Windows
>> +#elif defined(__MINGW64__)
>> +
>> +#include <windows.h>
>> +#include <processthreadsapi.h>
>> +
>> +bool isOryonCPU()
>> +{
>> +
>> + char processorName[128];
>> + DWORD bufferSize = 128;
>> +
>> + LONG result = RegGetValue(HKEY_LOCAL_MACHINE,
>> "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
>> "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName,
>> &bufferSize);
>> + if (strstr(processorName, "Oryon") != NULL)
>> + {
>> + return true;
>> + }
>> + else
>> + {
>> + return false;
>> + }
>> +}
>> +uint32_t cpu_detect(bool /*benableavx512*/)
>> +{
>> +
>> + int flags = 0;
>> +
>> + #if HAVE_NEON
>> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
>> + #endif
>> + #if HAVE_NEON_DOTPROD &&
>> defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
>> + flags |=
>> IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
>> X265_CPU_NEON_DOTPROD : 0;
>> + #endif
>> + #if HAVE_NEON_I8MM
>> + flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
>> + #endif
>> + #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
>> + flags |=
>> IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
>> : 0;
>> + #endif
>> + #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
>> + flags |=
>> IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
>> X265_CPU_SVE2 : 0;
>> + #endif
>> +
>> + return flags;
>> +}
>> +
>> +#else // Linux+Aarch64
>> +
>> +#include <asm/hwcap.h>
>> +#include <sys/auxv.h>
>> +
>> +uint32_t cpu_detect(bool /*benableavx5128*/)
>> +{
>> + unsigned long hwcaps = getauxval(AT_HWCAP);
>> + unsigned long hwcaps2 = getauxval(AT_HWCAP2);
>> +
>> + int flags = 0;
>> +
>> + #if HAVE_NEON
>> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
>> + #endif
>> + #if HAVE_NEON_DOTPROD
>> + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
>> + #endif
>> + #if HAVE_NEON_I8MM
>> + flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
>> + #endif
>> + #if HAVE_SVE
>> + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
>> + #endif
>> + #if HAVE_SVE2
>> + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
>> + #endif
>> +
>> + return flags;
>> +}
>> +#endif // end of Linux+AArch64
>> +
>> #elif X265_ARCH_POWER8
>>
>> uint32_t cpu_detect(bool benableavx512)
>> --
>> 2.45.2
>>
>>
>> On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
>> dash.sathyanarayanan at multicorewareinc.com> wrote:
>>
>>> Hi Hari,
>>> Thanks for spotting this. I have also added support for Windows on ARM.
>>> Please find the updated patch below:
>>>
>>>
>>> On Thu, Sep 26, 2024 at 11:43 AM Dash Santosh <
>>> dash.sathyanarayanan at multicorewareinc.com> wrote:
>>>
>>>> From e9614d170f93f3ad4f01e95abfed0a260f218bd5 Mon Sep 17 00:00:00 2001
>>>> From: Min Chen <chenm003 at 163.com>
>>>> Date: Sat, 14 Sep 2024 14:25:28 -0700
>>>> Subject: [PATCH] AArch64: Runtime CPU feature detection
>>>>
>>>> ---
>>>> source/CMakeLists.txt | 20 ++++++++++++--------
>>>> source/common/CMakeLists.txt | 5 +++++
>>>> source/common/cpu.cpp | 33 +++++++++++++++++++++++++++++++++
>>>> 3 files changed, 50 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>>>> index 37b83f959..32a99206f 100755
>>>> --- a/source/CMakeLists.txt
>>>> +++ b/source/CMakeLists.txt
>>>> @@ -306,7 +306,8 @@ if(GCC)
>>>> if(CPU_HAS_NEON_DOTPROD)
>>>> # Neon DotProd is mandatory from Armv8.4.
>>>> message(STATUS "Found Neon DotProd")
>>>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
>>>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
>>>> + set(ARM_ARGS -O3)
>>>> add_definitions(-DHAVE_NEON_DOTPROD=1)
>>>> endif()
>>>> if(CPU_HAS_NEON_I8MM)
>>>> @@ -316,7 +317,8 @@ if(GCC)
>>>> if(NOT CPU_HAS_NEON_DOTPROD)
>>>> message(FATAL_ERROR "Unsupported AArch64 feature
>>>> combination (Neon I8MM without Neon DotProd)")
>>>> endif()
>>>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
>>>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
>>>> + set(ARM_ARGS -O3)
>>>> add_definitions(-DHAVE_NEON_I8MM=1)
>>>> endif()
>>>> if(CPU_HAS_SVE)
>>>> @@ -325,13 +327,15 @@ if(GCC)
>>>> if(NOT CPU_HAS_NEON_I8MM)
>>>> message(FATAL_ERROR "Unsupported AArch64 feature
>>>> combination (SVE without Neon I8MM)")
>>>> endif()
>>>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
>>>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
>>>> + set(ARM_ARGS -O3)
>>>> add_definitions(-DHAVE_SVE=1)
>>>> endif()
>>>> if(CPU_HAS_SVE2)
>>>> message(STATUS "Found SVE2")
>>>> # SVE2 is only available from Armv9.0, and armv9-a implies
>>>> +dotprod
>>>> - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
>>>> + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
>>>> + set(ARM_ARGS -O3)
>>>> add_definitions(-DHAVE_SVE2=1)
>>>> endif()
>>>> set(ARM_ARGS ${ARM_ARGS} -fPIC)
>>>> @@ -692,7 +696,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>> add_custom_command(
>>>> OUTPUT ${ASM}.${SUFFIX}
>>>> COMMAND ${CMAKE_CXX_COMPILER}
>>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>> DEPENDS ${ASM_SRC})
>>>> endforeach()
>>>> if(CPU_HAS_SVE2)
>>>> @@ -703,7 +707,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>> add_custom_command(
>>>> OUTPUT ${ASM}.${SUFFIX}
>>>> COMMAND ${CMAKE_CXX_COMPILER}
>>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS}
>>>> -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>> DEPENDS ${ASM_SRC})
>>>> endforeach()
>>>> endif()
>>>> @@ -715,7 +719,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>> add_custom_command(
>>>> OUTPUT ${ASM}.${SUFFIX}
>>>> COMMAND ${CMAKE_CXX_COMPILER}
>>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS}
>>>> -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>> DEPENDS ${ASM_SRC})
>>>> endforeach()
>>>> endif()
>>>> @@ -727,7 +731,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>> add_custom_command(
>>>> OUTPUT ${ASM}.${SUFFIX}
>>>> COMMAND ${CMAKE_CXX_COMPILER}
>>>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS}
>>>> -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>> DEPENDS ${ASM_SRC})
>>>> endforeach()
>>>> endif()
>>>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>>>> index dc4a74107..33025cada 100644
>>>> --- a/source/common/CMakeLists.txt
>>>> +++ b/source/common/CMakeLists.txt
>>>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
>>>> CROSS_COMPILE_ARM64))
>>>> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
>>>> "Arm Assembly Sources that use the Neon DotProd extension")
>>>> foreach(SRC ${C_SRCS_NEON})
>>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
>>>> ${ARM64_ARCH_ARGS} )
>>>> endforeach()
>>>>
>>>> if(CPU_HAS_NEON_I8MM)
>>>> foreach(SRC ${C_SRCS_NEON_I8MM})
>>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>> endforeach()
>>>> endif()
>>>>
>>>> if(CPU_HAS_NEON_DOTPROD)
>>>> foreach(SRC ${C_SRCS_NEON_DOTPROD})
>>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>> endforeach()
>>>> endif()
>>>>
>>>> if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>>>> foreach(SRC ${C_SRCS_SVE})
>>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>> endforeach()
>>>> endif()
>>>>
>>>> if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>>>> foreach(SRC ${C_SRCS_SVE2})
>>>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> + set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>> endforeach()
>>>> endif()
>>>>
>>>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
>>>> index 61cdaadfb..24c60ff0e 100644
>>>> --- a/source/common/cpu.cpp
>>>> +++ b/source/common/cpu.cpp
>>>> @@ -391,6 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>>>>
>>>> #elif X265_ARCH_ARM64
>>>>
>>>> +// TODO: Support ARM on Windows
>>>> +#if _MSC_VER
>>>> uint32_t cpu_detect(bool benableavx512)
>>>> {
>>>> int flags = 0;
>>>> @@ -413,6 +415,37 @@ uint32_t cpu_detect(bool benableavx512)
>>>>
>>>> return flags;
>>>> }
>>>> +#else // Linux+Aarch64
>>>> +
>>>> +#include <asm/hwcap.h>
>>>> +#include <sys/auxv.h>
>>>> +
>>>> +uint32_t cpu_detect(bool benableavx512)
>>>> +{
>>>> + unsigned long hwcaps = getauxval(AT_HWCAP);
>>>> + unsigned long hwcaps2 = getauxval(AT_HWCAP2);
>>>> +
>>>> + int flags = 0;
>>>> +
>>>> + #if HAVE_NEON
>>>> + flags |= X265_CPU_NEON; // All of ARM64 has NEON
>>>> + #endif
>>>> + #if HAVE_NEON_DOTPROD
>>>> + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
>>>> + #endif
>>>> + #if HAVE_NEON_I8MM
>>>> + flags |= (hwcaps2 & HWCAP2_SVEI8MM ? X265_CPU_NEON_I8MM : 0);
>>>> + #endif
>>>> + #if HAVE_SVE
>>>> + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
>>>> + #endif
>>>> + #if HAVE_SVE2
>>>> + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
>>>> + #endif
>>>> +
>>>> + return flags;
>>>> +}
>>>> +#endif // end of Linux+AArch64
>>>>
>>>> #elif X265_ARCH_POWER8
>>>>
>>>> --
>>>> 2.43.0.windows.1
>>>>
>>>>
>>>> --
>>>>
>>>> * <https://multicorewareinc.com/>*
>>>> <https://www.linkedin.com/company/multicoreware-inc/>
>>>> <https://twitter.com/MulticoreWare>
>>>> <https://www.facebook.com/multicoreware>
>>>> <https://www.youtube.com/channel/UCXZ1A1MzS5JwBqwBkNfsBBw?sub_confirmation=1>
>>>> <https://www.instagram.com/multicoreware.inc/>
>>>>
>>>> *Dash Santosh*
>>>>
>>>> *Research Engineer, Video Engineering*
>>>>
>>>> Mobile: +91 78679 43737
>>>>
>>>> IndiQube Echo Point, Avinashi Road
>>>>
>>>> Coimbatore - 641 014
>>>>
>>>>
>>>>
>>>>
>>>> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241004/13c6c031/attachment-0001.htm>
More information about the x265-devel mailing list