[x265] [PATCH] AArch64: Runtime CPU feature detection

Karam Singh karam.singh at multicorewareinc.com
Fri Oct 4 08:27:09 UTC 2024


This patch has been pushed to the master branch.
*__________________________*
*Karam Singh*
*Ph.D. IIT Guwahati*
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089


On Thu, Oct 3, 2024 at 7:34 PM Dash Santosh <
dash.sathyanarayanan at multicorewareinc.com> wrote:

> Fixed typos and updated TODO. Please find the updated patch below:
>
> From c116db02bd50faa59c3d2b1c63bd6816d6dec2a0 Mon Sep 17 00:00:00 2001
> From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
> Date: Thu, 3 Oct 2024 05:24:16 -0700
> Subject: [PATCH] AArch64: Runtime CPU feature detection
>
> ---
>  .../make-aarch64-w64-mingw32-Makefiles.sh     |  8 ++
>  .../msys/toolchain-aarch64-w64-mingw32.cmake  |  8 ++
>  source/CMakeLists.txt                         | 21 +++--
>  source/common/CMakeLists.txt                  |  5 ++
>  source/common/cpu.cpp                         | 81 ++++++++++++++++++-
>  5 files changed, 114 insertions(+), 9 deletions(-)
>  create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>  create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
>
> diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> new file mode 100644
> index 000000000..eceffa4a9
> --- /dev/null
> +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
> @@ -0,0 +1,8 @@
> +#!/bin/sh
> +
> +# This will generate a cross-compile environment, compiling an aarch64
> +# Win64 target from a non-aarch64 MinGW host environment.  If your MinGW
> +# install is native aarch64, you can use the native compiler batch file:
> +# make-Makefiles.sh
> +
> +cmake -G "MSYS Makefiles"
> -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source &&
> cmake-gui ../../source
> diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
> b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> new file mode 100644
> index 000000000..6607bdf64
> --- /dev/null
> +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
> @@ -0,0 +1,8 @@
> +SET(CMAKE_SYSTEM_NAME Windows)
> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
> +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)
> +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)
> +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)
> +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)
> +SET(CMAKE_ASM_YASM_COMPILER yasm)
> +SET(CROSS_COMPILE_ARM64 1)
> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
> index 13bc8ccfe..d1fe38559 100755
> --- a/source/CMakeLists.txt
> +++ b/source/CMakeLists.txt
> @@ -303,10 +303,12 @@ if(GCC)
>              endif()
>          endif()
>
> +        set(ARM64_ARCH_ARGS "-O3")
>          if(CPU_HAS_NEON_DOTPROD)
>              # Neon DotProd is mandatory from Armv8.4.
>              message(STATUS "Found Neon DotProd")
> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_NEON_DOTPROD=1)
>          endif()
>          if(CPU_HAS_NEON_I8MM)
> @@ -316,7 +318,8 @@ if(GCC)
>              if(NOT CPU_HAS_NEON_DOTPROD)
>                  message(FATAL_ERROR "Unsupported AArch64 feature
> combination (Neon I8MM without Neon DotProd)")
>              endif()
> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_NEON_I8MM=1)
>          endif()
>          if(CPU_HAS_SVE)
> @@ -325,13 +328,15 @@ if(GCC)
>              if(NOT CPU_HAS_NEON_I8MM)
>                  message(FATAL_ERROR "Unsupported AArch64 feature
> combination (SVE without Neon I8MM)")
>              endif()
> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_SVE=1)
>          endif()
>          if(CPU_HAS_SVE2)
>              message(STATUS "Found SVE2")
>              # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
> -            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
> +            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
> +            set(ARM_ARGS -O3)
>              add_definitions(-DHAVE_SVE2=1)
>          endif()
>          set(ARM_ARGS ${ARM_ARGS} -fPIC)
> @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>              add_custom_command(
>                  OUTPUT ${ASM}.${SUFFIX}
>                  COMMAND ${CMAKE_CXX_COMPILER}
> -                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                  DEPENDS ${ASM_SRC})
>          endforeach()
>          if(CPU_HAS_SVE2)
> @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                  add_custom_command(
>                      OUTPUT ${ASM}.${SUFFIX}
>                      COMMAND ${CMAKE_CXX_COMPILER}
> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                      DEPENDS ${ASM_SRC})
>              endforeach()
>          endif()
> @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                  add_custom_command(
>                      OUTPUT ${ASM}.${SUFFIX}
>                      COMMAND ${CMAKE_CXX_COMPILER}
> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                      DEPENDS ${ASM_SRC})
>              endforeach()
>          endif()
> @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                  add_custom_command(
>                      OUTPUT ${ASM}.${SUFFIX}
>                      COMMAND ${CMAKE_CXX_COMPILER}
> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
> ${ASM}.${SUFFIX}
> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>                      DEPENDS ${ASM_SRC})
>              endforeach()
>          endif()
> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
> index dc4a74107..33025cada 100644
> --- a/source/common/CMakeLists.txt
> +++ b/source/common/CMakeLists.txt
> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
> CROSS_COMPILE_ARM64))
>      set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
> "Arm Assembly Sources that use the Neon DotProd extension")
>      foreach(SRC ${C_SRCS_NEON})
>          set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
> ${ARM64_ARCH_ARGS} )
>      endforeach()
>
>      if(CPU_HAS_NEON_I8MM)
>          foreach(SRC ${C_SRCS_NEON_I8MM})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
>      if(CPU_HAS_NEON_DOTPROD)
>          foreach(SRC ${C_SRCS_NEON_DOTPROD})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
>      if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>          foreach(SRC ${C_SRCS_SVE})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
>      if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>          foreach(SRC ${C_SRCS_SVE2})
>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>          endforeach()
>      endif()
>
> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
> index 61cdaadfb..2d4b15dc9 100644
> --- a/source/common/cpu.cpp
> +++ b/source/common/cpu.cpp
> @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>
>  #elif X265_ARCH_ARM64
>
> -uint32_t cpu_detect(bool benableavx512)
> +#if defined(_MSC_VER) || defined(__APPLE__)
> +uint32_t cpu_detect(bool /*benableavx512*/)
>  {
>      int flags = 0;
>
> @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
>      return flags;
>  }
>
> +// TODO: Remove isOryonCPU() once Windows defines a PF_ flag for I8MM on supported ARM64 devices
> +#elif defined(__MINGW64__) // Windows+Aarch64
> +
> +#include <windows.h>
> +#include <processthreadsapi.h>
> +
> +bool isOryonCPU()
> +{
> +
> +    char processorName[128];
> +    DWORD bufferSize = 128;
> +
> +    LONG result = RegGetValue(HKEY_LOCAL_MACHINE,
> "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
> "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName,
> &bufferSize);
> +    if (strstr(processorName, "Oryon") != NULL)
> +    {
> +        return true;
> +    }
> +    else
> +    {
> +        return false;
> +    }
> +}
> +uint32_t cpu_detect(bool /*benableavx512*/)
> +{
> +
> +    int flags = 0;
> +
> +    #if HAVE_NEON
> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
> +    #endif
> +    #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
> +         flags |=
> IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_NEON_DOTPROD : 0;
> +    #endif
> +    #if HAVE_NEON_I8MM
> +         flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
> +    #endif
> +    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
> +         flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
> : 0;
> +    #endif
> +    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
> +         flags |=
> IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
> X265_CPU_SVE2 : 0;
> +    #endif
> +
> +    return flags;
> +} // end of Windows+Aarch64
> +
> +#else // Linux+Aarch64
> +
> +#include <asm/hwcap.h>
> +#include <sys/auxv.h>
> +
> +uint32_t cpu_detect(bool /*benableavx512*/)
> +{
> +    unsigned long hwcaps = getauxval(AT_HWCAP);
> +    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
> +
> +    int flags = 0;
> +
> +    #if HAVE_NEON
> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
> +    #endif
> +    #if HAVE_NEON_DOTPROD
> +         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
> +    #endif
> +    #if HAVE_NEON_I8MM
> +         flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
> +    #endif
> +    #if HAVE_SVE
> +         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
> +    #endif
> +    #if HAVE_SVE2
> +         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
> +    #endif
> +
> +    return flags;
> +}
> +#endif // end of Linux+AArch64
> +
>  #elif X265_ARCH_POWER8
>
>  uint32_t cpu_detect(bool benableavx512)
> --
> 2.45.2
>
>
> On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
> dash.sathyanarayanan at multicorewareinc.com> wrote:
>
>> From 7d2353aaf7509721461c141f2800962c15ff440c Mon Sep 17 00:00:00 2001
>> From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
>> Date: Wed, 2 Oct 2024 21:59:59 -0700
>> Subject: [PATCH] AArch64: Runtime CPU feature detection
>>
>> ---
>>  .../make-aarch64-w64-mingw32-Makefiles.sh     |  8 ++
>>  .../msys/toolchain-aarch64-w64-mingw32.cmake  |  8 ++
>>  source/CMakeLists.txt                         | 21 +++--
>>  source/common/CMakeLists.txt                  |  5 ++
>>  source/common/cpu.cpp                         | 81 ++++++++++++++++++-
>>  5 files changed, 114 insertions(+), 9 deletions(-)
>>  create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>>  create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake
>>
>> diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> new file mode 100644
>> index 000000000..eceffa4a9
>> --- /dev/null
>> +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh
>> @@ -0,0 +1,8 @@
>> +#!/bin/sh
>> +
>> +# This will generate a cross-compile environment, compiling an aarch64
>> +# Win64 target from a 32bit MinGW32 host environment.  If your MinGW
>> +# install is 64bit, you can use the native compiler batch file:
>> +# make-Makefiles.sh
>> +
>> +cmake -G "MSYS Makefiles"
>> -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source &&
>> cmake-gui ../../source
>> diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake
>> b/build/msys/toolchain-aarch64-w64-mingw32.cmake
>> new file mode 100644
>> index 000000000..6607bdf64
>> --- /dev/null
>> +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake
>> @@ -0,0 +1,8 @@
>> +SET(CMAKE_SYSTEM_NAME Windows)
>> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
>> +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)
>> +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)
>> +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)
>> +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)
>> +SET(CMAKE_ASM_YASM_COMPILER yasm)
>> +SET(CROSS_COMPILE_ARM64 1)
>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>> index 13bc8ccfe..d1fe38559 100755
>> --- a/source/CMakeLists.txt
>> +++ b/source/CMakeLists.txt
>> @@ -303,10 +303,12 @@ if(GCC)
>>              endif()
>>          endif()
>>
>> +        set(ARM64_ARCH_ARGS "-O3")
>>          if(CPU_HAS_NEON_DOTPROD)
>>              # Neon DotProd is mandatory from Armv8.4.
>>              message(STATUS "Found Neon DotProd")
>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
>> +            set(ARM_ARGS -O3)
>>              add_definitions(-DHAVE_NEON_DOTPROD=1)
>>          endif()
>>          if(CPU_HAS_NEON_I8MM)
>> @@ -316,7 +318,8 @@ if(GCC)
>>              if(NOT CPU_HAS_NEON_DOTPROD)
>>                  message(FATAL_ERROR "Unsupported AArch64 feature
>> combination (Neon I8MM without Neon DotProd)")
>>              endif()
>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
>> +            set(ARM_ARGS -O3)
>>              add_definitions(-DHAVE_NEON_I8MM=1)
>>          endif()
>>          if(CPU_HAS_SVE)
>> @@ -325,13 +328,15 @@ if(GCC)
>>              if(NOT CPU_HAS_NEON_I8MM)
>>                  message(FATAL_ERROR "Unsupported AArch64 feature
>> combination (SVE without Neon I8MM)")
>>              endif()
>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
>> +            set(ARM_ARGS -O3)
>>              add_definitions(-DHAVE_SVE=1)
>>          endif()
>>          if(CPU_HAS_SVE2)
>>              message(STATUS "Found SVE2")
>>              # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
>> -            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
>> +            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
>> +            set(ARM_ARGS -O3)
>>              add_definitions(-DHAVE_SVE2=1)
>>          endif()
>>          set(ARM_ARGS ${ARM_ARGS} -fPIC)
>> @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>              add_custom_command(
>>                  OUTPUT ${ASM}.${SUFFIX}
>>                  COMMAND ${CMAKE_CXX_COMPILER}
>> -                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> +                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>                  DEPENDS ${ASM_SRC})
>>          endforeach()
>>          if(CPU_HAS_SVE2)
>> @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>                  add_custom_command(
>>                      OUTPUT ${ASM}.${SUFFIX}
>>                      COMMAND ${CMAKE_CXX_COMPILER}
>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>                      DEPENDS ${ASM_SRC})
>>              endforeach()
>>          endif()
>> @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>                  add_custom_command(
>>                      OUTPUT ${ASM}.${SUFFIX}
>>                      COMMAND ${CMAKE_CXX_COMPILER}
>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>                      DEPENDS ${ASM_SRC})
>>              endforeach()
>>          endif()
>> @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>                  add_custom_command(
>>                      OUTPUT ${ASM}.${SUFFIX}
>>                      COMMAND ${CMAKE_CXX_COMPILER}
>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>> ${ASM}.${SUFFIX}
>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>                      DEPENDS ${ASM_SRC})
>>              endforeach()
>>          endif()
>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>> index dc4a74107..33025cada 100644
>> --- a/source/common/CMakeLists.txt
>> +++ b/source/common/CMakeLists.txt
>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
>> CROSS_COMPILE_ARM64))
>>      set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
>> "Arm Assembly Sources that use the Neon DotProd extension")
>>      foreach(SRC ${C_SRCS_NEON})
>>          set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> +        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
>> ${ARM64_ARCH_ARGS} )
>>      endforeach()
>>
>>      if(CPU_HAS_NEON_I8MM)
>>          foreach(SRC ${C_SRCS_NEON_I8MM})
>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>          endforeach()
>>      endif()
>>
>>      if(CPU_HAS_NEON_DOTPROD)
>>          foreach(SRC ${C_SRCS_NEON_DOTPROD})
>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>          endforeach()
>>      endif()
>>
>>      if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>>          foreach(SRC ${C_SRCS_SVE})
>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>          endforeach()
>>      endif()
>>
>>      if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>>          foreach(SRC ${C_SRCS_SVE2})
>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>          endforeach()
>>      endif()
>>
>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
>> index 61cdaadfb..a2b0ac081 100644
>> --- a/source/common/cpu.cpp
>> +++ b/source/common/cpu.cpp
>> @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>>
>>  #elif X265_ARCH_ARM64
>>
>> -uint32_t cpu_detect(bool benableavx512)
>> +#if defined(_MSC_VER) || defined(__APPLE__)
>> +uint32_t cpu_detect(bool /*benableavx512*/)
>>  {
>>      int flags = 0;
>>
>> @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)
>>      return flags;
>>  }
>>
>> +// TODO: Support ARM on Windows
>> +#elif defined(__MINGW64__)
>> +
>> +#include <windows.h>
>> +#include <processthreadsapi.h>
>> +
>> +bool isOryonCPU()
>> +{
>> +
>> +    char processorName[128];
>> +    DWORD bufferSize = 128;
>> +
>> +    LONG result = RegGetValue(HKEY_LOCAL_MACHINE,
>> "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
>> "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName,
>> &bufferSize);
>> +    if (strstr(processorName, "Oryon") != NULL)
>> +    {
>> +        return true;
>> +    }
>> +    else
>> +    {
>> +        return false;
>> +    }
>> +}
>> +uint32_t cpu_detect(bool /*benableavx512*/)
>> +{
>> +
>> +    int flags = 0;
>> +
>> +    #if HAVE_NEON
>> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
>> +    #endif
>> +    #if HAVE_NEON_DOTPROD &&
>> defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
>> +         flags |=
>> IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ?
>> X265_CPU_NEON_DOTPROD : 0;
>> +    #endif
>> +    #if HAVE_NEON_I8MM
>> +         flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;
>> +    #endif
>> +    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
>> +         flags |=
>> IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE
>> : 0;
>> +    #endif
>> +    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
>> +         flags |=
>> IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ?
>> X265_CPU_SVE2 : 0;
>> +    #endif
>> +
>> +    return flags;
>> +}
>> +
>> +#else // Linux+Aarch64
>> +
>> +#include <asm/hwcap.h>
>> +#include <sys/auxv.h>
>> +
>> +uint32_t cpu_detect(bool /*benableavx5128*/)
>> +{
>> +    unsigned long hwcaps = getauxval(AT_HWCAP);
>> +    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
>> +
>> +    int flags = 0;
>> +
>> +    #if HAVE_NEON
>> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
>> +    #endif
>> +    #if HAVE_NEON_DOTPROD
>> +         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
>> +    #endif
>> +    #if HAVE_NEON_I8MM
>> +         flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);
>> +    #endif
>> +    #if HAVE_SVE
>> +         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
>> +    #endif
>> +    #if HAVE_SVE2
>> +         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
>> +    #endif
>> +
>> +    return flags;
>> +}
>> +#endif // end of Linux+AArch64
>> +
>>  #elif X265_ARCH_POWER8
>>
>>  uint32_t cpu_detect(bool benableavx512)
>> --
>> 2.45.2
>>
>>
>> On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <
>> dash.sathyanarayanan at multicorewareinc.com> wrote:
>>
>>> Hi Hari,
>>> Thanks for spotting this. Also added support for Windows on ARM. Please
>>> find below the updated patch:
>>>
>>>
>>> On Thu, Sep 26, 2024 at 11:43 AM Dash Santosh <
>>> dash.sathyanarayanan at multicorewareinc.com> wrote:
>>>
>>>> From e9614d170f93f3ad4f01e95abfed0a260f218bd5 Mon Sep 17 00:00:00 2001
>>>> From: Min Chen <chenm003 at 163.com>
>>>> Date: Sat, 14 Sep 2024 14:25:28 -0700
>>>> Subject: [PATCH] AArch64: Runtime CPU feature detection
>>>>
>>>> ---
>>>>  source/CMakeLists.txt        | 20 ++++++++++++--------
>>>>  source/common/CMakeLists.txt |  5 +++++
>>>>  source/common/cpu.cpp        | 33 +++++++++++++++++++++++++++++++++
>>>>  3 files changed, 50 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>>>> index 37b83f959..32a99206f 100755
>>>> --- a/source/CMakeLists.txt
>>>> +++ b/source/CMakeLists.txt
>>>> @@ -306,7 +306,8 @@ if(GCC)
>>>>          if(CPU_HAS_NEON_DOTPROD)
>>>>              # Neon DotProd is mandatory from Armv8.4.
>>>>              message(STATUS "Found Neon DotProd")
>>>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
>>>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)
>>>> +            set(ARM_ARGS -O3)
>>>>              add_definitions(-DHAVE_NEON_DOTPROD=1)
>>>>          endif()
>>>>          if(CPU_HAS_NEON_I8MM)
>>>> @@ -316,7 +317,8 @@ if(GCC)
>>>>              if(NOT CPU_HAS_NEON_DOTPROD)
>>>>                  message(FATAL_ERROR "Unsupported AArch64 feature
>>>> combination (Neon I8MM without Neon DotProd)")
>>>>              endif()
>>>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
>>>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)
>>>> +            set(ARM_ARGS -O3)
>>>>              add_definitions(-DHAVE_NEON_I8MM=1)
>>>>          endif()
>>>>          if(CPU_HAS_SVE)
>>>> @@ -325,13 +327,15 @@ if(GCC)
>>>>              if(NOT CPU_HAS_NEON_I8MM)
>>>>                  message(FATAL_ERROR "Unsupported AArch64 feature
>>>> combination (SVE without Neon I8MM)")
>>>>              endif()
>>>> -            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
>>>> +            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)
>>>> +            set(ARM_ARGS -O3)
>>>>              add_definitions(-DHAVE_SVE=1)
>>>>          endif()
>>>>          if(CPU_HAS_SVE2)
>>>>              message(STATUS "Found SVE2")
>>>>              # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
>>>> -            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
>>>> +            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)
>>>> +            set(ARM_ARGS -O3)
>>>>              add_definitions(-DHAVE_SVE2=1)
>>>>          endif()
>>>>          set(ARM_ARGS ${ARM_ARGS} -fPIC)
>>>> @@ -692,7 +696,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>>              add_custom_command(
>>>>                  OUTPUT ${ASM}.${SUFFIX}
>>>>                  COMMAND ${CMAKE_CXX_COMPILER}
>>>> -                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> +                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c
>>>> ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>>                  DEPENDS ${ASM_SRC})
>>>>          endforeach()
>>>>          if(CPU_HAS_SVE2)
>>>> @@ -703,7 +707,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>>                  add_custom_command(
>>>>                      OUTPUT ${ASM}.${SUFFIX}
>>>>                      COMMAND ${CMAKE_CXX_COMPILER}
>>>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS}
>>>> -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>>                      DEPENDS ${ASM_SRC})
>>>>              endforeach()
>>>>          endif()
>>>> @@ -715,7 +719,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>>                  add_custom_command(
>>>>                      OUTPUT ${ASM}.${SUFFIX}
>>>>                      COMMAND ${CMAKE_CXX_COMPILER}
>>>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS}
>>>> -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>>                      DEPENDS ${ASM_SRC})
>>>>              endforeach()
>>>>          endif()
>>>> @@ -727,7 +731,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>>>>                  add_custom_command(
>>>>                      OUTPUT ${ASM}.${SUFFIX}
>>>>                      COMMAND ${CMAKE_CXX_COMPILER}
>>>> -                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o
>>>> ${ASM}.${SUFFIX}
>>>> +                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS}
>>>> -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>>>>                      DEPENDS ${ASM_SRC})
>>>>              endforeach()
>>>>          endif()
>>>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>>>> index dc4a74107..33025cada 100644
>>>> --- a/source/common/CMakeLists.txt
>>>> +++ b/source/common/CMakeLists.txt
>>>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR
>>>> CROSS_COMPILE_ARM64))
>>>>      set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL
>>>> "Arm Assembly Sources that use the Neon DotProd extension")
>>>>      foreach(SRC ${C_SRCS_NEON})
>>>>          set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> +        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS
>>>> ${ARM64_ARCH_ARGS} )
>>>>      endforeach()
>>>>
>>>>      if(CPU_HAS_NEON_I8MM)
>>>>          foreach(SRC ${C_SRCS_NEON_I8MM})
>>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>>          endforeach()
>>>>      endif()
>>>>
>>>>      if(CPU_HAS_NEON_DOTPROD)
>>>>          foreach(SRC ${C_SRCS_NEON_DOTPROD})
>>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>>          endforeach()
>>>>      endif()
>>>>
>>>>      if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
>>>>          foreach(SRC ${C_SRCS_SVE})
>>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>>          endforeach()
>>>>      endif()
>>>>
>>>>      if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
>>>>          foreach(SRC ${C_SRCS_SVE2})
>>>>              set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>>>> +            set_source_files_properties( aarch64/${SRC} PROPERTIES
>>>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} )
>>>>          endforeach()
>>>>      endif()
>>>>
>>>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
>>>> index 61cdaadfb..24c60ff0e 100644
>>>> --- a/source/common/cpu.cpp
>>>> +++ b/source/common/cpu.cpp
>>>> @@ -391,6 +391,8 @@ uint32_t cpu_detect(bool benableavx512)
>>>>
>>>>  #elif X265_ARCH_ARM64
>>>>
>>>> +// TODO: Support ARM on Windows
>>>> +#if _MSC_VER
>>>>  uint32_t cpu_detect(bool benableavx512)
>>>>  {
>>>>      int flags = 0;
>>>> @@ -413,6 +415,37 @@ uint32_t cpu_detect(bool benableavx512)
>>>>
>>>>      return flags;
>>>>  }
>>>> +#else // Linux+Aarch64
>>>> +
>>>> +#include <asm/hwcap.h>
>>>> +#include <sys/auxv.h>
>>>> +
>>>> +uint32_t cpu_detect(bool benableavx512)
>>>> +{
>>>> +    unsigned long hwcaps = getauxval(AT_HWCAP);
>>>> +    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
>>>> +
>>>> +    int flags = 0;
>>>> +
>>>> +    #if HAVE_NEON
>>>> +         flags |= X265_CPU_NEON;    // All of ARM64 has NEON
>>>> +    #endif
>>>> +    #if HAVE_NEON_DOTPROD
>>>> +         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);
>>>> +    #endif
>>>> +    #if HAVE_NEON_I8MM
>>>> +         flags |= (hwcaps2 & HWCAP2_SVEI8MM ? X265_CPU_NEON_I8MM : 0);
>>>> +    #endif
>>>> +    #if HAVE_SVE
>>>> +         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);
>>>> +    #endif
>>>> +    #if HAVE_SVE2
>>>> +         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);
>>>> +    #endif
>>>> +
>>>> +    return flags;
>>>> +}
>>>> +#endif // end of Linux+AArch64
>>>>
>>>>  #elif X265_ARCH_POWER8
>>>>
>>>> --
>>>> 2.43.0.windows.1
>>>>
>>>>
>>>> --
>>>>
>>>> * <https://multicorewareinc.com/>*
>>>>   <https://www.linkedin.com/company/multicoreware-inc/>
>>>> <https://twitter.com/MulticoreWare>
>>>> <https://www.facebook.com/multicoreware>
>>>> <https://www.youtube.com/channel/UCXZ1A1MzS5JwBqwBkNfsBBw?sub_confirmation=1>
>>>>    <https://www.instagram.com/multicoreware.inc/>
>>>>
>>>> *Dash Santosh*
>>>>
>>>> *Research Engineer, Video Engineering*
>>>>
>>>> Mobile: +91 78679 43737
>>>>
>>>> IndiQube Echo Point, Avinashi Road
>>>>
>>>> Coimbatore - 641 014
>>>>
>>>>
>>>>
>>>>
>>>> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241004/13c6c031/attachment-0001.htm>


More information about the x265-devel mailing list