<div dir="ltr">Fixed typos and updated TODO. Please find the updated patch below:<div><br></div><div>From c116db02bd50faa59c3d2b1c63bd6816d6dec2a0 Mon Sep 17 00:00:00 2001<br>From: Logaprakash Ramajayam <<a href="mailto:logaprakash.ramajayam@multicorewareinc.com">logaprakash.ramajayam@multicorewareinc.com</a>><br>Date: Thu, 3 Oct 2024 05:24:16 -0700<br>Subject: [PATCH] AArch64: Runtime CPU feature detection<br><br>---<br> .../make-aarch64-w64-mingw32-Makefiles.sh     |  8 ++<br> .../msys/toolchain-aarch64-w64-mingw32.cmake  |  8 ++<br> source/CMakeLists.txt                         | 21 +++--<br> source/common/CMakeLists.txt                  |  5 ++<br> source/common/cpu.cpp                         | 81 ++++++++++++++++++-<br> 5 files changed, 114 insertions(+), 9 deletions(-)<br> create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh<br> create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake<br><br>diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh<br>new file mode 100644<br>index 000000000..eceffa4a9<br>--- /dev/null<br>+++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh<br>@@ -0,0 +1,8 @@<br>+#!/bin/sh<br>+<br>+# This will generate a cross-compile environment, compiling an aarch64<br>+# Win64 target from a 32bit MinGW32 host environment.  If your MinGW<br>+# install is 64bit, you can use the native compiler batch file:<br>+# make-Makefiles.sh<br>+<br>+cmake -G "MSYS Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source && cmake-gui ../../source<br>diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake b/build/msys/toolchain-aarch64-w64-mingw32.cmake<br>new file mode 100644<br>index 000000000..6607bdf64<br>--- /dev/null<br>+++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake<br>@@ -0,0 +1,8 @@<br>+SET(CMAKE_SYSTEM_NAME Windows)<br>+set(CMAKE_SYSTEM_PROCESSOR aarch64)<br>+SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)<br>+SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)<br>+SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)<br>+SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)<br>+SET(CMAKE_ASM_YASM_COMPILER yasm)<br>+SET(CROSS_COMPILE_ARM64 1)<br>diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt<br>index 13bc8ccfe..d1fe38559 100755<br>--- a/source/CMakeLists.txt<br>+++ b/source/CMakeLists.txt<br>@@ -303,10 +303,12 @@ if(GCC)<br>             endif()<br>         endif()<br> <br>+        set(ARM64_ARCH_ARGS "-O3")<br>         if(CPU_HAS_NEON_DOTPROD)<br>             # Neon DotProd is mandatory from Armv8.4.<br>             message(STATUS "Found Neon DotProd")<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_NEON_DOTPROD=1)<br>         endif()<br>         if(CPU_HAS_NEON_I8MM)<br>@@ -316,7 +318,8 @@ if(GCC)<br>             if(NOT CPU_HAS_NEON_DOTPROD)<br>                 message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)")<br>             endif()<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_NEON_I8MM=1)<br>         endif()<br>         if(CPU_HAS_SVE)<br>@@ -325,13 +328,15 @@ if(GCC)<br>             if(NOT CPU_HAS_NEON_I8MM)<br>                 message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)")<br>             endif()<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_SVE=1)<br>         endif()<br>         if(CPU_HAS_SVE2)<br>             message(STATUS "Found SVE2")<br>             # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod<br>-            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)<br>+            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_SVE2=1)<br>         endif()<br>         set(ARM_ARGS ${ARM_ARGS} -fPIC)<br>@@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>             add_custom_command(<br>                 OUTPUT ${ASM}.${SUFFIX}<br>                 COMMAND ${CMAKE_CXX_COMPILER}<br>-                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                 DEPENDS ${ASM_SRC})<br>         endforeach()<br>         if(CPU_HAS_SVE2)<br>@@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>@@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>@@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt<br>index dc4a74107..33025cada 100644<br>--- a/source/common/CMakeLists.txt<br>+++ b/source/common/CMakeLists.txt<br>@@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))<br>     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")<br>     foreach(SRC ${C_SRCS_NEON})<br>         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>     endforeach()<br> <br>     if(CPU_HAS_NEON_I8MM)<br>         foreach(SRC ${C_SRCS_NEON_I8MM})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_NEON_DOTPROD)<br>         foreach(SRC ${C_SRCS_NEON_DOTPROD})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)<br>         foreach(SRC ${C_SRCS_SVE})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)<br>         foreach(SRC ${C_SRCS_SVE2})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp<br>index 61cdaadfb..2d4b15dc9 100644<br>--- a/source/common/cpu.cpp<br>+++ b/source/common/cpu.cpp<br>@@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)<br> <br> #elif X265_ARCH_ARM64<br> <br>-uint32_t cpu_detect(bool benableavx512)<br>+#if defined(_MSC_VER) || defined(__APPLE__)<br>+uint32_t cpu_detect(bool /*benableavx512*/)<br> {<br>     int flags = 0;<br> <br>@@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)<br>     return flags;<br> }<br> <br>+// TODO: Remove isOryonCPU() once Windows defines PF_ flag for I8MM on supported ARM64 devices<br>+#elif defined(__MINGW64__) // Windows+Aarch64<br>+<br>+#include <windows.h><br>+#include <processthreadsapi.h><br>+<br>+bool isOryonCPU()<br>+{<br>+<br>+    char processorName[128];<br>+    DWORD bufferSize = 128;<br>+<br>+    LONG result = RegGetValue(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName, &bufferSize);<br>+    if (strstr(processorName, "Oryon") != NULL)<br>+    {<br>+        return true;<br>+    }<br>+    else<br>+    {<br>+        return false;<br>+    }<br>+}<br>+uint32_t cpu_detect(bool /*benableavx512*/)<br>+{<br>+    <br>+    int flags = 0;<br>+<br>+    #if HAVE_NEON<br>+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON<br>+    #endif<br>+    #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)<br>+         flags |= IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ? X265_CPU_NEON_DOTPROD : 0;<br>+    #endif<br>+    #if HAVE_NEON_I8MM<br>+         flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;<br>+    #endif<br>+    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)<br>+         flags |= IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE : 0;<br>+    #endif<br>+    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)<br>+         flags |= IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE2 : 0;<br>+    #endif<br>+<br>+    return flags;<br>+} // end of Windows+Aarch64<br>+<br>+#else // Linux+Aarch64<br>+<br>+#include <asm/hwcap.h><br>+#include <sys/auxv.h><br>+<br>+uint32_t cpu_detect(bool /*benableavx512*/)<br>+{<br>+    unsigned long hwcaps = getauxval(AT_HWCAP);<br>+    unsigned long hwcaps2 = getauxval(AT_HWCAP2);<br>+<br>+    int flags = 0;<br>+<br>+    #if HAVE_NEON<br>+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON<br>+    #endif<br>+    #if HAVE_NEON_DOTPROD<br>+         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);<br>+    #endif<br>+    #if HAVE_NEON_I8MM<br>+         flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);<br>+    #endif<br>+    #if HAVE_SVE<br>+         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);<br>+    #endif<br>+    #if HAVE_SVE2<br>+         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);<br>+    #endif<br>+<br>+    return flags;<br>+}<br>+#endif // end of Linux+AArch64<br>+<br> #elif X265_ARCH_POWER8<br> <br> uint32_t cpu_detect(bool benableavx512)<br>-- <br>2.45.2<br><br></div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <<a href="mailto:dash.sathyanarayanan@multicorewareinc.com">dash.sathyanarayanan@multicorewareinc.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div dir="ltr">From 7d2353aaf7509721461c141f2800962c15ff440c Mon Sep 17 00:00:00 2001<br>From: Logaprakash Ramajayam <<a href="mailto:logaprakash.ramajayam@multicorewareinc.com" target="_blank">logaprakash.ramajayam@multicorewareinc.com</a>><br>Date: Wed, 2 Oct 2024 21:59:59 -0700<br>Subject: [PATCH] AArch64: Runtime CPU feature detection <br><br>---<br> .../make-aarch64-w64-mingw32-Makefiles.sh     |  8 ++<br> .../msys/toolchain-aarch64-w64-mingw32.cmake  |  8 ++<br> source/CMakeLists.txt                         | 21 +++--<br> source/common/CMakeLists.txt                  |  5 ++<br> source/common/cpu.cpp                         | 81 ++++++++++++++++++-<br> 5 files changed, 114 insertions(+), 9 deletions(-)<br> create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh<br> create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake<br><br>diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh<br>new file mode 100644<br>index 000000000..eceffa4a9<br>--- /dev/null<br>+++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh<br>@@ -0,0 +1,8 @@<br>+#!/bin/sh<br>+<br>+# This will generate a cross-compile environment, compiling an aarch64<br>+# Win64 target from a 32bit MinGW32 host environment.  If your MinGW<br>+# install is 64bit, you can use the native compiler batch file:<br>+# make-Makefiles.sh<br>+<br>+cmake -G "MSYS Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source && cmake-gui ../../source<br>diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake b/build/msys/toolchain-aarch64-w64-mingw32.cmake<br>new file mode 100644<br>index 000000000..6607bdf64<br>--- /dev/null<br>+++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake<br>@@ -0,0 +1,8 @@<br>+SET(CMAKE_SYSTEM_NAME Windows)<br>+set(CMAKE_SYSTEM_PROCESSOR aarch64)<br>+SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc)<br>+SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++)<br>+SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres)<br>+SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib)<br>+SET(CMAKE_ASM_YASM_COMPILER yasm)<br>+SET(CROSS_COMPILE_ARM64 1)<br>diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt<br>index 13bc8ccfe..d1fe38559 100755<br>--- a/source/CMakeLists.txt<br>+++ b/source/CMakeLists.txt<br>@@ -303,10 +303,12 @@ if(GCC)<br>             endif()<br>         endif()<br> <br>+        set(ARM64_ARCH_ARGS "-O3")<br>         if(CPU_HAS_NEON_DOTPROD)<br>             # Neon DotProd is mandatory from Armv8.4.<br>             message(STATUS "Found Neon DotProd")<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_NEON_DOTPROD=1)<br>         endif()<br>         if(CPU_HAS_NEON_I8MM)<br>@@ -316,7 +318,8 @@ if(GCC)<br>             if(NOT CPU_HAS_NEON_DOTPROD)<br>                 message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)")<br>             endif()<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_NEON_I8MM=1)<br>         endif()<br>         if(CPU_HAS_SVE)<br>@@ -325,13 +328,15 @@ if(GCC)<br>             if(NOT CPU_HAS_NEON_I8MM)<br>                 message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)")<br>             endif()<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_SVE=1)<br>         endif()<br>         if(CPU_HAS_SVE2)<br>             message(STATUS "Found SVE2")<br>             # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod<br>-            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)<br>+            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_SVE2=1)<br>         endif()<br>         set(ARM_ARGS ${ARM_ARGS} -fPIC)<br>@@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>             add_custom_command(<br>                 OUTPUT ${ASM}.${SUFFIX}<br>                 COMMAND ${CMAKE_CXX_COMPILER}<br>-                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                 DEPENDS ${ASM_SRC})<br>         endforeach()<br>         if(CPU_HAS_SVE2)<br>@@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>@@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>@@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt<br>index dc4a74107..33025cada 100644<br>--- a/source/common/CMakeLists.txt<br>+++ b/source/common/CMakeLists.txt<br>@@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))<br>     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")<br>     foreach(SRC ${C_SRCS_NEON})<br>         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>     endforeach()<br> <br>     if(CPU_HAS_NEON_I8MM)<br>         foreach(SRC ${C_SRCS_NEON_I8MM})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_NEON_DOTPROD)<br>         foreach(SRC ${C_SRCS_NEON_DOTPROD})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)<br>         foreach(SRC ${C_SRCS_SVE})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)<br>         foreach(SRC ${C_SRCS_SVE2})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp<br>index 61cdaadfb..a2b0ac081 100644<br>--- a/source/common/cpu.cpp<br>+++ b/source/common/cpu.cpp<br>@@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512)<br> <br> #elif X265_ARCH_ARM64<br> <br>-uint32_t cpu_detect(bool benableavx512)<br>+#if defined(_MSC_VER) || defined(__APPLE__)<br>+uint32_t cpu_detect(bool /*benableavx512*/)<br> {<br>     int flags = 0;<br> <br>@@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512)<br>     return flags;<br> }<br> <br>+// TODO: Support ARM on Windows<br>+#elif defined(__MINGW64__)<br>+<br>+#include <windows.h><br>+#include <processthreadsapi.h><br>+<br>+bool isOryonCPU()<br>+{<br>+<br>+    char processorName[128];<br>+    DWORD bufferSize = 128;<br>+<br>+    LONG result = RegGetValue(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName, &bufferSize);<br>+    if (strstr(processorName, "Oryon") != NULL)<br>+    {<br>+        return true;<br>+    }<br>+    else<br>+    {<br>+        return false;<br>+    }<br>+}<br>+uint32_t cpu_detect(bool /*benableavx512*/)<br>+{<br>+<br>+    int flags = 0;<br>+<br>+    #if HAVE_NEON<br>+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON<br>+    #endif<br>+    #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)<br>+         flags |= IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ? X265_CPU_NEON_DOTPROD : 0;<br>+    #endif<br>+    #if HAVE_NEON_I8MM<br>+         flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0;<br>+    #endif<br>+    #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)<br>+         flags |= IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE : 0;<br>+    #endif<br>+    #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)<br>+         flags |= IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE2 : 0;<br>+    #endif<br>+<br>+    return flags;<br>+}<br>+<br>+#else // Linux+Aarch64<br>+<br>+#include <asm/hwcap.h><br>+#include <sys/auxv.h><br>+<br>+uint32_t cpu_detect(bool /*benableavx5128*/)<br>+{<br>+    unsigned long hwcaps = getauxval(AT_HWCAP);<br>+    unsigned long hwcaps2 = getauxval(AT_HWCAP2);<br>+<br>+    int flags = 0;<br>+<br>+    #if HAVE_NEON<br>+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON<br>+    #endif<br>+    #if HAVE_NEON_DOTPROD<br>+         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);<br>+    #endif<br>+    #if HAVE_NEON_I8MM<br>+         flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0);<br>+    #endif<br>+    #if HAVE_SVE<br>+         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);<br>+    #endif<br>+    #if HAVE_SVE2<br>+         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);<br>+    #endif<br>+<br>+    return flags;<br>+}<br>+#endif // end of Linux+AArch64<br>+<br> #elif X265_ARCH_POWER8<br> <br> uint32_t cpu_detect(bool benableavx512)<br>-- <br>2.45.2<br><br></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh <<a href="mailto:dash.sathyanarayanan@multicorewareinc.com" target="_blank">dash.sathyanarayanan@multicorewareinc.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div dir="ltr">Hi Hari, <div>Thanks for spotting this. Also added support for Windows on ARM. Please find below the updated patch:</div><div><br></div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Sep 26, 2024 at 11:43 AM Dash Santosh <<a href="mailto:dash.sathyanarayanan@multicorewareinc.com" target="_blank">dash.sathyanarayanan@multicorewareinc.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div dir="ltr">From e9614d170f93f3ad4f01e95abfed0a260f218bd5 Mon Sep 17 00:00:00 2001<br>From: Min Chen <<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>><br>Date: Sat, 14 Sep 2024 14:25:28 -0700<br>Subject: [PATCH] AArch64: Runtime CPU feature detection<br><br>---<br> source/CMakeLists.txt        | 20 ++++++++++++--------<br> source/common/CMakeLists.txt |  5 +++++<br> source/common/cpu.cpp        | 33 +++++++++++++++++++++++++++++++++<br> 3 files changed, 50 insertions(+), 8 deletions(-)<br><br>diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt<br>index 37b83f959..32a99206f 100755<br>--- a/source/CMakeLists.txt<br>+++ b/source/CMakeLists.txt<br>@@ -306,7 +306,8 @@ if(GCC)<br>         if(CPU_HAS_NEON_DOTPROD)<br>             # Neon DotProd is mandatory from Armv8.4.<br>             message(STATUS "Found Neon DotProd")<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_NEON_DOTPROD=1)<br>         endif()<br>         if(CPU_HAS_NEON_I8MM)<br>@@ -316,7 +317,8 @@ if(GCC)<br>             if(NOT CPU_HAS_NEON_DOTPROD)<br>                 message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)")<br>             endif()<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_NEON_I8MM=1)<br>         endif()<br>         if(CPU_HAS_SVE)<br>@@ -325,13 +327,15 @@ if(GCC)<br>             if(NOT CPU_HAS_NEON_I8MM)<br>                 message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)")<br>             endif()<br>-            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)<br>+            set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_SVE=1)<br>         endif()<br>         if(CPU_HAS_SVE2)<br>             message(STATUS "Found SVE2")<br>             # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod<br>-            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)<br>+            set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2)<br>+            set(ARM_ARGS -O3)<br>             add_definitions(-DHAVE_SVE2=1)<br>         endif()<br>         set(ARM_ARGS ${ARM_ARGS} -fPIC)<br>@@ -692,7 +696,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>             add_custom_command(<br>                 OUTPUT ${ASM}.${SUFFIX}<br>                 COMMAND ${CMAKE_CXX_COMPILER}<br>-                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                 DEPENDS ${ASM_SRC})<br>         endforeach()<br>         if(CPU_HAS_SVE2)<br>@@ -703,7 +707,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>@@ -715,7 +719,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>@@ -727,7 +731,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>                 add_custom_command(<br>                     OUTPUT ${ASM}.${SUFFIX}<br>                     COMMAND ${CMAKE_CXX_COMPILER}<br>-                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>+                    ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}<br>                     DEPENDS ${ASM_SRC})<br>             endforeach()<br>         endif()<br>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt<br>index dc4a74107..33025cada 100644<br>--- a/source/common/CMakeLists.txt<br>+++ b/source/common/CMakeLists.txt<br>@@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))<br>     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")<br>     foreach(SRC ${C_SRCS_NEON})<br>         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+        set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>     endforeach()<br> <br>     if(CPU_HAS_NEON_I8MM)<br>         foreach(SRC ${C_SRCS_NEON_I8MM})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_NEON_DOTPROD)<br>         foreach(SRC ${C_SRCS_NEON_DOTPROD})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)<br>         foreach(SRC ${C_SRCS_SVE})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>     if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)<br>         foreach(SRC ${C_SRCS_SVE2})<br>             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>+            set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} )<br>         endforeach()<br>     endif()<br> <br>diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp<br>index 61cdaadfb..24c60ff0e 100644<br>--- a/source/common/cpu.cpp<br>+++ b/source/common/cpu.cpp<br>@@ -391,6 +391,8 @@ uint32_t cpu_detect(bool benableavx512)<br> <br> #elif X265_ARCH_ARM64<br> <br>+// TODO: Support ARM on Windows<br>+#if _MSC_VER<br> uint32_t cpu_detect(bool benableavx512)<br> {<br>     int flags = 0;<br>@@ -413,6 +415,37 @@ uint32_t cpu_detect(bool benableavx512)<br> <br>     return flags;<br> }<br>+#else // Linux+Aarch64<br>+<br>+#include <asm/hwcap.h><br>+#include <sys/auxv.h><br>+<br>+uint32_t cpu_detect(bool benableavx512)<br>+{<br>+    unsigned long hwcaps = getauxval(AT_HWCAP);<br>+    unsigned long hwcaps2 = getauxval(AT_HWCAP2);<br>+<br>+    int flags = 0;<br>+<br>+    #if HAVE_NEON<br>+         flags |= X265_CPU_NEON;    // All of ARM64 has NEON<br>+    #endif<br>+    #if HAVE_NEON_DOTPROD<br>+         flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0);<br>+    #endif<br>+    #if HAVE_NEON_I8MM<br>+         flags |= (hwcaps2 & HWCAP2_SVEI8MM ? X265_CPU_NEON_I8MM : 0);<br>+    #endif<br>+    #if HAVE_SVE<br>+         flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0);<br>+    #endif<br>+    #if HAVE_SVE2<br>+         flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0);<br>+    #endif<br>+<br>+    return flags;<br>+}<br>+#endif // end of Linux+AArch64<br> <br> #elif X265_ARCH_POWER8<br> <br>-- <br>2.43.0.windows.1<br><br><div><br></div><span class="gmail_signature_prefix">-- </span><br><div dir="ltr" class="gmail_signature"><div dir="ltr"><table border="0" cellpadding="0" cellspacing="0" style="color:rgb(0,0,0);font-family:"Times New Roman";font-size:medium"><tbody><tr><td valign="top" align="center" style="padding-right:10px"><div><u><a rel="noopener noreferrer" href="https://multicorewareinc.com/" style="color:rgb(17,85,204)" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4xJVw2ouL-7kSIsC53GaQNdgsrl9bgcOifC08GwaQJPu1JnKnqfZTU30s7zxLJ1jTg-UY85QJ4KXpg5"></a><br></u></div><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td valign="top" align="center" style="padding-right:10px"><span style="padding:0px;margin:0px"> <a rel="noopener noreferrer" href="https://www.linkedin.com/company/multicoreware-inc/" style="color:rgb(17,85,204)" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4ypKpdFAdtD5ghJ8ZhhSvThTPpbqhR7s_V5Wtz3SVRR9MQSPtYqiB4r6Q8gFg0JBX8_pEAxYqx8IcDP"></a>  <a rel="noopener noreferrer" href="https://twitter.com/MulticoreWare" style="color:rgb(17,85,204)" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4ywduRorDK_UGiphRmWW7yAHbFQFYtBNfKTy3tMcr7MVSt1kfuKUkrOUHYuQU7m3aFao_fcyhp3tUxR"></a>  <a rel="noopener noreferrer" href="https://www.facebook.com/multicoreware" style="color:rgb(17,85,204)" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4wzBbm2U0HqcrA18mfNe5ZYnu_7R21iHbKJK9fpG2-E0XGI-2tYbO8d32UMcqxJWCQTCt8_6e83HoMT"></a>  <a rel="noopener noreferrer" href="https://www.youtube.com/channel/UCXZ1A1MzS5JwBqwBkNfsBBw?sub_confirmation=1" style="color:rgb(17,85,204)" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4zux15kq6I5m66gDXz7UhzTaY8c72DXs-Vr3A6tehpEWsj_U7hcRzDAgzHHNql5Rj8jR62YBJLuOFYu"></a>  <a rel="noopener noreferrer" href="https://www.instagram.com/multicoreware.inc/" style="color:rgb(17,85,204)" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4xCleID9p5NJ9j5vsPEQFpunbcFVzF_hieeJvA1HjLt5HLx6z9CT___VxTzWCrPzMXiojTcwionPe8F"></a></span><br></td></tr></tbody></table></td><td bgcolor="#999999" rowspan="2" width="1"><br></td><td style="vertical-align:top;padding-left:10px"><p style="margin:0px;line-height:20px"><span style="color:rgb(32,85,254)"><b><span style="font-family:Arial,Helvetica,sans-serif"><span style="font-size:16px"><span style="color:rgb(68,68,68)"><span style="font-family:Arial,sans-serif">Dash Santosh</span></span></span></span></b></span><br></p><p style="margin:0px 0px 5px;line-height:20px"><span style="color:rgb(36,36,36)"><span style="font-family:Arial,Helvetica,sans-serif"><span style="font-size:16px"><b style="color:rgb(102,102,102);font-family:Arial,sans-serif;font-size:x-small">Research Engineer, Video Engineering</b></span></span></span><br></p><p style="margin:0px;line-height:16px"><span style="color:rgb(36,36,36)"><span style="font-family:Arial,Helvetica,sans-serif"><span style="font-size:11px"><span style="color:rgb(88,88,88)"><span style="font-family:Arial,sans-serif"><span style="font-size:x-small">Mobile: +91 78679 43737</span></span></span></span></span></span><br></p><p style="margin:0px;line-height:16px"><font color="#585858" face="Arial, sans-serif"><span style="font-size:10px">IndiQube Echo Point, Avinashi Road</span></font></p><p style="margin:0px;line-height:16px"><span style="color:rgb(36,36,36)"><span style="font-family:Arial,Helvetica,sans-serif"><span style="font-size:11px"><span style="color:rgb(88,88,88)"><span style="font-family:Arial,sans-serif"><span style="font-size:x-small">Coimbatore - 641 014</span></span></span></span></span></span><br></p><p style="margin:0px;line-height:16px"><span style="color:rgb(36,36,36)"><span style="font-family:Arial,Helvetica,sans-serif"><span style="font-size:11px"><span style="color:rgb(88,88,88)"><span style="font-family:Arial,sans-serif"><span style="font-size:x-small"><br></span></span></span></span></span></span></p></td></tr><tr><td valign="top" align="center" style="padding-right:10px"><br></td><td valign="top" style="padding:0px 0px 0px 10px"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4yP_T0tZnWYAbKHbRi0Pt8l8koPJBmOijhmBRMYVotNrmftxEnTmN93Ac-sC2XTKbHwXR-SV97SisS9"><br></td></tr></tbody></table></div></div></div>
</blockquote></div>
</blockquote></div>
</blockquote></div>