[x265] [PATCH v2 5/8] AArch64: Add Armv8.4 Neon DotProd feature detection

Hari Limaye hari.limaye at arm.com
Tue Jul 30 15:45:42 UTC 2024


Add compile-time feature detection for AArch64 Neon DotProd
instructions, which are mandatory from Armv8.4, and clean up the
Arm/AArch64 CPU flag definitions.

We impose the constraint that SVE implies Neon DotProd, as this is true
for all systems except the Fujitsu A64FX.
---
 build/README.txt                    |  8 ++++++++
 source/CMakeLists.txt               | 24 +++++++++++++++++++++---
 source/cmake/FindNEON_DOTPROD.cmake | 21 +++++++++++++++++++++
 source/common/cpu.cpp               | 19 ++++++++++++-------
 source/test/testbench.cpp           |  3 ++-
 source/x265.h                       | 11 ++++++-----
 6 files changed, 70 insertions(+), 16 deletions(-)
 create mode 100644 source/cmake/FindNEON_DOTPROD.cmake

diff --git a/build/README.txt b/build/README.txt
index af4abd21c..8e229c3bd 100644
--- a/build/README.txt
+++ b/build/README.txt
@@ -113,4 +113,12 @@ For example, when running CMake to configure the project:
 1. cmake -DCROSS_COMPILE_SVE=ON  <other configuration options...>
 2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
 
+Note: when the CROSS_COMPILE_SVE option is set to ON, the build configuration will
+compile for Neon DotProd, as we impose the constraint that SVE implies Neon DotProd.
+
+If the target platform supports Armv8.4 Neon DotProd instructions, the
+CROSS_COMPILE_NEON_DOTPROD CMake option should be set to ON:
+
+* cmake -DCROSS_COMPILE_NEON_DOTPROD=ON  <other configuration options...>
+
 Then, the normal build process can be followed.
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 0a877f209..c1179d276 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -88,6 +88,7 @@ elseif(ARM64MATCH GREATER "-1")
     # Options for cross compiling AArch64 optional extensions
     option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF)
     option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF)
+    option(CROSS_COMPILE_NEON_DOTPROD "Cross Compile for Neon DotProd Target" OFF)
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -265,25 +266,42 @@ if(GCC)
         add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON=1)
 
         # Handle cross-compilation options.
+        if(CROSS_COMPILE_NEON_DOTPROD)
+            set(CPU_HAS_NEON_DOTPROD 1)
+        endif()
         if(CROSS_COMPILE_SVE)
             set(CPU_HAS_SVE 1)
+            # We impose the constraint that SVE implies Neon DotProd.
+            set(CPU_HAS_NEON_DOTPROD 1)
         endif()
         if(CROSS_COMPILE_SVE2)
             set(CPU_HAS_SVE2 1)
-            # SVE2 implies SVE.
+            # SVE2 implies SVE and Neon DotProd.
             set(CPU_HAS_SVE 1)
+            set(CPU_HAS_NEON_DOTPROD 1)
         endif()
 
+        find_package(NEON_DOTPROD)
         find_package(SVE)
         find_package(SVE2)
+        if(CPU_HAS_NEON_DOTPROD)
+            # Neon DotProd is mandatory from Armv8.4.
+            message(STATUS "Found Neon DotProd")
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
+            add_definitions(-DHAVE_NEON_DOTPROD=1)
+        endif()
         if(CPU_HAS_SVE)
             message(STATUS "Found SVE")
-            set(ARM_ARGS -O3 -march=armv8.2-a+sve)
+            # We impose the constraint that SVE implies Neon DotProd.
+            if(NOT CPU_HAS_NEON_DOTPROD)
+                message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon DotProd)")
+            endif()
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+sve)
             add_definitions(-DHAVE_SVE=1)
         endif()
         if(CPU_HAS_SVE2)
             message(STATUS "Found SVE2")
-            # SVE2 is only available from Armv9.0-A.
+            # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
             set(ARM_ARGS -O3 -march=armv9-a+sve2)
             add_definitions(-DHAVE_SVE2=1)
         endif()
diff --git a/source/cmake/FindNEON_DOTPROD.cmake b/source/cmake/FindNEON_DOTPROD.cmake
new file mode 100644
index 000000000..49b9ff605
--- /dev/null
+++ b/source/cmake/FindNEON_DOTPROD.cmake
@@ -0,0 +1,21 @@
+include(FindPackageHandleStandardArgs)
+
+# Probe the build host for Neon DotProd (mandatory from Armv8.4); sets CPU_HAS_NEON_DOTPROD if found.
+if(APPLE)  # query sysctl for the FEAT_DotProd flag
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.arm.FEAT_DotProd: 1"
+                    OUTPUT_VARIABLE has_dot_product
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()  # otherwise look for "asimddp" in the Features line(s) of /proc/cpuinfo
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep asimddp
+                    OUTPUT_VARIABLE has_dot_product
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(has_dot_product)  # holds output only when the final grep in the pipeline matched
+    set(CPU_HAS_NEON_DOTPROD 1)
+endif()
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index 6cf0a5312..84a241876 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -115,6 +115,9 @@ const cpu_name_t cpu_names[] =
 #if defined(HAVE_SVE2)
     { "SVE2",            X265_CPU_SVE2 },
 #endif
+#if defined(HAVE_NEON_DOTPROD)
+    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
+#endif
 #elif X265_ARCH_POWER8
     { "Altivec",         X265_CPU_ALTIVEC },
 
@@ -389,17 +392,19 @@ uint32_t cpu_detect(bool benableavx512)
 {
     int flags = 0;
 
-    #if defined(HAVE_SVE2)
+    #if HAVE_SVE2
          flags |= X265_CPU_SVE2;
+    #endif
+    #if HAVE_SVE
          flags |= X265_CPU_SVE;
-         flags |= X265_CPU_NEON;
-    #elif defined(HAVE_SVE)
-         flags |= X265_CPU_SVE;
-         flags |= X265_CPU_NEON;
-    #elif HAVE_NEON
+    #endif
+    #if HAVE_NEON
          flags |= X265_CPU_NEON;
     #endif
-        
+    #if HAVE_NEON_DOTPROD
+         flags |= X265_CPU_NEON_DOTPROD;
+    #endif
+
     return flags;
 }
 
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index 45da893a7..ec4f5d395 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -159,7 +159,7 @@ int main(int argc, char *argv[])
 
     struct test_arch_t
     {
-        char name[12];
+        char name[13];
         int flag;
     } test_arch[] =
     {
@@ -176,6 +176,7 @@ int main(int argc, char *argv[])
         { "NEON", X265_CPU_NEON },
         { "SVE2", X265_CPU_SVE2 },
         { "SVE", X265_CPU_SVE },
+        { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
         { "", 0 },
     };
diff --git a/source/x265.h b/source/x265.h
index c48b8648a..eb7e01c1c 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -536,11 +536,12 @@ typedef enum
 #define X265_CPU_SLOW_PALIGNR    (1 << 25)  /* such as on the AMD Bobcat */
 
 /* ARM */
-#define X265_CPU_ARMV6           0x0000001
-#define X265_CPU_NEON            0x0000002  /* ARM NEON */
-#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
-#define X265_CPU_SVE             0x0000010  /* ARM SVE2 */
-#define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X265_CPU_ARMV6           (1 << 0)
+#define X265_CPU_NEON            (1 << 1)   /* ARM NEON */
+#define X265_CPU_FAST_NEON_MRC   (1 << 2)   /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X265_CPU_SVE2            (1 << 3)   /* AArch64 SVE2 */
+#define X265_CPU_SVE             (1 << 4)   /* AArch64 SVE */
+#define X265_CPU_NEON_DOTPROD    (1 << 5)   /* AArch64 Neon DotProd */
 
 /* IBM Power8 */
 #define X265_CPU_ALTIVEC         0x0000001
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0005-AArch64-Add-Armv8.4-Neon-DotProd-feature-detectio.patch
Type: text/x-patch
Size: 8063 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240730/0ca89e76/attachment-0001.bin>


More information about the x265-devel mailing list