[x265] [PATCH 5/8] AArch64: Add Armv8.4 Neon DotProd feature detection
Hari Limaye
hari.limaye at arm.com
Thu May 23 17:19:27 UTC 2024
Add compile-time feature detection for AArch64 Neon DotProd
instructions, which are mandatory from Armv8.4, and clean up the
Arm/AArch64 CPU flag definitions.
We impose the constraint that SVE implies Neon DotProd, as this is true
for all systems except the Fujitsu A64FX.
---
build/README.txt | 8 ++++++++
source/CMakeLists.txt | 24 +++++++++++++++++++++---
source/cmake/FindNEON_DOTPROD.cmake | 21 +++++++++++++++++++++
source/common/cpu.cpp | 19 ++++++++++++-------
source/test/testbench.cpp | 3 ++-
source/x265.h | 11 ++++++-----
6 files changed, 70 insertions(+), 16 deletions(-)
create mode 100644 source/cmake/FindNEON_DOTPROD.cmake
diff --git a/build/README.txt b/build/README.txt
index af4abd21c..8e229c3bd 100644
--- a/build/README.txt
+++ b/build/README.txt
@@ -113,4 +113,12 @@ For example, when running CMake to configure the project:
1. cmake -DCROSS_COMPILE_SVE=ON <other configuration options...>
2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
+Note: when the CROSS_COMPILE_SVE option is set to ON, the build configuration will
+compile for Neon DotProd, as we impose the constraint that SVE implies Neon DotProd.
+
+If the target platform supports Armv8.4 Neon DotProd instructions, the
+CROSS_COMPILE_NEON_DOTPROD CMake option should be set to ON:
+
+* cmake -DCROSS_COMPILE_NEON_DOTPROD=ON <other configuration options...>
+
Then, the normal build process can be followed.
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 0a877f209..c1179d276 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -88,6 +88,7 @@ elseif(ARM64MATCH GREATER "-1")
# Options for cross compiling AArch64 optional extensions
option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF)
option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF)
+ option(CROSS_COMPILE_NEON_DOTPROD "Cross Compile for Neon DotProd Target" OFF)
else()
message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -265,25 +266,42 @@ if(GCC)
add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON=1)
# Handle cross-compilation options.
+ if(CROSS_COMPILE_NEON_DOTPROD)
+ set(CPU_HAS_NEON_DOTPROD 1)
+ endif()
if(CROSS_COMPILE_SVE)
set(CPU_HAS_SVE 1)
+ # We impose the constraint that SVE implies Neon DotProd.
+ set(CPU_HAS_NEON_DOTPROD 1)
endif()
if(CROSS_COMPILE_SVE2)
set(CPU_HAS_SVE2 1)
- # SVE2 implies SVE.
+ # SVE2 implies SVE and Neon DotProd.
set(CPU_HAS_SVE 1)
+ set(CPU_HAS_NEON_DOTPROD 1)
endif()
+ find_package(NEON_DOTPROD)
find_package(SVE)
find_package(SVE2)
+ if(CPU_HAS_NEON_DOTPROD)
+ # Neon DotProd is mandatory from Armv8.4.
+ message(STATUS "Found Neon DotProd")
+ set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
+ add_definitions(-DHAVE_NEON_DOTPROD=1)
+ endif()
if(CPU_HAS_SVE)
message(STATUS "Found SVE")
- set(ARM_ARGS -O3 -march=armv8.2-a+sve)
+ # We impose the constraint that SVE implies Neon DotProd.
+ if(NOT CPU_HAS_NEON_DOTPROD)
+ message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon DotProd)")
+ endif()
+ set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+sve)
add_definitions(-DHAVE_SVE=1)
endif()
if(CPU_HAS_SVE2)
message(STATUS "Found SVE2")
- # SVE2 is only available from Armv9.0-A.
+ # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
set(ARM_ARGS -O3 -march=armv9-a+sve2)
add_definitions(-DHAVE_SVE2=1)
endif()
diff --git a/source/cmake/FindNEON_DOTPROD.cmake b/source/cmake/FindNEON_DOTPROD.cmake
new file mode 100644
index 000000000..49b9ff605
--- /dev/null
+++ b/source/cmake/FindNEON_DOTPROD.cmake
@@ -0,0 +1,21 @@
+include(FindPackageHandleStandardArgs)
+
+# Check if Armv8.4 Neon DotProd is supported by the Arm CPU
+if(APPLE)
+ execute_process(COMMAND sysctl -a
+ COMMAND grep "hw.optional.arm.FEAT_DotProd: 1"
+ OUTPUT_VARIABLE has_dot_product
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+ execute_process(COMMAND cat /proc/cpuinfo
+ COMMAND grep Features
+ COMMAND grep asimddp
+ OUTPUT_VARIABLE has_dot_product
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(has_dot_product)
+ set(CPU_HAS_NEON_DOTPROD 1)
+endif()
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index 6cf0a5312..84a241876 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -115,6 +115,9 @@ const cpu_name_t cpu_names[] =
#if defined(HAVE_SVE2)
{ "SVE2", X265_CPU_SVE2 },
#endif
+#if defined(HAVE_NEON_DOTPROD)
+ { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
+#endif
#elif X265_ARCH_POWER8
{ "Altivec", X265_CPU_ALTIVEC },
@@ -389,17 +392,19 @@ uint32_t cpu_detect(bool benableavx512)
{
int flags = 0;
- #if defined(HAVE_SVE2)
+ #if HAVE_SVE2
flags |= X265_CPU_SVE2;
+ #endif
+ #if HAVE_SVE
flags |= X265_CPU_SVE;
- flags |= X265_CPU_NEON;
- #elif defined(HAVE_SVE)
- flags |= X265_CPU_SVE;
- flags |= X265_CPU_NEON;
- #elif HAVE_NEON
+ #endif
+ #if HAVE_NEON
flags |= X265_CPU_NEON;
#endif
-
+ #if HAVE_NEON_DOTPROD
+ flags |= X265_CPU_NEON_DOTPROD;
+ #endif
+
return flags;
}
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index 45da893a7..ec4f5d395 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -159,7 +159,7 @@ int main(int argc, char *argv[])
struct test_arch_t
{
- char name[12];
+ char name[13];
int flag;
} test_arch[] =
{
@@ -176,6 +176,7 @@ int main(int argc, char *argv[])
{ "NEON", X265_CPU_NEON },
{ "SVE2", X265_CPU_SVE2 },
{ "SVE", X265_CPU_SVE },
+ { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
{ "", 0 },
};
diff --git a/source/x265.h b/source/x265.h
index 4452526ae..c4cce319b 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -536,11 +536,12 @@ typedef enum
#define X265_CPU_SLOW_PALIGNR (1 << 25) /* such as on the AMD Bobcat */
/* ARM */
-#define X265_CPU_ARMV6 0x0000001
-#define X265_CPU_NEON 0x0000002 /* ARM NEON */
-#define X265_CPU_SVE2 0x0000008 /* ARM SVE2 */
-#define X265_CPU_SVE 0x0000010 /* ARM SVE2 */
-#define X265_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X265_CPU_ARMV6 (1 << 0)
+#define X265_CPU_NEON (1 << 1) /* ARM NEON */
+#define X265_CPU_FAST_NEON_MRC (1 << 2) /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X265_CPU_SVE2 (1 << 3) /* AArch64 SVE2 */
+#define X265_CPU_SVE (1 << 4) /* AArch64 SVE */
+#define X265_CPU_NEON_DOTPROD (1 << 5) /* AArch64 Neon DotProd */
/* IBM Power8 */
#define X265_CPU_ALTIVEC 0x0000001
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0005-AArch64-Add-Armv8.4-Neon-DotProd-feature-detection.patch
Type: text/x-patch
Size: 8060 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240523/3cf94265/attachment-0001.bin>
More information about the x265-devel
mailing list