[x264-devel] [Git][videolan/x264][master] 7 commits: configure: Use as_check for checking for aarch64 features
Martin Storsjö (@mstorsjo)
gitlab at videolan.org
Wed Mar 12 12:17:52 UTC 2025
Martin Storsjö pushed to branch master at VideoLAN / x264
Commits:
a0191bd8 by Martin Storsjö at 2025-03-12T13:23:40+02:00
configure: Use as_check for checking for aarch64 features
This is more correct than using cc_check; we're going to assemble
standalone external assembly - thus check for whether we can
build it in that form, not using inline assembly.
This allows sharing checks with the MSVC codepath (where inline
assembly isn't supported, and where assembly is built using
a tool different from the regular compiler).
- - - - -
72ce1cde by Martin Storsjö at 2025-03-12T13:23:40+02:00
configure: Use as_check for the main check for whether NEON is supported
This requires adding the "-c" flag to ASFLAGS before doing the
check.
This also validates that the gas-preprocessor is functional
for MSVC configurations, by testing whether the "cmeq" instruction
can be assembled at this point.
- - - - -
f87ca183 by Martin Storsjö at 2025-03-12T13:23:40+02:00
configure: Check for .arch and .arch_extension for enabling aarch64 extensions
This hasn't been needed for SVE/SVE2, as all toolchains have
supported just enabling it via ".arch armv8.2-a+sve". For other
arch extensions, like dotprod/i8mm, there are more combinations of
toolchain bugs in slightly older toolchains; try to detect what is
supported.
Additionally, when involving more than one architecture extension,
we may want to enable/disable individual extensions one at a time,
without needing to specify the full list in one single .arch
statement.
This is a preparatory commit for adding support for the dotprod/i8mm
extensions.
We intentionally don't add AS_ARCH_LEVEL to the CONFIG_HAVE list,
as this define isn't prefixed with "HAVE_", and we don't use the
define except in the case where we actually do set it. (It's not
a regular 0/1 define like the others.)
- - - - -
87044b21 by Martin Storsjö at 2025-03-12T13:23:40+02:00
aarch64: Use configure detected directives for enabling SVE/SVE2
By using .arch_extension (if supported) to enable the relevant
extensions, we can also disable them afterwards, so we can e.g.
cleanly enable one extension only for one subsection of a file.
This also makes it easier to enable various combinations of
supported architecture extensions.
- - - - -
fc4012fb by Martin Storsjö at 2025-03-12T13:23:40+02:00
configure: Check for the dotprod and i8mm aarch64 extensions
- - - - -
0e48d072 by Martin Storsjö at 2025-03-12T13:23:40+02:00
aarch64: Add flags for runtime detection of dotprod and i8mm
Also add code for detecting them on Linux.
- - - - -
570f6c70 by Martin Storsjö at 2025-03-12T13:23:40+02:00
aarch64: Add runtime detection of extensions on Windows and macOS
- - - - -
11 changed files:
- common/aarch64/asm.S
- common/aarch64/dct-a-sve.S
- common/aarch64/dct-a-sve2.S
- common/aarch64/deblock-a-sve.S
- common/aarch64/mc-a-sve.S
- common/aarch64/pixel-a-sve.S
- common/cpu.c
- configure
- tools/checkasm-aarch64.S
- tools/checkasm.c
- x264.h
Changes:
=====================================
common/aarch64/asm.S
=====================================
@@ -66,6 +66,46 @@
# define FUNC #
#endif
+ .arch AS_ARCH_LEVEL
+#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
+#define ENABLE_DOTPROD .arch_extension dotprod
+#define DISABLE_DOTPROD .arch_extension nodotprod
+#else
+#define ENABLE_DOTPROD
+#define DISABLE_DOTPROD
+#endif
+#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
+#define ENABLE_I8MM .arch_extension i8mm
+#define DISABLE_I8MM .arch_extension noi8mm
+#else
+#define ENABLE_I8MM
+#define DISABLE_I8MM
+#endif
+#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
+#define ENABLE_SVE .arch_extension sve
+#define DISABLE_SVE .arch_extension nosve
+#else
+#define ENABLE_SVE
+#define DISABLE_SVE
+#endif
+#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
+#define ENABLE_SVE2 .arch_extension sve2
+#define DISABLE_SVE2 .arch_extension nosve2
+#else
+#define ENABLE_SVE2
+#define DISABLE_SVE2
+#endif
+
+/* If we do support the .arch_extension directives, disable support for all
+ * the extensions that we may use, in case they were implicitly enabled by
+ * the .arch level. This makes it clear if we try to assemble an instruction
+ * from an unintended extension set; we only allow assembling such instructions
+ * within regions where we explicitly enable those extensions. */
+DISABLE_DOTPROD
+DISABLE_I8MM
+DISABLE_SVE
+DISABLE_SVE2
+
.macro function name, export=0, align=2
.macro endfunc
.if \export
=====================================
common/aarch64/dct-a-sve.S
=====================================
@@ -26,7 +26,7 @@
#include "asm.S"
#include "dct-a-common.S"
-.arch armv8-a+sve
+ENABLE_SVE
function sub4x4_dct_sve, export=1
mov x3, #FENC_STRIDE
=====================================
common/aarch64/dct-a-sve2.S
=====================================
@@ -26,7 +26,8 @@
#include "asm.S"
#include "dct-a-common.S"
-.arch armv8-a+sve+sve2
+ENABLE_SVE
+ENABLE_SVE2
function add4x4_idct_sve2, export=1
mov x2, #FDEC_STRIDE
=====================================
common/aarch64/deblock-a-sve.S
=====================================
@@ -26,7 +26,7 @@
#include "asm.S"
#include "deblock-a-common.S"
-.arch armv8-a+sve
+ENABLE_SVE
.macro h264_loop_filter_chroma_sve
ptrue p0.b, vl16
=====================================
common/aarch64/mc-a-sve.S
=====================================
@@ -26,7 +26,7 @@
#include "asm.S"
#include "mc-a-common.S"
-.arch armv8-a+sve
+ENABLE_SVE
#if BIT_DEPTH == 8
=====================================
common/aarch64/pixel-a-sve.S
=====================================
@@ -26,7 +26,7 @@
#include "asm.S"
#include "pixel-a-common.S"
-.arch armv8-a+sve
+ENABLE_SVE
#if BIT_DEPTH == 8
=====================================
common/cpu.c
=====================================
@@ -95,6 +95,8 @@ const x264_cpu_name_t x264_cpu_names[] =
#elif ARCH_AARCH64
{"ARMv8", X264_CPU_ARMV8},
{"NEON", X264_CPU_NEON},
+ {"DotProd", X264_CPU_DOTPROD},
+ {"I8MM", X264_CPU_I8MM},
{"SVE", X264_CPU_SVE},
{"SVE2", X264_CPU_SVE2},
#elif ARCH_MIPS
@@ -459,8 +461,10 @@ uint32_t x264_cpu_detect( void )
#if defined(__linux__) || HAVE_ELF_AUX_INFO
-#define HWCAP_AARCH64_SVE (1U << 22)
-#define HWCAP2_AARCH64_SVE2 (1U << 1)
+#define HWCAP_AARCH64_ASIMDDP (1U << 20)
+#define HWCAP_AARCH64_SVE (1U << 22)
+#define HWCAP2_AARCH64_SVE2 (1U << 1)
+#define HWCAP2_AARCH64_I8MM (1U << 13)
static uint32_t detect_flags( void )
{
@@ -469,6 +473,10 @@ static uint32_t detect_flags( void )
unsigned long hwcap = x264_getauxval( AT_HWCAP );
unsigned long hwcap2 = x264_getauxval( AT_HWCAP2 );
+ if ( hwcap & HWCAP_AARCH64_ASIMDDP )
+ flags |= X264_CPU_DOTPROD;
+ if ( hwcap2 & HWCAP2_AARCH64_I8MM )
+ flags |= X264_CPU_I8MM;
if ( hwcap & HWCAP_AARCH64_SVE )
flags |= X264_CPU_SVE;
if ( hwcap2 & HWCAP2_AARCH64_SVE2 )
@@ -476,6 +484,60 @@ static uint32_t detect_flags( void )
return flags;
}
+
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+
+static int have_feature( const char *feature )
+{
+ int supported = 0;
+ size_t size = sizeof(supported);
+ if ( sysctlbyname( feature, &supported, &size, NULL, 0 ) )
+ return 0;
+ return supported;
+}
+
+static uint32_t detect_flags( void )
+{
+ uint32_t flags = 0;
+
+ if ( have_feature( "hw.optional.arm.FEAT_DotProd" ) )
+ flags |= X264_CPU_DOTPROD;
+ if ( have_feature( "hw.optional.arm.FEAT_I8MM" ) )
+ flags |= X264_CPU_I8MM;
+ /* No SVE and SVE2 feature detection available on Apple platforms. */
+ return flags;
+}
+
+#elif defined(_WIN32)
+#include <windows.h>
+
+static uint32_t detect_flags( void )
+{
+ uint32_t flags = 0;
+
+#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+ if ( IsProcessorFeaturePresent( PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE ) )
+ flags |= X264_CPU_DOTPROD;
+#endif
+#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
+ if ( IsProcessorFeaturePresent( PF_ARM_SVE_INSTRUCTIONS_AVAILABLE ) )
+ flags |= X264_CPU_SVE;
+#endif
+#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
+ if ( IsProcessorFeaturePresent( PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE ) )
+ flags |= X264_CPU_SVE2;
+#endif
+#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
+ /* There's no PF_* flag that indicates whether plain I8MM is available
+ * or not. But if SVE_I8MM is available, that also implies that
+ * regular I8MM is available. */
+ if ( IsProcessorFeaturePresent( PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE ) )
+ flags |= X264_CPU_I8MM;
+#endif
+ return flags;
+}
+
#endif
uint32_t x264_cpu_detect( void )
@@ -487,6 +549,12 @@ uint32_t x264_cpu_detect( void )
// If these features are enabled unconditionally in the compiler, we can
// assume that they are available.
+#ifdef __ARM_FEATURE_DOTPROD
+ flags |= X264_CPU_DOTPROD;
+#endif
+#ifdef __ARM_FEATURE_MATMUL_INT8
+ flags |= X264_CPU_I8MM;
+#endif
#ifdef __ARM_FEATURE_SVE
flags |= X264_CPU_SVE;
#endif
@@ -495,7 +563,8 @@ uint32_t x264_cpu_detect( void )
#endif
// Where possible, try to do runtime detection as well.
-#if defined(__linux__) || HAVE_ELF_AUX_INFO
+#if defined(__linux__) || HAVE_ELF_AUX_INFO || \
+ defined(__APPLE__) || defined(_WIN32)
flags |= detect_flags();
#endif
=====================================
configure
=====================================
@@ -237,6 +237,18 @@ as_check() {
return $res
}
+as_archext_check() {
+ feature="$1"
+ instr="$2"
+ feature_upper="$(echo $feature | tr a-z A-Z)"
+ header=".arch $as_arch_level ${NL}"
+ if as_check "$header .arch_extension $feature" ; then
+ define HAVE_AS_ARCHEXT_${feature_upper}_DIRECTIVE
+ header="$header .arch_extension $feature ${NL}"
+ fi
+ as_check "$header $instr" && define HAVE_${feature_upper}
+}
+
rc_check() {
log_check "whether $RC works"
echo "$1" > conftest.rc
@@ -411,8 +423,10 @@ NL="
# list of all preprocessor HAVE values we can define
CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \
- MSA LSX MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10 SVE SVE2 ELF_AUX_INFO GETAUXVAL \
- SYSCONF SYNC_FETCH_AND_ADD"
+ MSA LSX MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10 ELF_AUX_INFO GETAUXVAL \
+ SYSCONF SYNC_FETCH_AND_ADD \
+ DOTPROD I8MM SVE SVE2 \
+ AS_ARCHEXT_DOTPROD_DIRECTIVE AS_ARCHEXT_I8MM_DIRECTIVE AS_ARCHEXT_SVE_DIRECTIVE AS_ARCHEXT_SVE2_DIRECTIVE"
# parse options
@@ -1005,22 +1019,42 @@ if [ $asm = auto -a $ARCH = ARM ] ; then
fi
if [ $asm = auto -a $ARCH = AARCH64 ] ; then
- if [ $compiler = CL ] && cpp_check '' '' 'defined(_M_ARM64)' ; then
- define HAVE_AARCH64
- define HAVE_NEON
- as_check "ptrue p0.b, vl16" && define HAVE_SVE
- as_check "smlalb z10.s, z2.h, z1.h" && define HAVE_SVE2
- elif cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then
+ if [ $compiler != CL ] ; then
+ ASFLAGS="$ASFLAGS -c"
+ fi
+ if as_check "cmeq v0.8h, v0.8h, #0" ; then
define HAVE_AARCH64
define HAVE_NEON
- cc_check '' '' '__asm__(".arch armv8.2-a+sve \n ptrue p0.b, vl16");' && define HAVE_SVE
- cc_check '' '' '__asm__(".arch armv8.2-a+sve2 \n smlalb z10.s, z2.h, z1.h");' && define HAVE_SVE2
- ASFLAGS="$ASFLAGS -c"
else
echo "no NEON support, try adding -mfpu=neon to CFLAGS"
echo "If you really want to run on such a CPU, configure with --disable-asm."
exit 1
fi
+ # Check for higher .arch levels. We only need armv8.2-a in order to
+ # enable the extensions we want below - we primarily want to control
+ # them via .arch_extension. However:
+ #
+ # Clang before version 17 (Xcode versions before 16) didn't support
+ # controlling the dotprod/i8mm extensions via .arch_extension; thus
+ # try to enable them via the .arch level as well.
+ as_arch_level="armv8-a"
+ for level in armv8.2-a armv8.4-a armv8.6-a; do
+ as_check ".arch ${level}" && as_arch_level="$level"
+ done
+ # Clang before version 17 (Xcode versions before 16) also had a bug
+ # (https://github.com/llvm/llvm-project/issues/32220) causing a plain
+ # ".arch <level>" to not have any effect unless it had an extra
+ # "+<feature>" included - but it was activated on the next
+ # ".arch_extension" directive. Check if we can include "+crc" as dummy
+ # feature to make the .arch directive behave as expected and take
+ # effect right away.
+ as_check ".arch ${as_arch_level}+crc" && as_arch_level="${as_arch_level}+crc"
+ define AS_ARCH_LEVEL "$as_arch_level"
+
+ as_archext_check dotprod "udot v0.4s, v0.16b, v0.16b"
+ as_archext_check i8mm "usdot v0.4s, v0.16b, v0.16b"
+ as_archext_check sve "ptrue p0.b, vl16"
+ as_archext_check sve2 "smlalb z10.s, z2.h, z1.h"
fi
if [ $asm = auto -a \( $ARCH = ARM -o $ARCH = AARCH64 \) ] ; then
=====================================
tools/checkasm-aarch64.S
=====================================
@@ -166,11 +166,13 @@ function checkasm_call, export=1
endfunc
#if HAVE_SVE
-.arch armv8-a+sve
+ENABLE_SVE
function checkasm_sve_length, export=1
cntb x0
lsl x0, x0, #3
ret
endfunc
+
+DISABLE_SVE
#endif
=====================================
tools/checkasm.c
=====================================
@@ -216,6 +216,8 @@ static void print_bench(void)
#elif ARCH_AARCH64
b->cpu&X264_CPU_SVE2 ? "sve2" :
b->cpu&X264_CPU_SVE ? "sve" :
+ b->cpu&X264_CPU_I8MM ? "i8mm" :
+ b->cpu&X264_CPU_DOTPROD ? "dotprod" :
b->cpu&X264_CPU_NEON ? "neon" :
b->cpu&X264_CPU_ARMV8 ? "armv8" :
#elif ARCH_MIPS
@@ -2998,6 +3000,10 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" );
if( cpu_detect & X264_CPU_NEON )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+ if( cpu_detect & X264_CPU_DOTPROD )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_DOTPROD, "DOTPROD" );
+ if( cpu_detect & X264_CPU_I8MM )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_I8MM, "I8MM" );
#if HAVE_SVE
if( cpu_detect & X264_CPU_SVE ) {
snprintf( buf, sizeof( buf ), "SVE (%d bits)", x264_checkasm_sve_length() );
=====================================
x264.h
=====================================
@@ -45,7 +45,7 @@ extern "C" {
#include "x264_config.h"
-#define X264_BUILD 164
+#define X264_BUILD 165
#ifdef _WIN32
# define X264_DLL_IMPORT __declspec(dllimport)
@@ -179,6 +179,8 @@ typedef struct x264_nal_t
#define X264_CPU_ARMV8 0x0000008U
#define X264_CPU_SVE 0x0000010U /* AArch64 SVE */
#define X264_CPU_SVE2 0x0000020U /* AArch64 SVE2 */
+#define X264_CPU_DOTPROD 0x0000040U /* AArch64 DotProd */
+#define X264_CPU_I8MM 0x0000080U /* AArch64 I8MM */
/* MIPS */
#define X264_CPU_MSA 0x0000001U /* MIPS MSA */
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/27d8370847d79665de06bbf8f043ce9a3a4a3da1...570f6c70808287fc78e3f8f5372a095ec6ef7878
--
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/27d8370847d79665de06bbf8f043ce9a3a4a3da1...570f6c70808287fc78e3f8f5372a095ec6ef7878
You're receiving this email because of your account on code.videolan.org.
VideoLAN code repository instance
More information about the x264-devel
mailing list