[x264-devel] commit: GSOC merge part 1: Framework for ARM assembly optimizations ( Steven Walters )
git version control
git at videolan.org
Thu Aug 20 22:10:03 CEST 2009
x264 | branch: master | Steven Walters <kemuri9 at gmail.com> | Wed Aug 19 17:03:02 2009 -0700| [0a79eb354eded79ea2de89127c5fb645e919408f] | committer: Jason Garrett-Glaser
GSOC merge part 1: Framework for ARM assembly optimizations
At configure time, x264 detects which ARM core it is being built for and only builds the NEON asm if the target is ARMv6 or above; NEON support is then detected and enabled at runtime.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0a79eb354eded79ea2de89127c5fb645e919408f
---
Makefile | 12 ++++++++
common/cpu.c | 79 +++++++++++++++++++++++++++++++++++++++++++-----------
common/osdep.h | 7 +++++
configure | 15 +++++++++-
tools/checkasm.c | 32 +++++++++++++++------
x264.h | 3 ++
6 files changed, 121 insertions(+), 27 deletions(-)
diff --git a/Makefile b/Makefile
index 563f185..725c919 100644
--- a/Makefile
+++ b/Makefile
@@ -55,6 +55,14 @@ SRCS += $(ALTIVECSRC)
$(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
endif
+# NEON optims
+ifeq ($(ARCH),ARM)
+ifneq ($(AS),)
+ASMSRC += common/arm/cpu-a.S
+OBJASM = $(ASMSRC:%.S=%.o)
+endif
+endif
+
# VIS optims
ifeq ($(ARCH),UltraSparc)
ASMSRC += common/sparc/pixel.asm
@@ -88,6 +96,10 @@ checkasm: tools/checkasm.o libx264.a
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
+
+%.o: %.S
+ $(AS) $(ASFLAGS) -o $@ $<
+
# delete local/anonymous symbols, so they don't show up in oprofile
-@ $(STRIP) -x $@
diff --git a/common/cpu.c b/common/cpu.c
index 1cb7080..8bfd21f 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -61,9 +61,30 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
+ {"ARMv6", X264_CPU_ARMV6},
+ {"NEON", X264_CPU_NEON},
+ {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"", 0},
};
+#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
+#include <signal.h>
+#include <setjmp.h>
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler( int sig )
+{
+ if( !canjump )
+ {
+ signal( sig, SIG_DFL );
+ raise( sig );
+ }
+
+ canjump = 0;
+ siglongjmp( jmpbuf, 1 );
+}
+#endif
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
@@ -224,22 +245,6 @@ uint32_t x264_cpu_detect( void )
}
#elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
- if( !canjump )
- {
- signal( sig, SIG_DFL );
- raise( sig );
- }
-
- canjump = 0;
- siglongjmp( jmpbuf, 1 );
-}
uint32_t x264_cpu_detect( void )
{
@@ -265,6 +270,48 @@ uint32_t x264_cpu_detect( void )
}
#endif
+#elif defined( ARCH_ARM )
+
+void x264_cpu_neon_test();
+int x264_cpu_fast_neon_mrc_test();
+
+uint32_t x264_cpu_detect( void )
+{
+ int flags = 0;
+#ifdef HAVE_ARMV6
+ flags |= X264_CPU_ARMV6;
+
+ // don't do this hack if compiled with -mfpu=neon
+#ifndef HAVE_NEON
+ static void (* oldsig)( int );
+ oldsig = signal( SIGILL, sigill_handler );
+ if( sigsetjmp( jmpbuf, 1 ) )
+ {
+ signal( SIGILL, oldsig );
+ return flags;
+ }
+
+ canjump = 1;
+ x264_cpu_neon_test();
+ canjump = 0;
+ signal( SIGILL, oldsig );
+#endif
+
+ flags |= X264_CPU_NEON;
+
+ // fast neon -> arm (Cortex-A9) detection relies on user access to the
+ // cycle counter; this assumes ARMv7 performance counters.
+ // NEON requires at least ARMv7, ARMv8 may require changes here, but
+ // hopefully this hacky detection method will have been replaced by then.
+ // Note that there is potential for a race condition if another program or
+ // x264 instance disables or reinits the counters while x264 is using them,
+ // which may result in incorrect detection and the counters stuck enabled.
+ flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+ // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#endif
+ return flags;
+}
+
#else
uint32_t x264_cpu_detect( void )
diff --git a/common/osdep.h b/common/osdep.h
index 57642dc..a691d06 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -163,6 +163,13 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
asm("bswap %0":"+r"(x));
return x;
}
+#elif defined(__GNUC__) && defined(HAVE_ARMV6)
+static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+{
+ asm("rev %0, %0":"+r"(x));
+ return x;
+}
+#define endian_fix32 endian_fix
#else
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
diff --git a/configure b/configure
index 2e7360a..a4af82a 100755
--- a/configure
+++ b/configure
@@ -10,7 +10,7 @@ echo " --help print this message"
echo " --disable-avis-input disables avisynth input (win32 only)"
echo " --disable-mp4-output disables mp4 output (using gpac)"
echo " --disable-pthread disables multithreaded encoding"
-echo " --disable-asm disables assembly optimizations on x86"
+echo " --disable-asm disables assembly optimizations on x86 and arm"
echo " --enable-debug adds -g, doesn't strip"
echo " --enable-gprof adds -pg, doesn't strip"
echo " --enable-visualize enables visualization (X11 only)"
@@ -157,7 +157,6 @@ CC="${CC-${cross_prefix}gcc}"
AR="${AR-${cross_prefix}ar}"
RANLIB="${RANLIB-${cross_prefix}ranlib}"
STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
if [ "x$host" = x ]; then
host=`./config.guess`
@@ -286,6 +285,7 @@ case $host_cpu in
;;
arm*)
ARCH="ARM"
+ AS="${AS-${cross_prefix}gcc}"
;;
s390|s390x)
ARCH="S390"
@@ -324,6 +324,17 @@ if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
fi
CFLAGS="$CFLAGS -DHAVE_MMX"
fi
+
+if [ $asm = yes -a $ARCH = ARM ] ; then
+ if cc_check '' '' 'asm("rev r0, r0");' ; then CFLAGS="$CFLAGS -DHAVE_ARMV6"
+ cc_check '' '' 'asm("movt r0, #0");' && CFLAGS="$CFLAGS -DHAVE_ARMV6T2"
+ cc_check '' '' 'asm("vadd.i16 q0, q0, q0");' && CFLAGS="$CFLAGS -DHAVE_NEON"
+ ASFLAGS="$ASFLAGS $CFLAGS -c"
+ else
+ asm="no"
+ fi
+fi
+
[ $asm = no ] && AS=""
[ "x$AS" = x ] && asm="no"
diff --git a/tools/checkasm.c b/tools/checkasm.c
index b574a42..58fc608 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -30,6 +30,12 @@
#include "common/common.h"
#include "common/cpu.h"
+// GCC doesn't align stack variables on ARM, so use .bss
+#ifdef ARCH_ARM
+#undef DECLARE_ALIGNED_16
+#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#endif
+
/* buf1, buf2: initialised to random data and shouldn't write into them */
uint8_t * buf1, * buf2;
/* buf3, buf4: used to store output */
@@ -76,17 +82,15 @@ static const char **intra_predict_8x8_names = intra_predict_4x4_names;
static inline uint32_t read_time(void)
{
+ uint32_t a = 0;
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
- uint32_t a;
asm volatile( "rdtsc" :"=a"(a) ::"edx" );
- return a;
#elif defined(ARCH_PPC)
- uint32_t a;
asm volatile( "mftb %0" : "=r" (a) );
- return a;
-#else
- return 0;
+#elif defined(ARCH_ARM) // ARMv7 only
+ asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
#endif
+ return a;
}
static bench_t* get_bench( const char *name, int cpu )
@@ -158,11 +162,14 @@ static void print_bench(void)
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
- b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
+ b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+ b->cpu&X264_CPU_NEON ? "neon" :
+ b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
- b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
+ b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+ b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -1580,6 +1587,13 @@ static int check_all_flags( void )
fprintf( stderr, "x264: ALTIVEC against C\n" );
ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
}
+#elif ARCH_ARM
+ if( x264_cpu_detect() & X264_CPU_ARMV6 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
+ if( x264_cpu_detect() & X264_CPU_NEON )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+ if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#endif
return ret;
}
@@ -1591,7 +1605,7 @@ int main(int argc, char *argv[])
if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
{
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
return 1;
#endif
diff --git a/x264.h b/x264.h
index 37a643c..7fa508d 100644
--- a/x264.h
+++ b/x264.h
@@ -63,6 +63,9 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6 0x020000
+#define X264_CPU_NEON 0x040000 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
/* Analyse flags
*/
More information about the x264-devel mailing list