[x264-devel] commit: GSOC merge part 1: Framework for ARM assembly optimizations ( Steven Walters )

Thu Aug 20 22:10:03 CEST 2009

x264 | branch: master | Steven Walters <kemuri9 at gmail.com> | Wed Aug 19 17:03:02 2009 -0700| [0a79eb354eded79ea2de89127c5fb645e919408f] | committer: Jason Garrett-Glaser 

GSOC merge part 1: Framework for ARM assembly optimizations
x264 will detect which ARM core it's building for and only build NEON asm if the target is ARMv6 or above, then enable NEON at runtime.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0a79eb354eded79ea2de89127c5fb645e919408f
---

 Makefile         |   12 ++++++++
 common/cpu.c     |   79 +++++++++++++++++++++++++++++++++++++++++++-----------
 common/osdep.h   |    7 +++++
 configure        |   15 +++++++++-
 tools/checkasm.c |   32 +++++++++++++++------
 x264.h           |    3 ++
 6 files changed, 121 insertions(+), 27 deletions(-)

diff --git a/Makefile b/Makefile
index 563f185..725c919 100644
--- a/Makefile
+++ b/Makefile
@@ -55,6 +55,14 @@ SRCS += $(ALTIVECSRC)
 $(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
 endif
 
+# NEON optims
+ifeq ($(ARCH),ARM)
+ifneq ($(AS),)
+ASMSRC += common/arm/cpu-a.S
+OBJASM  = $(ASMSRC:%.S=%.o)
+endif
+endif
+
 # VIS optims
 ifeq ($(ARCH),UltraSparc)
 ASMSRC += common/sparc/pixel.asm
@@ -88,6 +96,10 @@ checkasm: tools/checkasm.o libx264.a
 
 %.o: %.asm
 	$(AS) $(ASFLAGS) -o $@ $<
+
+%.o: %.S
+	$(AS) $(ASFLAGS) -o $@ $<
+
 # delete local/anonymous symbols, so they don't show up in oprofile
 	-@ $(STRIP) -x $@
 
diff --git a/common/cpu.c b/common/cpu.c
index 1cb7080..8bfd21f 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -61,9 +61,30 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSEMisalign", X264_CPU_SSE_MISALIGN},
     {"LZCNT", X264_CPU_LZCNT},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
+    {"ARMv6", X264_CPU_ARMV6},
+    {"NEON",  X264_CPU_NEON},
+    {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
     {"", 0},
 };
 
+#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
+#include <signal.h>
+#include <setjmp.h>
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler( int sig )
+{
+    if( !canjump )
+    {
+        signal( sig, SIG_DFL );
+        raise( sig );
+    }
+
+    canjump = 0;
+    siglongjmp( jmpbuf, 1 );
+}
+#endif
 
 #ifdef HAVE_MMX
 extern int  x264_cpu_cpuid_test( void );
@@ -224,22 +245,6 @@ uint32_t x264_cpu_detect( void )
 }
 
 #elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
-    if( !canjump )
-    {
-        signal( sig, SIG_DFL );
-        raise( sig );
-    }
-
-    canjump = 0;
-    siglongjmp( jmpbuf, 1 );
-}
 
 uint32_t x264_cpu_detect( void )
 {
@@ -265,6 +270,48 @@ uint32_t x264_cpu_detect( void )
 }
 #endif
 
+#elif defined( ARCH_ARM )
+
+void x264_cpu_neon_test();
+int x264_cpu_fast_neon_mrc_test();
+
+uint32_t x264_cpu_detect( void )
+{
+    int flags = 0;
+#ifdef HAVE_ARMV6
+    flags |= X264_CPU_ARMV6;
+
+    // don't do this hack if compiled with -mfpu=neon
+#ifndef HAVE_NEON
+    static void (* oldsig)( int );
+    oldsig = signal( SIGILL, sigill_handler );
+    if( sigsetjmp( jmpbuf, 1 ) )
+    {
+        signal( SIGILL, oldsig );
+        return flags;
+    }
+
+    canjump = 1;
+    x264_cpu_neon_test();
+    canjump = 0;
+    signal( SIGILL, oldsig );
+#endif
+
+    flags |= X264_CPU_NEON;
+
+    // fast neon -> arm (Cortex-A9) detection relies on user access to the
+    // cycle counter; this assumes ARMv7 performance counters.
+    // NEON requires at least ARMv7, ARMv8 may require changes here, but
+    // hopefully this hacky detection method will have been replaced by then.
+    // Note that there is potential for a race condition if another program or
+    // x264 instance disables or reinits the counters while x264 is using them,
+    // which may result in incorrect detection and the counters stuck enabled.
+    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#endif
+    return flags;
+}
+
 #else
 
 uint32_t x264_cpu_detect( void )
diff --git a/common/osdep.h b/common/osdep.h
index 57642dc..a691d06 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -163,6 +163,13 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
     asm("bswap %0":"+r"(x));
     return x;
 }
+#elif defined(__GNUC__) && defined(HAVE_ARMV6)
+static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+{
+    asm("rev %0, %0":"+r"(x));
+    return x;
+}
+#define endian_fix32 endian_fix
 #else
 static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
 {
diff --git a/configure b/configure
index 2e7360a..a4af82a 100755
--- a/configure
+++ b/configure
@@ -10,7 +10,7 @@ echo "  --help                   print this message"
 echo "  --disable-avis-input     disables avisynth input (win32 only)"
 echo "  --disable-mp4-output     disables mp4 output (using gpac)"
 echo "  --disable-pthread        disables multithreaded encoding"
-echo "  --disable-asm            disables assembly optimizations on x86"
+echo "  --disable-asm            disables assembly optimizations on x86 and arm"
 echo "  --enable-debug           adds -g, doesn't strip"
 echo "  --enable-gprof           adds -pg, doesn't strip"
 echo "  --enable-visualize       enables visualization (X11 only)"
@@ -157,7 +157,6 @@ CC="${CC-${cross_prefix}gcc}"
 AR="${AR-${cross_prefix}ar}"
 RANLIB="${RANLIB-${cross_prefix}ranlib}"
 STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
 
 if [ "x$host" = x ]; then
     host=`./config.guess`
@@ -286,6 +285,7 @@ case $host_cpu in
     ;;
   arm*)
     ARCH="ARM"
+    AS="${AS-${cross_prefix}gcc}"
     ;;
   s390|s390x)
     ARCH="S390"
@@ -324,6 +324,17 @@ if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
     fi
     CFLAGS="$CFLAGS -DHAVE_MMX"
 fi
+
+if [ $asm = yes -a $ARCH = ARM ] ; then
+    if  cc_check '' '' 'asm("rev r0, r0");' ; then      CFLAGS="$CFLAGS -DHAVE_ARMV6"
+        cc_check '' '' 'asm("movt r0, #0");'         && CFLAGS="$CFLAGS -DHAVE_ARMV6T2"
+        cc_check '' '' 'asm("vadd.i16 q0, q0, q0");' && CFLAGS="$CFLAGS -DHAVE_NEON"
+        ASFLAGS="$ASFLAGS $CFLAGS -c"
+    else
+        asm="no"
+    fi
+fi
+
 [ $asm = no ] && AS=""
 [ "x$AS" = x ] && asm="no"
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index b574a42..58fc608 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -30,6 +30,12 @@
 #include "common/common.h"
 #include "common/cpu.h"
 
+// GCC doesn't align stack variables on ARM, so use .bss
+#ifdef ARCH_ARM
+#undef DECLARE_ALIGNED_16
+#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#endif
+
 /* buf1, buf2: initialised to random data and shouldn't write into them */
 uint8_t * buf1, * buf2;
 /* buf3, buf4: used to store output */
@@ -76,17 +82,15 @@ static const char **intra_predict_8x8_names = intra_predict_4x4_names;
 
 static inline uint32_t read_time(void)
 {
+    uint32_t a = 0;
 #if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
-    uint32_t a;
     asm volatile( "rdtsc" :"=a"(a) ::"edx" );
-    return a;
 #elif defined(ARCH_PPC)
-    uint32_t a;
     asm volatile( "mftb %0" : "=r" (a) );
-    return a;
-#else
-    return 0;
+#elif defined(ARCH_ARM)     // ARMv7 only
+    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
 #endif
+    return a;
 }
 
 static bench_t* get_bench( const char *name, int cpu )
@@ -158,11 +162,14 @@ static void print_bench(void)
                     b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_MMX ? "mmx" :
-                    b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
+                    b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+                    b->cpu&X264_CPU_NEON ? "neon" :
+                    b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
-                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
+                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
                     ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -1580,6 +1587,13 @@ static int check_all_flags( void )
         fprintf( stderr, "x264: ALTIVEC against C\n" );
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
+#elif ARCH_ARM
+    if( x264_cpu_detect() & X264_CPU_ARMV6 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
+    if( x264_cpu_detect() & X264_CPU_NEON )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+    if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
 #endif
     return ret;
 }
@@ -1591,7 +1605,7 @@ int main(int argc, char *argv[])
 
     if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
     {
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
         fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
         return 1;
 #endif
diff --git a/x264.h b/x264.h
index 37a643c..7fa508d 100644
--- a/x264.h
+++ b/x264.h
@@ -63,6 +63,9 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 #define X264_CPU_SSE_MISALIGN   0x008000  /* Phenom support for misaligned SSE instruction arguments */
 #define X264_CPU_LZCNT          0x010000  /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6          0x020000
+#define X264_CPU_NEON           0x040000  /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 
 /* Analyse flags
  */