[x264-devel] [PATCH] Add all remaining 16x16 predict Altivec routines

Tue Jan 13 22:06:39 CET 2009

Hello,

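I got the following numbers from checkasm by calling the
mach_absolute_time() function on Mac OS X instead of rdtsc. Note that
it counts in Mach timebase ticks, which are not necessarily nanoseconds
(they would have to be scaled with mach_timebase_info()), so the figures
below are only meaningful relative to each other.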
I don't know how accurate they are: it seems you can't access the PPC
performance counters on Darwin without a driver.

intra_predict_16x16_dc_c: 25
intra_predict_16x16_dc_altivec: 16
intra_predict_16x16_dc8_c: 17
intra_predict_16x16_dc8_altivec: 9
intra_predict_16x16_dcl_c: 23
intra_predict_16x16_dcl_altivec: 13
intra_predict_16x16_dct_c: 23
intra_predict_16x16_dct_altivec: 13
intra_predict_16x16_h_c: 17
intra_predict_16x16_h_altivec: 54
intra_predict_16x16_p_c: 290
intra_predict_16x16_p_altivec: 26
intra_predict_16x16_v_c: 17
intra_predict_16x16_v_altivec: 11

With the exception of intra_predict_16x16_h, all new functions seem to  
be faster than their C equivalents.

This was on a PPC970 (quad G5). For reference, here is the checkasm
patch I used:

diff --git a/tools/checkasm.c b/tools/checkasm.c
index aeaf5fb..7825b97 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -30,6 +30,10 @@
  #include "common/common.h"
  #include "common/cpu.h"

+#ifdef SYS_MACOSX
+#include <mach/mach_time.h>
+#endif
+
  /* buf1, buf2: initialised to random data and shouldn't write into  
them */
  uint8_t * buf1, * buf2;
  /* buf3, buf4: used to store output */
@@ -80,6 +84,8 @@ static inline uint32_t read_time(void)
      uint32_t a;
      asm volatile( "rdtsc" :"=a"(a) ::"edx" );
      return a;
+#elif defined(SYS_MACOSX)
+   return mach_absolute_time() & 0xFFFFFFFF;
  #else
      return 0;
  #endif
@@ -153,7 +159,8 @@ static void print_bench(void)
                      /* print sse2slow only if there's also a  
sse2fast version of the same func */
                      b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS &&  
b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ?  
"sse2slow" :
                      b->cpu&X264_CPU_SSE2 ? "sse2" :
-                    b->cpu&X264_CPU_MMX ? "mmx" : "c",
+                    b->cpu&X264_CPU_MMX ? "mmx" :
+                    b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
                      b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                      b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                      b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
@@ -1448,7 +1455,7 @@ int main(int argc, char *argv[])

      if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
      {
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(SYS_MACOSX)
          fprintf( stderr, "no --bench for your cpu until you port  
rdtsc\n" );
          return 1;
  #endif

>>> Thoughts? Benchmark figures?