[x264-devel] checkasm: x86: More accurate ymm/zmm measurements

Henrik Gramner git at videolan.org
Mon May 22 00:02:55 CEST 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Fri May 12 00:03:10 2017 +0200| [1878c7f2af0a9c73e291488209109782c428cfcf] | committer: Henrik Gramner

checkasm: x86: More accurate ymm/zmm measurements

YMM and ZMM registers on x86 are turned off to save power when they haven't
been used for some period of time. When they are used there will be a
"warmup" period during which performance will be reduced and inconsistent
which is problematic when trying to benchmark individual functions.

Periodically issue "dummy" instructions that use those registers to
prevent them from being powered down. The end result is more consistent
benchmark results.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1878c7f2af0a9c73e291488209109782c428cfcf
---

 tools/checkasm-a.asm | 11 +++++++++++
 tools/checkasm.c     | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index d1da9b79..a9f74931 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -225,3 +225,14 @@ cglobal stack_pagealign, 2,2
     leave
     RET
 
+; Trigger a warmup of vector units
+%macro WARMUP 0
+cglobal checkasm_warmup, 0,0
+    xorps m0, m0
+    RET
+%endmacro
+
+INIT_YMM avx
+WARMUP
+INIT_ZMM avx512
+WARMUP
diff --git a/tools/checkasm.c b/tools/checkasm.c
index a2c2e492..75899dfe 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -222,8 +222,18 @@ static void print_bench(void)
         }
 }
 
+/* YMM and ZMM registers on x86 are turned off to save power when they haven't been
+ * used for some period of time. When they are used there will be a "warmup" period
+ * during which performance will be reduced and inconsistent which is problematic when
+ * trying to benchmark individual functions. We can work around this by periodically
+ * issuing "dummy" instructions that uses those registers to keep them powered on. */
+static void (*simd_warmup_func)( void ) = NULL;
+#define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 )
+
 #if ARCH_X86 || ARCH_X86_64
 int x264_stack_pagealign( int (*func)(), int align );
+void x264_checkasm_warmup_avx( void );
+void x264_checkasm_warmup_avx512( void );
 
 /* detect when callee-saved regs aren't saved
  * needs an explicit asm check because it only sometimes crashes in normal use. */
@@ -258,6 +268,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
 #define call_a1(func,...) ({ \
     uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
     x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+    simd_warmup(); \
     x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
 #elif ARCH_AARCH64 && !defined(__APPLE__)
 void x264_checkasm_stack_clobber( uint64_t clobber, ... );
@@ -285,6 +296,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
         call_a1(func, __VA_ARGS__);\
         for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
         {\
+            simd_warmup();\
             uint32_t t = read_time();\
             func(__VA_ARGS__);\
             func(__VA_ARGS__);\
@@ -2785,6 +2797,14 @@ static int check_all_flags( void )
     int ret = 0;
     int cpu0 = 0, cpu1 = 0;
     uint32_t cpu_detect = x264_cpu_detect();
+#if ARCH_X86 || ARCH_X86_64
+    if( cpu_detect & X264_CPU_AVX512 )
+        simd_warmup_func = x264_checkasm_warmup_avx512;
+    else if( cpu_detect & X264_CPU_AVX )
+        simd_warmup_func = x264_checkasm_warmup_avx;
+#endif
+    simd_warmup();
+
 #if HAVE_MMX
     if( cpu_detect & X264_CPU_MMX2 )
     {



More information about the x264-devel mailing list