[x264-devel] AVX2/FMA3 version of mbtree_propagate
Jason Garrett-Glaser
git at videolan.org
Wed Jan 9 19:32:23 CET 2013
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Mon Nov 12 10:28:53 2012 -0800 | [b924133cabd125286488e16cfa71488ad4105d63] | committer: Jason Garrett-Glaser
AVX2/FMA3 version of mbtree_propagate
First AVX2 function for testing.
Bump yasm version to 1.2.0 for AVX2 support.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b924133cabd125286488e16cfa71488ad4105d63
---
 common/x86/mc-a2.asm  |   30 +++++++++++++++++++++++++++---
 common/x86/mc-c.c     |   11 +++++++++--
 common/x86/x86inc.asm |   42 ++++++++++++++++++++++++++++++++++++++++++
 configure             |    4 ++--
 tools/checkasm.c      |   11 ++++++-----
 5 files changed, 86 insertions(+), 12 deletions(-)
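
For context, the arithmetic that the AVX/AVX2 code below vectorizes is the per-macroblock propagate-cost update. The following plain-C sketch is reconstructed from the operations and comments visible in the asm loop (the pw_3fff mask, the pf_inv256 scale, the Newton-Raphson reciprocal); the name mbtree_propagate_cost_ref is made up for illustration, and the real scalar reference in x264 may differ in details such as the final rounding.

    #include <stdint.h>

    /* Illustrative scalar sketch only -- not the actual x264 C reference. */
    static void mbtree_propagate_cost_ref( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                           uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
    {
        float fps = *fps_factor / 256.f;                 /* pf_inv256 scale */
        for( int i = 0; i < len; i++ )
        {
            float intra     = intra_costs[i];
            float inter     = inter_costs[i] & 0x3fff;   /* pw_3fff mask */
            float propagate = propagate_in[i] + intra * inv_qscales[i] * fps;
            dst[i] = (int)( propagate * (intra - inter) / intra );  /* asm rounds via cvtps2dq */
        }
    }
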
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 5b936c7..19f1fb7 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1702,7 +1702,7 @@ cglobal mbtree_propagate_cost, 7,7,7
%if cpuflag(fma4)
cvtdq2ps xmm0, xmm0
cvtdq2ps xmm1, xmm1
- vfmaddps xmm0, xmm0, xmm6, xmm1
+ fmaddps xmm0, xmm0, xmm6, xmm1
cvtdq2ps xmm1, xmm2
psubd xmm2, xmm3
cvtdq2ps xmm2, xmm2
@@ -1710,7 +1710,7 @@ cglobal mbtree_propagate_cost, 7,7,7
mulps xmm1, xmm3
mulps xmm0, xmm2
addps xmm2, xmm3, xmm3
- vfnmaddps xmm3, xmm1, xmm3, xmm2
+ fnmaddps xmm3, xmm1, xmm3, xmm2
mulps xmm0, xmm3
%else
cvtdq2ps xmm0, xmm0
@@ -1742,14 +1742,18 @@ INIT_XMM fma4
MBTREE
%macro INT16_TO_FLOAT 1
+%if cpuflag(avx2)
+ vpmovzxwd ymm%1, xmm%1
+%else
vpunpckhwd xmm4, xmm%1, xmm7
vpunpcklwd xmm%1, xmm7
vinsertf128 ymm%1, ymm%1, xmm4, 1
+%endif
vcvtdq2ps ymm%1, ymm%1
%endmacro
; FIXME: align loads/stores to 16 bytes
-INIT_YMM avx
+%macro MBTREE_AVX 0
cglobal mbtree_propagate_cost, 7,7,8
add r6d, r6d
lea r0, [r0+r6*2]
@@ -1761,7 +1765,9 @@ cglobal mbtree_propagate_cost, 7,7,8
vmovdqa xmm5, [pw_3fff]
vbroadcastss ymm6, [r5]
vmulps ymm6, ymm6, [pf_inv256]
+%if notcpuflag(avx2)
vpxor xmm7, xmm7
+%endif
.loop:
vmovdqu xmm0, [r2+r6] ; intra
vmovdqu xmm1, [r4+r6] ; invq
@@ -1771,6 +1777,17 @@ cglobal mbtree_propagate_cost, 7,7,8
INT16_TO_FLOAT 1
INT16_TO_FLOAT 2
INT16_TO_FLOAT 3
+%if cpuflag(fma3)
+ vmulps ymm1, ymm1, ymm0
+ vsubps ymm4, ymm0, ymm3
+ fmaddps ymm1, ymm1, ymm6, ymm2
+ vrcpps ymm3, ymm0
+ vmulps ymm2, ymm0, ymm3
+ vmulps ymm1, ymm1, ymm4
+ vaddps ymm4, ymm3, ymm3
+ fnmaddps ymm4, ymm2, ymm3, ymm4
+ vmulps ymm1, ymm1, ymm4
+%else
vmulps ymm1, ymm1, ymm0
vsubps ymm4, ymm0, ymm3
vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
@@ -1782,8 +1799,15 @@ cglobal mbtree_propagate_cost, 7,7,8
vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
vmulps ymm1, ymm1, ymm3 ; / intra
+%endif
vcvtps2dq ymm1, ymm1
vmovdqu [r0+r6*2], ymm1
add r6, 16
jl .loop
RET
+%endmacro
+
+INIT_YMM avx
+MBTREE_AVX
+INIT_YMM avx2,fma3
+MBTREE_AVX
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index dbb118b..f6d2db0 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -139,6 +139,8 @@ void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
@@ -754,7 +756,12 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
- if( !(cpu&X264_CPU_FMA4) )
+ if( cpu&X264_CPU_FMA4 )
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+
+ if( !(cpu&X264_CPU_AVX2) )
return;
- pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+
+ if( cpu&X264_CPU_FMA3 )
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
}
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 05d8130..2d99333 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1361,3 +1361,45 @@ FMA_INSTR pmadcswd, pmaddwd, paddd
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
+
+; convert FMA4 to FMA3 if possible
+%macro FMA4_INSTR 4
+ %macro %1 4-8 %1, %2, %3, %4
+ %if cpuflag(fma4)
+ v%5 %1, %2, %3, %4
+ %elifidn %1, %2
+ v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
+ %elifidn %1, %3
+ v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
+ %elifidn %1, %4
+ v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
+ %else
+ %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+
+FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
+FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
+FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
+FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+
+FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
+FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
+FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
+FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
+
+FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
+FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
+FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
+FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
+
+FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
+FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
+FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
+FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
+
+FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
+FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
+FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
+FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
diff --git a/configure b/configure
index 94285b2..cb8f669 100755
--- a/configure
+++ b/configure
@@ -687,10 +687,10 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
+ if ! as_check "vpmovzxwd ymm0, xmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
- echo "Minimum version is yasm-1.0.0"
+ echo "Minimum version is yasm-1.2.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 01e0dd3..2782c2a 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -164,6 +164,7 @@ static void print_bench(void)
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
@@ -2444,11 +2445,6 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
cpu1 &= ~X264_CPU_FMA4;
}
- if( x264_cpu_detect() & X264_CPU_FMA3 )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
- cpu1 &= ~X264_CPU_FMA3;
- }
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
@@ -2466,6 +2462,11 @@ static int check_all_flags( void )
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( x264_cpu_detect() & X264_CPU_FMA3 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+ cpu1 &= ~X264_CPU_FMA3;
+ }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
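
A note on the FMA4_INSTR wrapper added to x86inc.asm above: under cpuflag(fma4) the 4-operand instruction is emitted unchanged, while on FMA3 the destination has to alias one of the three sources, so the wrapper picks the matching 132/213/231 form. For the two uses in the new mbtree loop this works out (illustratively) to fmaddps ymm1, ymm1, ymm6, ymm2 expanding to vfmadd132ps ymm1, ymm2, ymm6 (ymm1 = ymm1*ymm6 + ymm2), and fnmaddps ymm4, ymm2, ymm3, ymm4 expanding to vfnmadd231ps ymm4, ymm2, ymm3 (ymm4 = ymm4 - ymm2*ymm3).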