[x264-devel] commit: Faster mbtree_propagate asm (Holger Lubitz)

Tue Jun 15 11:26:54 CEST 2010

x264 | branch: master | Holger Lubitz <holger at lubitz.org> | Wed Jun  9 13:59:06 2010 +0200| [20cbe1046c7828bb213b10268aa711abfdafe257] | committer: Jason Garrett-Glaser 

Faster mbtree_propagate asm
Replace the floating-point division with a multiplication by the reciprocal
(an rcpps estimate refined by one Newton-Raphson step).
Only ~12% faster on Penryn, but over 80% faster on AMD K8.
Also make checkasm slightly more tolerant of rounding error.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=20cbe1046c7828bb213b10268aa711abfdafe257
---

 common/x86/mc-a2.asm |   17 +++++++++++------
 tools/checkasm.c     |    2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index aee3f0a..368497b 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1120,28 +1120,33 @@ cglobal mbtree_propagate_cost_sse2, 6,6,7
     add r4, r5
     neg r5
     pxor      xmm5, xmm5
-    movdqa    xmm4, [pd_128]
     movdqa    xmm6, [pw_3fff]
+    movdqa    xmm4, [pd_128]
 .loop:
     movq      xmm2, [r2+r5] ; intra
     movq      xmm0, [r4+r5] ; invq
+    movq      xmm3, [r3+r5] ; inter
+    movq      xmm1, [r1+r5] ; prop
     punpcklwd xmm2, xmm5
     punpcklwd xmm0, xmm5
     pmaddwd   xmm0, xmm2
-    paddd     xmm0, xmm4
-    psrld     xmm0, 8       ; intra*invq>>8
-    movq      xmm3, [r3+r5] ; inter
-    movq      xmm1, [r1+r5] ; prop
     pand      xmm3, xmm6
     punpcklwd xmm1, xmm5
     punpcklwd xmm3, xmm5
+    paddd     xmm0, xmm4
+    psrld     xmm0, 8       ; intra*invq>>8
     paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
     cvtdq2ps  xmm1, xmm2    ; intra
     psubd     xmm2, xmm3    ; intra - inter
+    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
     cvtdq2ps  xmm0, xmm0
+    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
     cvtdq2ps  xmm2, xmm2
+    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
     mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
-    divps     xmm0, xmm1    ; / intra
+    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
+    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
+    mulps     xmm0, xmm3    ; / intra
     cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
     movdqa [r0+r5*2], xmm0
     add r5, 8
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 3f5dbcc..7fa2c0c 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1073,7 +1073,7 @@ static int check_mc( int cpu_ref, int cpu_new )
         // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
         x264_emms();
         for( int i = 0; i < 400; i++ )
-            ok &= abs( dstc[i]-dsta[i] ) <= (abs( dstc[i])>512 ) || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
+            ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
         report( "mbtree propagate :" );
     }