[x264-devel] commit: Faster mbtree_propagate asm (Holger Lubitz)
git at videolan.org
git at videolan.org
Fri Jun 25 09:58:09 CEST 2010
x264 | branch: stable | Holger Lubitz <holger at lubitz.org> | Wed Jun 9 13:59:06 2010 +0200| [20cbe1046c7828bb213b10268aa711abfdafe257] | committer: Jason Garrett-Glaser
Faster mbtree_propagate asm
Replace fp division by multiply with the reciprocal.
Only ~12% faster on penryn, but over 80% faster on amd k8.
Also make checkasm slightly more tolerant to rounding error.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=20cbe1046c7828bb213b10268aa711abfdafe257
---
common/x86/mc-a2.asm | 17 +++++++++++------
tools/checkasm.c | 2 +-
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index aee3f0a..368497b 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1120,28 +1120,33 @@ cglobal mbtree_propagate_cost_sse2, 6,6,7
add r4, r5
neg r5
pxor xmm5, xmm5
- movdqa xmm4, [pd_128]
movdqa xmm6, [pw_3fff]
+ movdqa xmm4, [pd_128]
.loop:
movq xmm2, [r2+r5] ; intra
movq xmm0, [r4+r5] ; invq
+ movq xmm3, [r3+r5] ; inter
+ movq xmm1, [r1+r5] ; prop
punpcklwd xmm2, xmm5
punpcklwd xmm0, xmm5
pmaddwd xmm0, xmm2
- paddd xmm0, xmm4
- psrld xmm0, 8 ; intra*invq>>8
- movq xmm3, [r3+r5] ; inter
- movq xmm1, [r1+r5] ; prop
pand xmm3, xmm6
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
+ paddd xmm0, xmm4
+ psrld xmm0, 8 ; intra*invq>>8
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
cvtdq2ps xmm1, xmm2 ; intra
psubd xmm2, xmm3 ; intra - inter
+ rcpps xmm3, xmm1 ; 1 / intra 1st approximation
cvtdq2ps xmm0, xmm0
+ mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
cvtdq2ps xmm2, xmm2
+ mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
- divps xmm0, xmm1 ; / intra
+ addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
+ subps xmm3, xmm1 ; 2nd approximation for 1/intra
+ mulps xmm0, xmm3 ; / intra
cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
movdqa [r0+r5*2], xmm0
add r5, 8
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 3f5dbcc..7fa2c0c 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1073,7 +1073,7 @@ static int check_mc( int cpu_ref, int cpu_new )
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
x264_emms();
for( int i = 0; i < 400; i++ )
- ok &= abs( dstc[i]-dsta[i] ) <= (abs( dstc[i])>512 ) || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
+ ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
report( "mbtree propagate :" );
}
More information about the x264-devel mailing list