[x264-devel] x86: Use one less register in mbtree_propagate_cost_avx2
Henrik Gramner
git at videolan.org
Tue Apr 12 20:36:18 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Mar 15 20:16:45 2016 +0100| [1507cfe80ecf5f8e240a35e9e9dc5a92bd25e792] | committer: Henrik Gramner
x86: Use one less register in mbtree_propagate_cost_avx2
Avoids the need to save and restore xmm6 on 64-bit Windows, where xmm6-xmm15 are callee-saved: the AVX2 version now uses only xmm0-xmm5.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1507cfe80ecf5f8e240a35e9e9dc5a92bd25e792
---
common/x86/mc-a2.asm | 50 +++++++++++++++++++++++++-------------------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index c58aba4..f5c3418 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -2094,15 +2094,15 @@ INIT_XMM fma4
MBTREE
%macro INT16_UNPACK 1
- punpckhwd xm4, xm%1, xm7
+ punpckhwd xm6, xm%1, xm7
punpcklwd xm%1, xm7
- vinsertf128 m%1, m%1, xm4, 1
+ vinsertf128 m%1, m%1, xm6, 1
%endmacro
; FIXME: align loads to 16 bytes
%macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
- vbroadcastss m6, [r5]
+cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
+ vbroadcastss m5, [r5]
mov r5d, r6m
lea r0, [r0+r5*2]
add r5d, r5d
@@ -2111,7 +2111,7 @@ cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
add r3, r5
add r4, r5
neg r5
- mova xm5, [pw_3fff]
+ mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
%endif
@@ -2120,27 +2120,27 @@ cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
pmovzxwd m0, [r2+r5] ; intra
pmovzxwd m1, [r4+r5] ; invq
pmovzxwd m2, [r1+r5] ; prop
- pand xm3, xm5, [r3+r5] ; inter
+ pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
pminsd m3, m0
pmaddwd m1, m0
- psubd m4, m0, m3
+ psubd m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
- cvtdq2ps m4, m4
- fmaddps m1, m1, m6, m2
- rcpps m3, m0
- mulps m2, m0, m3
- mulps m1, m4
- addps m4, m3, m3
- fnmaddps m4, m2, m3, m4
- mulps m1, m4
+ cvtdq2ps m3, m3
+ fmaddps m1, m1, m5, m2
+ rcpps m2, m0
+ mulps m0, m2
+ mulps m1, m3
+ addps m3, m2, m2
+ fnmaddps m2, m2, m0, m3
+ mulps m1, m2
%else
movu xm0, [r2+r5]
movu xm1, [r4+r5]
movu xm2, [r1+r5]
- pand xm3, xm5, [r3+r5]
+ pand xm3, xm4, [r3+r5]
pminsw xm3, xm0
INT16_UNPACK 0
INT16_UNPACK 1
@@ -2151,16 +2151,16 @@ cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
- subps m4, m0, m3
- mulps m1, m6 ; intra*invq*fps_factor>>8
+ subps m3, m0, m3
+ mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
- rcpps m3, m0 ; 1 / intra 1st approximation
- mulps m2, m0, m3 ; intra * (1/intra 1st approx)
- mulps m2, m3 ; intra * (1/intra 1st approx)^2
- mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
- addps m3, m3 ; 2 * (1/intra 1st approx)
- subps m3, m2 ; 2nd approximation for 1/intra
- mulps m1, m3 ; / intra
+ rcpps m2, m0 ; 1 / intra 1st approximation
+ mulps m0, m2 ; intra * (1/intra 1st approx)
+ mulps m0, m2 ; intra * (1/intra 1st approx)^2
+ mulps m1, m3 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ addps m2, m2 ; 2 * (1/intra 1st approx)
+ subps m2, m0 ; 2nd approximation for 1/intra
+ mulps m1, m2 ; / intra
%endif
vcvtps2dq m1, m1
vextractf128 xm2, m1, 1
More information about the x264-devel mailing list