[x264-devel] x86: Minor pixel_ssim_end4 improvements
Henrik Gramner
git at videolan.org
Tue Aug 26 18:23:14 CEST 2014
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Aug 5 01:42:51 2014 +0200| [d4317786b8428b00978459f6de3db219f0f6f8e6] | committer: Fiona Glaser
x86: Minor pixel_ssim_end4 improvements
Reduce the number of vector registers used from 7 to 5.
Eliminate some moves in the AVX implementation.
Avoid bypass delays when transitioning between the int and float domains.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d4317786b8428b00978459f6de3db219f0f6f8e6
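A minimal sketch of the two instruction-selection points above, in plain NASM with raw XMM registers rather than the x86inc register aliases (m0-m4) used in the diff. AVX's three-operand forms remove the register-register copy that two-operand SSE code needs (x86inc emulates the three-operand syntax on SSE by emitting that copy), and float-domain instructions (movaps/andps/movups) on float data avoid the bypass delay that their int-domain counterparts (movdqa/pand/movdqu) can incur:

    ; SSE (two-operand): the destination is also a source, so computing
    ; s1*s2 without clobbering s1 or s2 needs a copy first.
    movaps  xmm4, xmm1          ; copy s2
    mulps   xmm4, xmm0          ; xmm4 = s1*s2

    ; AVX (three-operand): the product goes straight to a new register.
    vmulps  xmm4, xmm0, xmm1    ; xmm4 = s1*s2, no copy

    ; Domain transitions: pand is an integer-domain op; using it on data
    ; produced and consumed by float ops can add crossing latency.
    pand    xmm4, xmm5          ; int-domain AND on float data (penalty)
    andps   xmm4, xmm5          ; float-domain AND, no domain crossing

The register reduction is also what lets the cglobal line drop its XMM-count argument: with only xmm0-xmm4 touched, no Win64 callee-saved XMM registers (xmm6 and up) have to be spilled.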
---
common/x86/pixel-a.asm | 60 +++++++++++++++++++++++++++++-------------------
1 file changed, 36 insertions(+), 24 deletions(-)
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index edadad3..262c537 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4689,13 +4689,13 @@ cglobal pixel_ssim_4x4x2_core, 4,4,8
;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 2,3,7
+cglobal pixel_ssim_end4, 2,3
mov r2d, r2m
- movdqa m0, [r0+ 0]
- movdqa m1, [r0+16]
- movdqa m2, [r0+32]
- movdqa m3, [r0+48]
- movdqa m4, [r0+64]
+ mova m0, [r0+ 0]
+ mova m1, [r0+16]
+ mova m2, [r0+32]
+ mova m3, [r0+48]
+ mova m4, [r0+64]
paddd m0, [r1+ 0]
paddd m1, [r1+16]
paddd m2, [r1+32]
@@ -4705,8 +4705,6 @@ cglobal pixel_ssim_end4, 2,3,7
paddd m1, m2
paddd m2, m3
paddd m3, m4
- movdqa m5, [ssim_c1]
- movdqa m6, [ssim_c2]
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
@@ -4715,20 +4713,21 @@ cglobal pixel_ssim_end4, 2,3,7
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
+ mulps m4, m0, m1 ; s1*s2
+ mulps m0, m0 ; s1*s1
+ mulps m1, m1 ; s2*s2
mulps m2, [pf_64] ; ss*64
mulps m3, [pf_128] ; s12*128
- movdqa m4, m1
- mulps m4, m0 ; s1*s2
- mulps m1, m1 ; s2*s2
- mulps m0, m0 ; s1*s1
addps m4, m4 ; s1*s2*2
addps m0, m1 ; s1*s1 + s2*s2
subps m2, m0 ; vars
subps m3, m4 ; covar*2
- addps m4, m5 ; s1*s2*2 + ssim_c1
- addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1
- addps m2, m6 ; vars + ssim_c2
- addps m3, m6 ; covar*2 + ssim_c2
+ movaps m1, [ssim_c1]
+ addps m4, m1 ; s1*s2*2 + ssim_c1
+ addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
+ movaps m1, [ssim_c2]
+ addps m2, m1 ; vars + ssim_c2
+ addps m3, m1 ; covar*2 + ssim_c2
%else
pmaddwd m4, m1, m0 ; s1*s2
pslld m1, 16
@@ -4739,10 +4738,12 @@ cglobal pixel_ssim_end4, 2,3,7
pslld m2, 6
psubd m3, m4 ; covar*2
psubd m2, m0 ; vars
- paddd m0, m5
- paddd m4, m5
- paddd m3, m6
- paddd m2, m6
+ mova m1, [ssim_c1]
+ paddd m0, m1
+ paddd m4, m1
+ mova m1, [ssim_c2]
+ paddd m3, m1
+ paddd m2, m1
cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
@@ -4755,20 +4756,31 @@ cglobal pixel_ssim_end4, 2,3,7
cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
+
%ifdef PIC
lea r3, [mask_ff + 16]
- movdqu m1, [r3 + r2*4]
+ %xdefine %%mask r3
+%else
+ %xdefine %%mask mask_ff + 16
+%endif
+%if cpuflag(avx)
+ andps m4, [%%mask + r2*4]
%else
- movdqu m1, [mask_ff + r2*4 + 16]
+ movups m0, [%%mask + r2*4]
+ andps m4, m0
%endif
- pand m4, m1
+
.skip:
movhlps m0, m4
addps m0, m4
+%if cpuflag(ssse3)
+ movshdup m4, m0
+%else
pshuflw m4, m0, q0032
+%endif
addss m0, m4
%if ARCH_X86_64 == 0
- movd r0m, m0
+ movss r0m, m0
fld dword r0m
%endif
RET
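Two details of the new tail, for reference. The width mask: when width < 4, neg r2 turns the width into a negative index into the mask_ff table (16 bytes of 0xff followed by 16 bytes of 0x00), so the 16-byte load at [mask_ff + 16 + r2*4] yields 0xffffffff in exactly the first width dword lanes and zero in the rest; ANDing it with m4 discards the unused SSIM terms. The final reduction is the standard horizontal sum of four packed floats, sketched below with raw registers; movshdup (SSE3) is used in the ssse3/avx variants because it is a float-domain shuffle, whereas pshuflw crosses into the integer domain:

    ; Horizontal sum of the four packed floats in xmm4, result in xmm0[0].
    movhlps  xmm0, xmm4    ; lanes 0-1 of xmm0 = lanes 2-3 of xmm4
    addps    xmm0, xmm4    ; lane 0 = t0+t2, lane 1 = t1+t3
    movshdup xmm1, xmm0    ; copy lane 1 to lane 0 (pshuflw pre-SSE3)
    addss    xmm0, xmm1    ; lane 0 = t0+t1+t2+t3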