[x264-devel] x86: Faster pixel_ssim_4x4x2_core

Henrik Gramner git at videolan.org
Tue Sep 20 20:57:52 CEST 2016


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Sep 17 21:41:52 2016 +0200| [72d53ab2ac7af24597a824e868f2ef363a22f5d4] | committer: Anton Mitrofanov

x86: Faster pixel_ssim_4x4x2_core

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=72d53ab2ac7af24597a824e868f2ef363a22f5d4
---

 common/x86/pixel-a.asm | 95 +++++++++++++++++++++++++++++---------------------
 1 file changed, 55 insertions(+), 40 deletions(-)

diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index cff22ee..d55a41b 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4595,67 +4595,82 @@ cglobal intra_sad_x9_8x8, 5,7,8
 ;-----------------------------------------------------------------------------
 %macro SSIM_ITER 1
 %if HIGH_BIT_DEPTH
-    movdqu    m5, [r0+(%1&1)*r1]
-    movdqu    m6, [r2+(%1&1)*r3]
+    movu      m4, [r0+(%1&1)*r1]
+    movu      m5, [r2+(%1&1)*r3]
+%elif cpuflag(avx)
+    pmovzxbw  m4, [r0+(%1&1)*r1]
+    pmovzxbw  m5, [r2+(%1&1)*r3]
 %else
-    movq      m5, [r0+(%1&1)*r1]
-    movq      m6, [r2+(%1&1)*r3]
-    punpcklbw m5, m0
-    punpcklbw m6, m0
+    movq      m4, [r0+(%1&1)*r1]
+    movq      m5, [r2+(%1&1)*r3]
+    punpcklbw m4, m7
+    punpcklbw m5, m7
 %endif
 %if %1==1
     lea       r0, [r0+r1*2]
     lea       r2, [r2+r3*2]
 %endif
-%if %1==0
-    movdqa    m1, m5
-    movdqa    m2, m6
+%if %1 == 0 && cpuflag(avx)
+    SWAP       0, 4
+    SWAP       1, 5
+    pmaddwd   m4, m0, m0
+    pmaddwd   m5, m1, m1
+    pmaddwd   m6, m0, m1
 %else
+%if %1 == 0
+    mova      m0, m4
+    mova      m1, m5
+%else
+    paddw     m0, m4
     paddw     m1, m5
-    paddw     m2, m6
 %endif
-    pmaddwd   m7, m5, m6
+    pmaddwd   m6, m4, m5
+    pmaddwd   m4, m4
     pmaddwd   m5, m5
-    pmaddwd   m6, m6
-    ACCUM  paddd, 3, 5, %1
-    ACCUM  paddd, 4, 7, %1
-    paddd     m3, m6
+%endif
+    ACCUM  paddd, 2, 4, %1
+    ACCUM  paddd, 3, 6, %1
+    paddd     m2, m5
 %endmacro
 
 %macro SSIM 0
-cglobal pixel_ssim_4x4x2_core, 4,4,8
+%if HIGH_BIT_DEPTH
+cglobal pixel_ssim_4x4x2_core, 4,4,7
     FIX_STRIDES r1, r3
-    pxor      m0, m0
+%else
+cglobal pixel_ssim_4x4x2_core, 4,4,7+notcpuflag(avx)
+%if notcpuflag(avx)
+    pxor      m7, m7
+%endif
+%endif
     SSIM_ITER 0
     SSIM_ITER 1
     SSIM_ITER 2
     SSIM_ITER 3
-    ; PHADDW m1, m2
-    ; PHADDD m3, m4
-    movdqa    m7, [pw_1]
-    pshufd    m5, m3, q2301
-    pmaddwd   m1, m7
-    pmaddwd   m2, m7
-    pshufd    m6, m4, q2301
-    packssdw  m1, m2
-    paddd     m3, m5
-    pshufd    m1, m1, q3120
-    paddd     m4, m6
-    pmaddwd   m1, m7
-    punpckhdq m5, m3, m4
-    punpckldq m3, m4
-
 %if UNIX64
-    %define t0 r4
+    DECLARE_REG_TMP 4
 %else
-    %define t0 rax
-    mov t0, r4mp
+    DECLARE_REG_TMP 0
+    mov       t0, r4mp
 %endif
-
-    movq      [t0+ 0], m1
-    movq      [t0+ 8], m3
-    movhps    [t0+16], m1
-    movq      [t0+24], m5
+%if cpuflag(ssse3)
+    phaddw    m0, m1
+    pmaddwd   m0, [pw_1]
+    phaddd    m2, m3
+%else
+    mova      m4, [pw_1]
+    pmaddwd   m0, m4
+    pmaddwd   m1, m4
+    packssdw  m0, m1
+    shufps    m1, m2, m3, q2020
+    shufps    m2, m3, q3131
+    pmaddwd   m0, m4
+    paddd     m2, m1
+%endif
+    shufps    m1, m0, m2, q2020
+    shufps    m0, m2, q3131
+    mova    [t0], m1
+    mova [t0+16], m0
     RET
 
 ;-----------------------------------------------------------------------------



More information about the x264-devel mailing list