[x265] [PATCH 2 of 2 Update] fix Main12 satd overflow bug up to SSE4 (fixes #180)

Min Chen chenm003 at 163.com
Thu Sep 17 00:58:24 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1442442321 18000
# Node ID 7ad1738840cb0153b59f952da160f8e9e08dc40a
# Parent  b585c3b6834dc3e98b78be5c750047a5e988926a
fix Main12 satd overflow bug up to SSE4 (fixes #180)
---
 source/common/x86/asm-primitives.cpp |    2 +-
 source/common/x86/pixel-a.asm        |  645 +++++++++-------------------------
 2 files changed, 170 insertions(+), 477 deletions(-)
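
Background on the overflow, for reviewers: at Main12 a per-pixel difference
spans [-4095, 4095], and a 4x4 Hadamard coefficient is a signed sum of 16 such
differences, so a single |coefficient| can already reach 16 * 4095 = 65520.
Accumulating several of those in packed 16-bit word lanes (paddw) can wrap,
which is why the hunks below widen the accumulators to dwords
(punpcklwd/punpckhwd + paddd) and finish with HADDD instead of HADDUW. The
snippet below is only an illustrative sketch of the failure mode, not code
from the x265 tree; the names and the two-coefficient worst case are made up
for the demonstration:

    // overflow_sketch.cpp -- illustrative only; not from the x265 tree.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Main12: pixels span [0, 4095], so a difference fits in [-4095, 4095].
        const uint32_t maxDiff = 4095;

        // A 4x4 Hadamard coefficient is a +/- sum of 16 differences,
        // so |coefficient| can reach 16 * 4095 = 65520.
        const uint32_t maxCoeff = 16 * maxDiff;

        // Summing two worst-case coefficients, as paddw does across
        // sub-blocks, wraps a 16-bit lane; a 32-bit lane (paddd) is safe.
        uint16_t lane16 = 0;   // models one word lane before HADDUW
        uint32_t lane32 = 0;   // models one dword lane before HADDD
        for (int block = 0; block < 2; block++) {
            lane16 = static_cast<uint16_t>(lane16 + maxCoeff); // wraps to 65504
            lane32 += maxCoeff;                                // 131040, correct
        }
        std::printf("16-bit lane: %u (wrapped)\n", unsigned(lane16));
        std::printf("32-bit lane: %u (correct)\n", unsigned(lane32));
        return 0;
    }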

diff -r b585c3b6834d -r 7ad1738840cb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 16 17:25:19 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 16 17:25:21 2015 -0500
@@ -1179,7 +1179,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
+        // p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
diff -r b585c3b6834d -r 7ad1738840cb source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Sep 16 17:25:19 2015 -0500
+++ b/source/common/x86/pixel-a.asm	Wed Sep 16 17:25:21 2015 -0500
@@ -242,6 +242,12 @@
 %endif
     HADAMARD4_2D 4, 5, 6, 7, 3, %%n
     paddw m4, m6
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
+;    pxor m5, m5
+;    punpcklwd m6, m4, m5
+;    punpckhwd m4, m5
+;    paddd m4, m6
+;%endif
     SWAP %%n, 4
 %endmacro
 
@@ -257,15 +263,45 @@
     HADAMARD 1, max, %2, %4, %6, %7
 %endif
 %ifnidn %9, swap
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%2, m%6
+    punpckhwd m%2, m%6
+    paddd m%8, m%7
+    paddd m%8, m%2
+  %else
     paddw m%8, m%2
+  %endif
 %else
     SWAP %8, %2
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%8, m%6
+    punpckhwd m%8, m%6
+    paddd m%8, m%7
+  %endif
 %endif
 %if %1
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%4, m%6
+    punpckhwd m%4, m%6
+    paddd m%8, m%7
+    paddd m%8, m%4
+  %else
     paddw m%8, m%4
+  %endif
 %else
     HADAMARD 1, max, %3, %5, %6, %7
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%3, m%6
+    punpckhwd m%3, m%6
+    paddd m%8, m%7
+    paddd m%8, m%3
+  %else
     paddw m%8, m%3
+  %endif
 %endif
 %endmacro
 
@@ -281,29 +317,23 @@
 %endif
 
     pxor m%10, m%10
-    mova m%9, m%2
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%2, m%10
     paddd m%8, m%9
-    mova m%9, m%2
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%2, m%10
     paddd m%8, m%9
 
 %if %1
     pxor m%10, m%10
-    mova m%9, m%4
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%4, m%10
     paddd m%8, m%9
-    mova m%9, m%4
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%4, m%10
     paddd m%8, m%9
 %else
     HADAMARD 1, max, %3, %5, %6, %7
     pxor m%10, m%10
-    mova m%9, m%3
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%3, m%10
     paddd m%8, m%9
-    mova m%9, m%3
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%3, m%10
     paddd m%8, m%9
 %endif
 %endmacro
@@ -326,146 +356,21 @@
     movd       eax, m0
     and        eax, 0xffff
 %endif ; HIGH_BIT_DEPTH
-    RET
-%endmacro
-
-; FIXME avoid the spilling of regs to hold 3*stride.
-; for small blocks on x86_32, modify pixel pointer instead.
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    SATD_4x4_MMX m2,  8, 0
-    paddw        m0, m1
-    SATD_4x4_MMX m1, 12, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-cglobal pixel_satd_8x8_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 1
-    paddw        m0, m2
-    paddw        m0, m1
-pixel_satd_8x4_internal_mmx2:
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_%1x%3_internal_mmx2
-    HADDUW m0, m1
-    movd  r6d, m0
-%rep %2/%3-1
-    pxor   m0, m0
-    lea    r0, [r0+4*r1]
-    lea    r2, [r2+4*r3]
-    call pixel_satd_%1x%3_internal_mmx2
-    movd   m2, r4
-    HADDUW m0, m1
-    movd   r4, m0
-    add    r6, r4
-    movd   r4, m2
-%endrep
-    movifnidn eax, r6d
-    RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4
-SATD_MxN_MMX 16,  8, 4
-SATD_MxN_MMX  8, 16, 8
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-cglobal pixel_satd_16x16, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-%rep 3
-    call pixel_satd_16x4_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-%endrep
-    call pixel_satd_16x4_internal_mmx2
-    HADDUW m0, m1
-    movd  eax, m0
-    RET
-
-cglobal pixel_satd_16x8, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_16x4_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_16x4_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_8x16, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x8_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_8x8_internal_mmx2
-    SATD_END_MMX
-%endif ; !HIGH_BIT_DEPTH
-
-cglobal pixel_satd_8x8, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x8_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_8x4, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x4_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_4x16, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 1
-    SATD_4x4_MMX m1, 0, 1
-    paddw  m0, m1
-    SATD_4x4_MMX m1, 0, 1
-    paddw  m0, m1
-    SATD_4x4_MMX m1, 0, 0
-    paddw  m0, m1
-    SATD_END_MMX
-
-cglobal pixel_satd_4x8, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 1
-    SATD_4x4_MMX m1, 0, 0
-    paddw  m0, m1
-    SATD_END_MMX
-
-cglobal pixel_satd_4x4, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 0
-%if HIGH_BIT_DEPTH
-    HADDUW      m0, m1
-    movd       eax, m0
-%else ; !HIGH_BIT_DEPTH
-    pshufw      m1, m0, q1032
-    paddw       m0, m1
-    pshufw      m1, m0, q2301
-    paddw       m0, m1
-    movd       eax, m0
-    and        eax, 0xffff
-%endif ; HIGH_BIT_DEPTH
     EMMS
     RET
+%endmacro
+
+; FIXME avoid the spilling of regs to hold 3*stride.
+; for small blocks on x86_32, modify pixel pointer instead.
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal pixel_satd_4x4, 4,6
+    SATD_START_MMX
+    SATD_4x4_MMX m0, 0, 0
+    SATD_END_MMX
 
 %macro SATD_START_SSE2 2-3 0
     FIX_STRIDES r1, r3
@@ -485,10 +390,14 @@
 
 %macro SATD_END_SSE2 1-2
 %if HIGH_BIT_DEPTH
+  %if BIT_DEPTH == 12
+    HADDD   %1, xm0
+  %else ; BIT_DEPTH == 12
     HADDUW  %1, xm0
-%if %0 == 2
+  %endif ; BIT_DEPTH == 12
+  %if %0 == 2
     paddd   %1, %2
-%endif
+  %endif
 %else
     HADDW   %1, xm7
 %endif
@@ -631,7 +540,11 @@
     mova m7, [hmul_4p]
 %endif
     SATD_4x8_SSE vertical, 0, swap
-    HADDW m7, m1
+%if BIT_DEPTH == 12
+    HADDD m7, m1
+%else
+    HADDUW m7, m1
+%endif
     movd eax, m7
     RET
 
@@ -644,7 +557,11 @@
     lea r0, [r0+r1*2*SIZEOF_PIXEL]
     lea r2, [r2+r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
+%if BIT_DEPTH == 12
+    HADDD m7, m1
+%else
     HADDUW m7, m1
+%endif
     movd eax, m7
     RET
 
@@ -690,12 +607,8 @@
     mova m7, [pw_00ff]
 %endif
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_16x8, 4,6,14
@@ -757,12 +670,8 @@
 %%pixel_satd_16x8_internal:
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x8, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -778,12 +687,8 @@
     lea r2, [r7 + 16]
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -803,11 +708,7 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
+    HADDD m10, m0
     movd    eax, m10
     RET
 
@@ -832,12 +733,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -865,12 +762,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -914,12 +807,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_48x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -981,12 +870,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1018,12 +903,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1072,12 +953,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
 
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x48, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1142,12 +1019,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
 
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1228,12 +1101,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
 
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 %else
@@ -1250,11 +1119,7 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
+    HADDD m6, m0
     movd   eax, m6
     RET
 %else
@@ -1271,12 +1136,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 %if WIN64
@@ -1314,12 +1175,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x48, 4,7,8,0-gprsize    ;if !WIN64
@@ -1359,12 +1216,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1401,12 +1254,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_24x64, 4,7,8,0-gprsize    ;if !WIN64
@@ -1443,12 +1292,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1465,12 +1310,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_8x64, 4,7,8,0-gprsize    ;if !WIN64
@@ -1485,12 +1326,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1515,12 +1352,8 @@
     mov [rsp], r2
     call pixel_satd_8x8_internal2
     call %%pixel_satd_8x4_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1565,12 +1398,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %else
 cglobal pixel_satd_12x32, 4,7,8,0-gprsize
@@ -1614,12 +1443,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %endif
 %else ;HIGH_BIT_DEPTH
@@ -1735,12 +1560,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %else
 cglobal pixel_satd_4x32, 4,7,8,0-gprsize
@@ -1827,12 +1648,8 @@
     lea r0, [r6 + 24*SIZEOF_PIXEL]
     lea r2, [r7 + 24*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x8, 4,7,8,0-gprsize    ;if !WIN64
@@ -1852,12 +1669,8 @@
     mov r2, [rsp]
     add r2, 24*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1880,12 +1693,8 @@
     lea r2, [r7 + 24*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x16, 4,7,8,0-gprsize   ;if !WIN64
@@ -1909,12 +1718,8 @@
     add r2, 24*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1941,12 +1746,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x24, 4,7,8,0-gprsize   ;if !WIN64
@@ -1974,12 +1775,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2010,12 +1807,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x32, 4,7,8,0-gprsize   ;if !WIN64
@@ -2047,12 +1840,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2099,12 +1888,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x64, 4,7,8,0-gprsize   ;if !WIN64
@@ -2152,12 +1937,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2224,12 +2005,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_48x64, 4,7,8,0-gprsize   ;if !WIN64
@@ -2299,12 +2076,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2344,12 +2117,8 @@
     lea r2, [r7 + 56*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x16, 4,7,8,0-gprsize   ;if !WIN64
@@ -2393,12 +2162,8 @@
     add r2,56*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2453,12 +2218,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x32, 4,7,8,0-gprsize   ;if !WIN64
@@ -2518,12 +2279,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2594,12 +2351,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m8, m8
-    movhlps m8, m6
-    paddd   m6, m8
-    pshufd  m8, m6, 1
-    paddd   m6, m8
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x48, 4,7,8,0-gprsize   ;if !WIN64
@@ -2675,12 +2428,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2767,12 +2516,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m8, m8
-    movhlps m8, m6
-    paddd   m6, m8
-    pshufd  m8, m6, 1
-    paddd   m6, m8
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x64, 4,7,8,0-gprsize   ;if !WIN64
@@ -2864,12 +2609,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2883,12 +2624,8 @@
     call %%pixel_satd_8x4_internal2
     RESTORE_AND_INC_POINTERS
     call %%pixel_satd_8x4_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2901,12 +2638,8 @@
     call pixel_satd_8x8_internal2
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2921,12 +2654,8 @@
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal2
     call %%pixel_satd_8x4_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2941,12 +2670,8 @@
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2965,12 +2690,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2997,12 +2718,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -3029,12 +2746,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %else
 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
@@ -3060,12 +2773,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %endif
 %else    ;HIGH_BIT_DEPTH
@@ -3149,12 +2858,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_24x32, 4,7,8,0-gprsize
@@ -3179,12 +2884,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif    ;WIN64
 
@@ -3201,12 +2902,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -3217,12 +2914,8 @@
     SATD_START_SSE2 m6, m7
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 cglobal pixel_satd_8x8, 4,6,8


