[x265] [PATCH 2 of 2] fix Main12 satd overflow bug up to SSE4 (fixes #180)

Min Chen chenm003 at 163.com
Wed Sep 16 20:14:30 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1442423261 18000
# Node ID 61769ca2358186b36c2498b3eaa8aeded95af6fd
# Parent  e35d3260b5138abf09bf79808bf55104606c4626
fix Main12 satd overflow bug up to SSE4 (fixes #180)
---
 source/common/x86/asm-primitives.cpp |  106 ++++++-
 source/common/x86/pixel-a.asm        |  635 +++++++++-------------------------
 2 files changed, 264 insertions(+), 477 deletions(-)
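
The overflow comes from the width of the intermediate accumulators: the SSE2/SSSE3 satd
kernels keep per-lane 16-bit word sums (paddw) and fold them with HADDUW, which is
sufficient at 10-bit depth but not for Main12. A rough worst-case bound, as an
illustrative sketch (not part of the patch), assuming residuals of +/-(2^BIT_DEPTH - 1):

    #include <stdio.h>

    /* Illustration only: worst-case magnitude of a single 4x4 Hadamard
     * coefficient for a given bit depth.  Each 2-D Hadamard output is a
     * +/-1 combination of all 16 residuals of the 4x4 block. */
    static unsigned max_coeff(int depth)
    {
        unsigned max_residual = (1u << depth) - 1;  /* 255, 1023 or 4095 */
        return 16 * max_residual;                   /* 16 inputs per coefficient */
    }

    int main(void)
    {
        printf("8-bit : %u\n", max_coeff(8));   /*  4080 */
        printf("10-bit: %u\n", max_coeff(10));  /* 16368 */
        printf("12-bit: %u\n", max_coeff(12));  /* 65520: nearly saturates a 16-bit lane */
        return 0;
    }

At 12 bits a single coefficient can already reach 65520, so any further word accumulation
wraps around. The patch therefore widens the affected sums to dwords (punpcklwd/punpckhwd
+ paddd, and HADDD instead of HADDUW) and keeps a C reference satd in asm-primitives.cpp
for cross-checking the assembly.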

diff -r e35d3260b513 -r 61769ca23581 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 16 12:07:40 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 16 12:07:41 2015 -0500
@@ -871,6 +871,84 @@
 
 #if HIGH_BIT_DEPTH
 
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
+        sum2_t t0 = s0 + s1; \
+        sum2_t t1 = s0 - s1; \
+        sum2_t t2 = s2 + s3; \
+        sum2_t t3 = s2 - s3; \
+        d0 = t0 + t2; \
+        d2 = t0 - t2; \
+        d1 = t1 + t3; \
+        d3 = t1 - t3; \
+}
+
+// in: a pseudo-simd number of the form x + (y << BITS_PER_SUM)
+// return: abs(x) + (abs(y) << BITS_PER_SUM)
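+// Illustrative example (not in the original patch), assuming the HIGH_BIT_DEPTH
+// typedefs (32-bit sum_t, 64-bit sum2_t): x = 5, y = -3 pack to
+// a = 0xFFFFFFFD00000005, and abs2(a) = 0x0000000300000005, i.e. abs(5) + (abs(-3) << 32).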
+inline sum2_t abs2(sum2_t a)
+{
+    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
+
+    return (a + s) ^ s;
+}
+
+static int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+    sum2_t tmp[4][2];
+    sum2_t a0, a1, a2, a3, b0, b1;
+    sum2_t sum = 0;
+
+    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
+    {
+        a0 = pix1[0] - pix2[0];
+        a1 = pix1[1] - pix2[1];
+        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
+        a2 = pix1[2] - pix2[2];
+        a3 = pix1[3] - pix2[3];
+        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+        tmp[i][0] = b0 + b1;
+        tmp[i][1] = b0 - b1;
+    }
+
+    for (int i = 0; i < 2; i++)
+    {
+        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
+    }
+
+    return (int)(sum >> 1);
+}
+
+template<int w, int h>
+// calculate satd in blocks of 4x4
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+    int satd = 0;
+
+    for (int row = 0; row < h; row += 4)
+        for (int col = 0; col < w; col += 4)
+            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
+                             pix2 + row * stride_pix2 + col, stride_pix2);
+
+    return satd;
+}
+
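+// Debug helper: cross-checks the C reference 8x8 satd above against the SSE2 assembly.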
+int my_satd(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+    int v0 = satd4<8, 8>(pix1, stride_pix1, pix2, stride_pix2);
+    int v1 = PFX(pixel_satd_8x8_sse2)(pix1, stride_pix1, pix2, stride_pix2);
+
+    if (v0 != v1)
+    {
+        v1 = PFX(pixel_satd_8x8_sse2)(pix1, stride_pix1, pix2, stride_pix2);
+        printf("X\n");
+    }
+
+    return v0;
+}
+
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
 {
 #if !defined(X86_64)
@@ -915,6 +993,32 @@
         HEVC_SAD_X4(sse2);
 
         p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_mmx2);
+
+//     p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_sse2);
+//     p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_sse2);
+//     p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sse2);
+//     p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_sse2);
+//     p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_sse2);
+//     p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_sse2);
+//     p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_sse2);
+//     p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_sse2);
+//     p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_sse2);
+//     p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sse2);
+//     p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_sse2);
+//     p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_sse2);
+//     p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_sse2);
+//     p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_sse2);
+//     p.pu[LUMA_16x4].satd  = PFX(pixel_satd_16x4_sse2);
+//     p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_sse2);
+//     p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_sse2);
+//     p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_sse2);
+//     p.pu[LUMA_32x8].satd  = PFX(pixel_satd_32x8_sse2);
+//     p.pu[LUMA_8x32].satd  = PFX(pixel_satd_8x32_sse2);
+//     p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sse2);
+//     p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_sse2);
+//     p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_sse2);
+//     p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_sse2);
+
         ALL_LUMA_PU(satd, pixel_satd, sse2);
 
 #if X265_DEPTH <= 10
@@ -1179,7 +1283,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
+        // p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
diff -r e35d3260b513 -r 61769ca23581 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Sep 16 12:07:40 2015 -0500
+++ b/source/common/x86/pixel-a.asm	Wed Sep 16 12:07:41 2015 -0500
@@ -242,6 +242,12 @@
 %endif
     HADAMARD4_2D 4, 5, 6, 7, 3, %%n
     paddw m4, m6
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
+;    pxor m5, m5
+;    punpcklwd m6, m4, m5
+;    punpckhwd m4, m5
+;    paddd m4, m6
+;%endif
     SWAP %%n, 4
 %endmacro
 
@@ -257,15 +263,39 @@
     HADAMARD 1, max, %2, %4, %6, %7
 %endif
 %ifnidn %9, swap
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%2, m%6
+    punpckhwd m%2, m%6
+    paddd m%8, m%7
+    paddd m%8, m%2
+  %else
     paddw m%8, m%2
+  %endif
 %else
     SWAP %8, %2
 %endif
 %if %1
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%4, m%6
+    punpckhwd m%4, m%6
+    paddd m%8, m%7
+    paddd m%8, m%4
+  %else
     paddw m%8, m%4
+  %endif
 %else
     HADAMARD 1, max, %3, %5, %6, %7
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%3, m%6
+    punpckhwd m%3, m%6
+    paddd m%8, m%7
+    paddd m%8, m%3
+  %else
     paddw m%8, m%3
+  %endif
 %endif
 %endmacro
 
@@ -281,29 +311,23 @@
 %endif
 
     pxor m%10, m%10
-    mova m%9, m%2
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%2, m%10
     paddd m%8, m%9
-    mova m%9, m%2
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%2, m%10
     paddd m%8, m%9
 
 %if %1
     pxor m%10, m%10
-    mova m%9, m%4
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%4, m%10
     paddd m%8, m%9
-    mova m%9, m%4
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%4, m%10
     paddd m%8, m%9
 %else
     HADAMARD 1, max, %3, %5, %6, %7
     pxor m%10, m%10
-    mova m%9, m%3
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%3, m%10
     paddd m%8, m%9
-    mova m%9, m%3
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%3, m%10
     paddd m%8, m%9
 %endif
 %endmacro
@@ -326,146 +350,21 @@
     movd       eax, m0
     and        eax, 0xffff
 %endif ; HIGH_BIT_DEPTH
-    RET
-%endmacro
-
-; FIXME avoid the spilling of regs to hold 3*stride.
-; for small blocks on x86_32, modify pixel pointer instead.
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    SATD_4x4_MMX m2,  8, 0
-    paddw        m0, m1
-    SATD_4x4_MMX m1, 12, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-cglobal pixel_satd_8x8_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 1
-    paddw        m0, m2
-    paddw        m0, m1
-pixel_satd_8x4_internal_mmx2:
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_%1x%3_internal_mmx2
-    HADDUW m0, m1
-    movd  r6d, m0
-%rep %2/%3-1
-    pxor   m0, m0
-    lea    r0, [r0+4*r1]
-    lea    r2, [r2+4*r3]
-    call pixel_satd_%1x%3_internal_mmx2
-    movd   m2, r4
-    HADDUW m0, m1
-    movd   r4, m0
-    add    r6, r4
-    movd   r4, m2
-%endrep
-    movifnidn eax, r6d
-    RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4
-SATD_MxN_MMX 16,  8, 4
-SATD_MxN_MMX  8, 16, 8
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-cglobal pixel_satd_16x16, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-%rep 3
-    call pixel_satd_16x4_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-%endrep
-    call pixel_satd_16x4_internal_mmx2
-    HADDUW m0, m1
-    movd  eax, m0
-    RET
-
-cglobal pixel_satd_16x8, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_16x4_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_16x4_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_8x16, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x8_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_8x8_internal_mmx2
-    SATD_END_MMX
-%endif ; !HIGH_BIT_DEPTH
-
-cglobal pixel_satd_8x8, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x8_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_8x4, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x4_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_4x16, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 1
-    SATD_4x4_MMX m1, 0, 1
-    paddw  m0, m1
-    SATD_4x4_MMX m1, 0, 1
-    paddw  m0, m1
-    SATD_4x4_MMX m1, 0, 0
-    paddw  m0, m1
-    SATD_END_MMX
-
-cglobal pixel_satd_4x8, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 1
-    SATD_4x4_MMX m1, 0, 0
-    paddw  m0, m1
-    SATD_END_MMX
-
-cglobal pixel_satd_4x4, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 0
-%if HIGH_BIT_DEPTH
-    HADDUW      m0, m1
-    movd       eax, m0
-%else ; !HIGH_BIT_DEPTH
-    pshufw      m1, m0, q1032
-    paddw       m0, m1
-    pshufw      m1, m0, q2301
-    paddw       m0, m1
-    movd       eax, m0
-    and        eax, 0xffff
-%endif ; HIGH_BIT_DEPTH
     EMMS
     RET
+%endmacro
+
+; FIXME avoid the spilling of regs to hold 3*stride.
+; for small blocks on x86_32, modify pixel pointer instead.
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal pixel_satd_4x4, 4,6
+    SATD_START_MMX
+    SATD_4x4_MMX m0, 0, 0
+    SATD_END_MMX
 
 %macro SATD_START_SSE2 2-3 0
     FIX_STRIDES r1, r3
@@ -485,10 +384,14 @@
 
 %macro SATD_END_SSE2 1-2
 %if HIGH_BIT_DEPTH
+  %if BIT_DEPTH == 12
+    HADDD   %1, xm0
+  %else ; BIT_DEPTH == 12
     HADDUW  %1, xm0
-%if %0 == 2
+  %endif ; BIT_DEPTH == 12
+  %if %0 == 2
     paddd   %1, %2
-%endif
+  %endif
 %else
     HADDW   %1, xm7
 %endif
@@ -631,7 +534,7 @@
     mova m7, [hmul_4p]
 %endif
     SATD_4x8_SSE vertical, 0, swap
-    HADDW m7, m1
+    HADDUW m7, m1
     movd eax, m7
     RET
 
@@ -644,7 +547,11 @@
     lea r0, [r0+r1*2*SIZEOF_PIXEL]
     lea r2, [r2+r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
+  %if BIT_DEPTH == 12
+    HADDD m7, m1
+  %else
     HADDUW m7, m1
+  %endif
     movd eax, m7
     RET
 
@@ -690,12 +597,8 @@
     mova m7, [pw_00ff]
 %endif
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_16x8, 4,6,14
@@ -757,12 +660,8 @@
 %%pixel_satd_16x8_internal:
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x8, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -778,12 +677,8 @@
     lea r2, [r7 + 16]
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -803,11 +698,7 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
+    HADDD m10, m0
     movd    eax, m10
     RET
 
@@ -832,12 +723,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -865,12 +752,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_32x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -914,12 +797,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_48x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -981,12 +860,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1018,12 +893,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1072,12 +943,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
 
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x48, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1142,12 +1009,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
 
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 cglobal pixel_satd_64x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
@@ -1228,12 +1091,8 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
 
-    pxor     m9, m9
-    movhlps  m9, m10
-    paddd   m10, m9
-    pshufd   m9, m10, 1
-    paddd   m10, m9
-    movd    eax, m10
+    HADDD m10, m0
+    movd eax, m10
     RET
 
 %else
@@ -1250,11 +1109,7 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
+    HADDD m6, m0
     movd   eax, m6
     RET
 %else
@@ -1271,12 +1126,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 %if WIN64
@@ -1314,12 +1165,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x48, 4,7,8,0-gprsize    ;if !WIN64
@@ -1359,12 +1206,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1401,12 +1244,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_24x64, 4,7,8,0-gprsize    ;if !WIN64
@@ -1443,12 +1282,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1465,12 +1300,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_8x64, 4,7,8,0-gprsize    ;if !WIN64
@@ -1485,12 +1316,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1515,12 +1342,8 @@
     mov [rsp], r2
     call pixel_satd_8x8_internal2
     call %%pixel_satd_8x4_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1565,12 +1388,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %else
 cglobal pixel_satd_12x32, 4,7,8,0-gprsize
@@ -1614,12 +1433,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %endif
 %else ;HIGH_BIT_DEPTH
@@ -1735,12 +1550,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %else
 cglobal pixel_satd_4x32, 4,7,8,0-gprsize
@@ -1827,12 +1638,8 @@
     lea r0, [r6 + 24*SIZEOF_PIXEL]
     lea r2, [r7 + 24*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x8, 4,7,8,0-gprsize    ;if !WIN64
@@ -1852,12 +1659,8 @@
     mov r2, [rsp]
     add r2, 24*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1880,12 +1683,8 @@
     lea r2, [r7 + 24*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x16, 4,7,8,0-gprsize   ;if !WIN64
@@ -1909,12 +1708,8 @@
     add r2, 24*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -1941,12 +1736,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x24, 4,7,8,0-gprsize   ;if !WIN64
@@ -1974,12 +1765,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2010,12 +1797,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x32, 4,7,8,0-gprsize   ;if !WIN64
@@ -2047,12 +1830,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2099,12 +1878,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_32x64, 4,7,8,0-gprsize   ;if !WIN64
@@ -2152,12 +1927,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2224,12 +1995,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_48x64, 4,7,8,0-gprsize   ;if !WIN64
@@ -2299,12 +2066,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2344,12 +2107,8 @@
     lea r2, [r7 + 56*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x16, 4,7,8,0-gprsize   ;if !WIN64
@@ -2393,12 +2152,8 @@
     add r2,56*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2453,12 +2208,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x32, 4,7,8,0-gprsize   ;if !WIN64
@@ -2518,12 +2269,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2594,12 +2341,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m8, m8
-    movhlps m8, m6
-    paddd   m6, m8
-    pshufd  m8, m6, 1
-    paddd   m6, m8
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x48, 4,7,8,0-gprsize   ;if !WIN64
@@ -2675,12 +2418,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2767,12 +2506,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m8, m8
-    movhlps m8, m6
-    paddd   m6, m8
-    pshufd  m8, m6, 1
-    paddd   m6, m8
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_64x64, 4,7,8,0-gprsize   ;if !WIN64
@@ -2864,12 +2599,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -2883,12 +2614,8 @@
     call %%pixel_satd_8x4_internal2
     RESTORE_AND_INC_POINTERS
     call %%pixel_satd_8x4_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2901,12 +2628,8 @@
     call pixel_satd_8x8_internal2
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2921,12 +2644,8 @@
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal2
     call %%pixel_satd_8x4_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2941,12 +2660,8 @@
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2965,12 +2680,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -2997,12 +2708,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif
 
@@ -3029,12 +2736,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %else
 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
@@ -3060,12 +2763,8 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, 4, 5
-    pxor    m1, m1
-    movhlps m1, m7
-    paddd   m7, m1
-    pshufd  m1, m7, 1
-    paddd   m7, m1
-    movd   eax, m7
+    HADDD m7, m0
+    movd eax, m7
     RET
 %endif
 %else    ;HIGH_BIT_DEPTH
@@ -3149,12 +2848,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %else
 cglobal pixel_satd_24x32, 4,7,8,0-gprsize
@@ -3179,12 +2874,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 %endif    ;WIN64
 
@@ -3201,12 +2892,8 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 %if WIN64
@@ -3217,12 +2904,8 @@
     SATD_START_SSE2 m6, m7
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    pxor    m7, m7
-    movhlps m7, m6
-    paddd   m6, m7
-    pshufd  m7, m6, 1
-    paddd   m6, m7
-    movd   eax, m6
+    HADDD m6, m0
+    movd eax, m6
     RET
 
 cglobal pixel_satd_8x8, 4,6,8


