[x265] [PATCH 2 of 2] fix Main12 satd overflow bug up to SSE4, (fixes #180)
Min Chen
chenm003 at 163.com
Wed Sep 16 20:14:30 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1442423261 18000
# Node ID 61769ca2358186b36c2498b3eaa8aeded95af6fd
# Parent e35d3260b5138abf09bf79808bf55104606c4626
fix Main12 satd overflow bug up to SSE4, (fixes #180)
---
source/common/x86/asm-primitives.cpp | 106 ++++++-
source/common/x86/pixel-a.asm | 635 +++++++++-------------------------
2 files changed, 264 insertions(+), 477 deletions(-)
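
For context, a rough illustration of why word-sized accumulation (paddw +
HADDUW) stops being safe at BIT_DEPTH == 12, and why the patch widens the
partial sums to dwords (punpcklwd/punpckhwd + paddd, HADDD at the end). This
is an illustrative bound only, not code from the patch; the constant-residual
worst case below is an assumption made just for the example:

    /* sketch: constant 12-bit residual d over an 8x8 block */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* For a constant residual d, each 4x4 Hadamard transform has a single
         * non-zero (DC) coefficient of magnitude 16*d, so satd_4x4 returns
         * (16*d) >> 1 = 8*d, and the 8x8 SATD is 4 * 8*d = 32*d. */
        uint32_t d = 4095;            /* largest 12-bit residual */
        uint32_t satd_8x8 = 32 * d;   /* = 131040, does not fit in 16 bits */
        printf("worst-case 8x8 satd = %u (uint16 max = %u)\n",
               (unsigned)satd_8x8, (unsigned)UINT16_MAX);
        return 0;
    }

With 10-bit input the same bound is 32 * 1023 = 32736, which still fits in an
unsigned 16-bit lane, so the existing paddw/HADDUW paths remain usable there
(see the X265_DEPTH <= 10 guard further down in the diff).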
diff -r e35d3260b513 -r 61769ca23581 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 16 12:07:40 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Sep 16 12:07:41 2015 -0500
@@ -871,6 +871,84 @@
#if HIGH_BIT_DEPTH
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
+ sum2_t t0 = s0 + s1; \
+ sum2_t t1 = s0 - s1; \
+ sum2_t t2 = s2 + s3; \
+ sum2_t t3 = s2 - s3; \
+ d0 = t0 + t2; \
+ d2 = t0 - t2; \
+ d1 = t1 + t3; \
+ d3 = t1 - t3; \
+}
+
+// in: a pseudo-simd number of the form x+(y<<16)
+// return: abs(x)+(abs(y)<<16)
+inline sum2_t abs2(sum2_t a)
+{
+ sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
+
+ return (a + s) ^ s;
+}
+
+static int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+ sum2_t tmp[4][2];
+ sum2_t a0, a1, a2, a3, b0, b1;
+ sum2_t sum = 0;
+
+ for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
+ {
+ a0 = pix1[0] - pix2[0];
+ a1 = pix1[1] - pix2[1];
+ b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
+ a2 = pix1[2] - pix2[2];
+ a3 = pix1[3] - pix2[3];
+ b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+ tmp[i][0] = b0 + b1;
+ tmp[i][1] = b0 - b1;
+ }
+
+ for (int i = 0; i < 2; i++)
+ {
+ HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+ a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+ sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
+ }
+
+ return (int)(sum >> 1);
+}
+
+template<int w, int h>
+// calculate satd in blocks of 4x4
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+ int satd = 0;
+
+ for (int row = 0; row < h; row += 4)
+ for (int col = 0; col < w; col += 4)
+ satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+
+ return satd;
+}
+
+int my_satd(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+ int v0 = satd4<8, 8>(pix1, stride_pix1, pix2, stride_pix2);
+ int v1 = PFX(pixel_satd_8x8_sse2)(pix1, stride_pix1, pix2, stride_pix2);
+
+ if (v0 != v1)
+ {
+ v1 = PFX(pixel_satd_8x8_sse2)(pix1, stride_pix1, pix2, stride_pix2);
+ printf("X\n");
+ }
+
+ return v0;
+}
+
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
{
#if !defined(X86_64)
@@ -915,6 +993,32 @@
HEVC_SAD_X4(sse2);
p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_mmx2);
+
+// p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_sse2);
+// p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_sse2);
+// p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sse2);
+// p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_sse2);
+// p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_sse2);
+// p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_sse2);
+// p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_sse2);
+// p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_sse2);
+// p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_sse2);
+// p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sse2);
+// p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_sse2);
+// p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_sse2);
+// p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_sse2);
+// p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_sse2);
+// p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_sse2);
+// p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_sse2);
+// p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_sse2);
+// p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_sse2);
+// p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_sse2);
+// p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_sse2);
+// p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sse2);
+// p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_sse2);
+// p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_sse2);
+// p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_sse2);
+
ALL_LUMA_PU(satd, pixel_satd, sse2);
#if X265_DEPTH <= 10
@@ -1179,7 +1283,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
+ // p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
diff -r e35d3260b513 -r 61769ca23581 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Sep 16 12:07:40 2015 -0500
+++ b/source/common/x86/pixel-a.asm Wed Sep 16 12:07:41 2015 -0500
@@ -242,6 +242,12 @@
%endif
HADAMARD4_2D 4, 5, 6, 7, 3, %%n
paddw m4, m6
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
+; pxor m5, m5
+; punpcklwd m6, m4, m5
+; punpckhwd m4, m5
+; paddd m4, m6
+;%endif
SWAP %%n, 4
%endmacro
@@ -257,15 +263,39 @@
HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%2, m%6
+ punpckhwd m%2, m%6
+ paddd m%8, m%7
+ paddd m%8, m%2
+ %else
paddw m%8, m%2
+ %endif
%else
SWAP %8, %2
%endif
%if %1
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%4, m%6
+ punpckhwd m%4, m%6
+ paddd m%8, m%7
+ paddd m%8, m%4
+ %else
paddw m%8, m%4
+ %endif
%else
HADAMARD 1, max, %3, %5, %6, %7
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%3, m%6
+ punpckhwd m%3, m%6
+ paddd m%8, m%7
+ paddd m%8, m%3
+ %else
paddw m%8, m%3
+ %endif
%endif
%endmacro
@@ -281,29 +311,23 @@
%endif
pxor m%10, m%10
- mova m%9, m%2
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%2, m%10
paddd m%8, m%9
- mova m%9, m%2
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%2, m%10
paddd m%8, m%9
%if %1
pxor m%10, m%10
- mova m%9, m%4
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%4, m%10
paddd m%8, m%9
- mova m%9, m%4
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%4, m%10
paddd m%8, m%9
%else
HADAMARD 1, max, %3, %5, %6, %7
pxor m%10, m%10
- mova m%9, m%3
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%3, m%10
paddd m%8, m%9
- mova m%9, m%3
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%3, m%10
paddd m%8, m%9
%endif
%endmacro
@@ -326,146 +350,21 @@
movd eax, m0
and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
- RET
-%endmacro
-
-; FIXME avoid the spilling of regs to hold 3*stride.
-; for small blocks on x86_32, modify pixel pointer instead.
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 0
- paddw m0, m2
- SATD_4x4_MMX m2, 8, 0
- paddw m0, m1
- SATD_4x4_MMX m1, 12, 0
- paddw m0, m2
- paddw m0, m1
- ret
-
-cglobal pixel_satd_8x8_internal
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 1
- paddw m0, m2
- paddw m0, m1
-pixel_satd_8x4_internal_mmx2:
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 0
- paddw m0, m2
- paddw m0, m1
- ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_%1x%3_internal_mmx2
- HADDUW m0, m1
- movd r6d, m0
-%rep %2/%3-1
- pxor m0, m0
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_%1x%3_internal_mmx2
- movd m2, r4
- HADDUW m0, m1
- movd r4, m0
- add r6, r4
- movd r4, m2
-%endrep
- movifnidn eax, r6d
- RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4
-SATD_MxN_MMX 16, 8, 4
-SATD_MxN_MMX 8, 16, 8
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-cglobal pixel_satd_16x16, 4,6
- SATD_START_MMX
- pxor m0, m0
-%rep 3
- call pixel_satd_16x4_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
-%endrep
- call pixel_satd_16x4_internal_mmx2
- HADDUW m0, m1
- movd eax, m0
- RET
-
-cglobal pixel_satd_16x8, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_16x4_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_16x4_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_8x16, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x8_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_8x8_internal_mmx2
- SATD_END_MMX
-%endif ; !HIGH_BIT_DEPTH
-
-cglobal pixel_satd_8x8, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x8_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_8x4, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x4_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_4x16, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 1
- SATD_4x4_MMX m1, 0, 1
- paddw m0, m1
- SATD_4x4_MMX m1, 0, 1
- paddw m0, m1
- SATD_4x4_MMX m1, 0, 0
- paddw m0, m1
- SATD_END_MMX
-
-cglobal pixel_satd_4x8, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 1
- SATD_4x4_MMX m1, 0, 0
- paddw m0, m1
- SATD_END_MMX
-
-cglobal pixel_satd_4x4, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 0
-%if HIGH_BIT_DEPTH
- HADDUW m0, m1
- movd eax, m0
-%else ; !HIGH_BIT_DEPTH
- pshufw m1, m0, q1032
- paddw m0, m1
- pshufw m1, m0, q2301
- paddw m0, m1
- movd eax, m0
- and eax, 0xffff
-%endif ; HIGH_BIT_DEPTH
EMMS
RET
+%endmacro
+
+; FIXME avoid the spilling of regs to hold 3*stride.
+; for small blocks on x86_32, modify pixel pointer instead.
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal pixel_satd_4x4, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX m0, 0, 0
+ SATD_END_MMX
%macro SATD_START_SSE2 2-3 0
FIX_STRIDES r1, r3
@@ -485,10 +384,14 @@
%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
+ %if BIT_DEPTH == 12
+ HADDD %1, xm0
+ %else ; BIT_DEPTH == 12
HADDUW %1, xm0
-%if %0 == 2
+ %endif ; BIT_DEPTH == 12
+ %if %0 == 2
paddd %1, %2
-%endif
+ %endif
%else
HADDW %1, xm7
%endif
@@ -631,7 +534,7 @@
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
- HADDW m7, m1
+ HADDUW m7, m1
movd eax, m7
RET
@@ -644,7 +547,11 @@
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
+ %if BIT_DEPTH == 12
+ HADDD m7, m1
+ %else
HADDUW m7, m1
+ %endif
movd eax, m7
RET
@@ -690,12 +597,8 @@
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_16x8, 4,6,14
@@ -757,12 +660,8 @@
%%pixel_satd_16x8_internal:
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -778,12 +677,8 @@
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -803,11 +698,7 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
+ HADDD m10, m0
movd eax, m10
RET
@@ -832,12 +723,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -865,12 +752,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -914,12 +797,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -981,12 +860,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1018,12 +893,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1072,12 +943,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1142,12 +1009,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1228,12 +1091,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
%else
@@ -1250,11 +1109,7 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
+ HADDD m6, m0
movd eax, m6
RET
%else
@@ -1271,12 +1126,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
%if WIN64
@@ -1314,12 +1165,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64
@@ -1359,12 +1206,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1401,12 +1244,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64
@@ -1443,12 +1282,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1465,12 +1300,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64
@@ -1485,12 +1316,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1515,12 +1342,8 @@
mov [rsp], r2
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1565,12 +1388,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
@@ -1614,12 +1433,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
@@ -1735,12 +1550,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
@@ -1827,12 +1638,8 @@
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
@@ -1852,12 +1659,8 @@
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1880,12 +1683,8 @@
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
@@ -1909,12 +1708,8 @@
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1941,12 +1736,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
@@ -1974,12 +1765,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2010,12 +1797,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
@@ -2047,12 +1830,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2099,12 +1878,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2152,12 +1927,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2224,12 +1995,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2299,12 +2066,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2344,12 +2107,8 @@
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
@@ -2393,12 +2152,8 @@
add r2,56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2453,12 +2208,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
@@ -2518,12 +2269,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2594,12 +2341,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m8, m8
- movhlps m8, m6
- paddd m6, m8
- pshufd m8, m6, 1
- paddd m6, m8
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
@@ -2675,12 +2418,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2767,12 +2506,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m8, m8
- movhlps m8, m6
- paddd m6, m8
- pshufd m8, m6, 1
- paddd m6, m8
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2864,12 +2599,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2883,12 +2614,8 @@
call %%pixel_satd_8x4_internal2
RESTORE_AND_INC_POINTERS
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2901,12 +2628,8 @@
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2921,12 +2644,8 @@
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2941,12 +2660,8 @@
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2965,12 +2680,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2997,12 +2708,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -3029,12 +2736,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
@@ -3060,12 +2763,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
@@ -3149,12 +2848,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
@@ -3179,12 +2874,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif ;WIN64
@@ -3201,12 +2892,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -3217,12 +2904,8 @@
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
cglobal pixel_satd_8x8, 4,6,8