[x265] [PATCH 2 of 2 Update] fix Main12 satd overflow bug up to SSE4, (fixes #180)
Min Chen
chenm003 at 163.com
Thu Sep 17 00:58:24 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1442442321 18000
# Node ID 7ad1738840cb0153b59f952da160f8e9e08dc40a
# Parent b585c3b6834dc3e98b78be5c750047a5e988926a
fix Main12 satd overflow bug up to SSE4, (fixes #180)
---
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/pixel-a.asm | 645 +++++++++-------------------------
2 files changed, 170 insertions(+), 477 deletions(-)
diff -r b585c3b6834d -r 7ad1738840cb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 16 17:25:19 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Sep 16 17:25:21 2015 -0500
@@ -1179,7 +1179,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
+ // p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
diff -r b585c3b6834d -r 7ad1738840cb source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Sep 16 17:25:19 2015 -0500
+++ b/source/common/x86/pixel-a.asm Wed Sep 16 17:25:21 2015 -0500
@@ -242,6 +242,12 @@
%endif
HADAMARD4_2D 4, 5, 6, 7, 3, %%n
paddw m4, m6
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
+; pxor m5, m5
+; punpcklwd m6, m4, m5
+; punpckhwd m4, m5
+; paddd m4, m6
+;%endif
SWAP %%n, 4
%endmacro
@@ -257,15 +263,45 @@
HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%2, m%6
+ punpckhwd m%2, m%6
+ paddd m%8, m%7
+ paddd m%8, m%2
+ %else
paddw m%8, m%2
+ %endif
%else
SWAP %8, %2
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%8, m%6
+ punpckhwd m%8, m%6
+ paddd m%8, m%7
+ %endif
%endif
%if %1
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%4, m%6
+ punpckhwd m%4, m%6
+ paddd m%8, m%7
+ paddd m%8, m%4
+ %else
paddw m%8, m%4
+ %endif
%else
HADAMARD 1, max, %3, %5, %6, %7
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%3, m%6
+ punpckhwd m%3, m%6
+ paddd m%8, m%7
+ paddd m%8, m%3
+ %else
paddw m%8, m%3
+ %endif
%endif
%endmacro
@@ -281,29 +317,23 @@
%endif
pxor m%10, m%10
- mova m%9, m%2
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%2, m%10
paddd m%8, m%9
- mova m%9, m%2
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%2, m%10
paddd m%8, m%9
%if %1
pxor m%10, m%10
- mova m%9, m%4
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%4, m%10
paddd m%8, m%9
- mova m%9, m%4
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%4, m%10
paddd m%8, m%9
%else
HADAMARD 1, max, %3, %5, %6, %7
pxor m%10, m%10
- mova m%9, m%3
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%3, m%10
paddd m%8, m%9
- mova m%9, m%3
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%3, m%10
paddd m%8, m%9
%endif
%endmacro
@@ -326,146 +356,21 @@
movd eax, m0
and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
- RET
-%endmacro
-
-; FIXME avoid the spilling of regs to hold 3*stride.
-; for small blocks on x86_32, modify pixel pointer instead.
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 0
- paddw m0, m2
- SATD_4x4_MMX m2, 8, 0
- paddw m0, m1
- SATD_4x4_MMX m1, 12, 0
- paddw m0, m2
- paddw m0, m1
- ret
-
-cglobal pixel_satd_8x8_internal
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 1
- paddw m0, m2
- paddw m0, m1
-pixel_satd_8x4_internal_mmx2:
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 0
- paddw m0, m2
- paddw m0, m1
- ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_%1x%3_internal_mmx2
- HADDUW m0, m1
- movd r6d, m0
-%rep %2/%3-1
- pxor m0, m0
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_%1x%3_internal_mmx2
- movd m2, r4
- HADDUW m0, m1
- movd r4, m0
- add r6, r4
- movd r4, m2
-%endrep
- movifnidn eax, r6d
- RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4
-SATD_MxN_MMX 16, 8, 4
-SATD_MxN_MMX 8, 16, 8
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-cglobal pixel_satd_16x16, 4,6
- SATD_START_MMX
- pxor m0, m0
-%rep 3
- call pixel_satd_16x4_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
-%endrep
- call pixel_satd_16x4_internal_mmx2
- HADDUW m0, m1
- movd eax, m0
- RET
-
-cglobal pixel_satd_16x8, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_16x4_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_16x4_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_8x16, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x8_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_8x8_internal_mmx2
- SATD_END_MMX
-%endif ; !HIGH_BIT_DEPTH
-
-cglobal pixel_satd_8x8, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x8_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_8x4, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x4_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_4x16, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 1
- SATD_4x4_MMX m1, 0, 1
- paddw m0, m1
- SATD_4x4_MMX m1, 0, 1
- paddw m0, m1
- SATD_4x4_MMX m1, 0, 0
- paddw m0, m1
- SATD_END_MMX
-
-cglobal pixel_satd_4x8, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 1
- SATD_4x4_MMX m1, 0, 0
- paddw m0, m1
- SATD_END_MMX
-
-cglobal pixel_satd_4x4, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 0
-%if HIGH_BIT_DEPTH
- HADDUW m0, m1
- movd eax, m0
-%else ; !HIGH_BIT_DEPTH
- pshufw m1, m0, q1032
- paddw m0, m1
- pshufw m1, m0, q2301
- paddw m0, m1
- movd eax, m0
- and eax, 0xffff
-%endif ; HIGH_BIT_DEPTH
EMMS
RET
+%endmacro
+
+; FIXME avoid the spilling of regs to hold 3*stride.
+; for small blocks on x86_32, modify pixel pointer instead.
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal pixel_satd_4x4, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX m0, 0, 0
+ SATD_END_MMX
%macro SATD_START_SSE2 2-3 0
FIX_STRIDES r1, r3
@@ -485,10 +390,14 @@
%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
+ %if BIT_DEPTH == 12
+ HADDD %1, xm0
+ %else ; BIT_DEPTH == 12
HADDUW %1, xm0
-%if %0 == 2
+ %endif ; BIT_DEPTH == 12
+ %if %0 == 2
paddd %1, %2
-%endif
+ %endif
%else
HADDW %1, xm7
%endif
@@ -631,7 +540,11 @@
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
- HADDW m7, m1
+%if BIT_DEPTH == 12
+ HADDD m7, m1
+%else
+ HADDUW m7, m1
+%endif
movd eax, m7
RET
@@ -644,7 +557,11 @@
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
+%if BIT_DEPTH == 12
+ HADDD m7, m1
+%else
HADDUW m7, m1
+%endif
movd eax, m7
RET
@@ -690,12 +607,8 @@
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_16x8, 4,6,14
@@ -757,12 +670,8 @@
%%pixel_satd_16x8_internal:
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -778,12 +687,8 @@
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -803,11 +708,7 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
+ HADDD m10, m0
movd eax, m10
RET
@@ -832,12 +733,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -865,12 +762,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -914,12 +807,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -981,12 +870,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1018,12 +903,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1072,12 +953,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1142,12 +1019,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1228,12 +1101,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
%else
@@ -1250,11 +1119,7 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
+ HADDD m6, m0
movd eax, m6
RET
%else
@@ -1271,12 +1136,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
%if WIN64
@@ -1314,12 +1175,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64
@@ -1359,12 +1216,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1401,12 +1254,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64
@@ -1443,12 +1292,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1465,12 +1310,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64
@@ -1485,12 +1326,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1515,12 +1352,8 @@
mov [rsp], r2
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1565,12 +1398,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
@@ -1614,12 +1443,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
@@ -1735,12 +1560,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
@@ -1827,12 +1648,8 @@
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
@@ -1852,12 +1669,8 @@
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1880,12 +1693,8 @@
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
@@ -1909,12 +1718,8 @@
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1941,12 +1746,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
@@ -1974,12 +1775,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2010,12 +1807,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
@@ -2047,12 +1840,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2099,12 +1888,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2152,12 +1937,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2224,12 +2005,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2299,12 +2076,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2344,12 +2117,8 @@
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
@@ -2393,12 +2162,8 @@
add r2,56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2453,12 +2218,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
@@ -2518,12 +2279,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2594,12 +2351,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m8, m8
- movhlps m8, m6
- paddd m6, m8
- pshufd m8, m6, 1
- paddd m6, m8
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
@@ -2675,12 +2428,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2767,12 +2516,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m8, m8
- movhlps m8, m6
- paddd m6, m8
- pshufd m8, m6, 1
- paddd m6, m8
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2864,12 +2609,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2883,12 +2624,8 @@
call %%pixel_satd_8x4_internal2
RESTORE_AND_INC_POINTERS
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2901,12 +2638,8 @@
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2921,12 +2654,8 @@
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2941,12 +2670,8 @@
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2965,12 +2690,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2997,12 +2718,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -3029,12 +2746,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
@@ -3060,12 +2773,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
@@ -3149,12 +2858,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
@@ -3179,12 +2884,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif ;WIN64
@@ -3201,12 +2902,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -3217,12 +2914,8 @@
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
cglobal pixel_satd_8x8, 4,6,8
More information about the x265-devel mailing list