[x265] [PATCH] fix Main12 satd overflow bug up to SSE4 (fixes #180)
Min Chen
chenm003 at 163.com
Wed Sep 16 20:18:59 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1442427532 18000
# Node ID 125e4475fc1eb973beac2d00af5acefc441aed71
# Parent ef2ffe88b5f278f2e76f760be61fe29ec50d090f
fix Main12 satd overflow bug up to SSE4 (fixes #180)
---
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/pixel-a.asm | 635 +++++++++-------------------------
2 files changed, 160 insertions(+), 477 deletions(-)
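
Background on the overflow: at BIT_DEPTH == 12 the residuals span +/-4095, and an
unnormalized 4x4 Hadamard coefficient is a +/-1 combination of 16 of them, so a
single coefficient can already reach 16 * 4095 = 65520. The 16-bit paddw
accumulation therefore wraps as soon as the partial SATDs of more than one block
are summed, which is why this patch widens the accumulators to 32 bits (and, in
asm-primitives.cpp, comments out the I422 4x4 AVX satd binding, presumably because
that kernel still accumulates in words). A back-of-envelope check, as a standalone
C++ sketch (illustration only, not part of the patch):

// Illustration only: worst-case bound showing why 16-bit SATD
// accumulation overflows at 12-bit depth.
#include <cstdint>
#include <cstdio>

int main()
{
    const int bitDepth = 12;
    const int maxPixel = (1 << bitDepth) - 1;   // 4095
    // One unnormalized 4x4 Hadamard coefficient is a +/-1 combination
    // of 16 residuals, so it can reach 16 * 4095.
    const long maxCoef = 16L * maxPixel;        // 65520
    printf("max 4x4 coefficient: %ld (uint16 max: %u)\n",
           maxCoef, (unsigned)UINT16_MAX);
    // 65520 nearly saturates a 16-bit lane on its own; adding the
    // partial SATDs of several blocks with paddw wraps around.
    return 0;
}
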
diff -r ef2ffe88b5f2 -r 125e4475fc1e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 16 13:18:49 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Sep 16 13:18:52 2015 -0500
@@ -1179,7 +1179,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
+ // p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
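
The asm changes below all apply one idiom: instead of paddw into a word
accumulator, the partial sums are zero-extended to dwords with
punpcklwd/punpckhwd against a zeroed register and added with paddd. Roughly the
following, expressed with SSE2 intrinsics (function name is mine, for
illustration only):

// Sketch of the widening-accumulate idiom used throughout the patch:
// zero-extend eight unsigned 16-bit lanes to 32 bits, then add both
// halves into a dword accumulator.
#include <emmintrin.h>  // SSE2

static inline __m128i accumulate_words_as_dwords(__m128i acc, __m128i words)
{
    const __m128i zero = _mm_setzero_si128();
    __m128i lo = _mm_unpacklo_epi16(words, zero);  // punpcklwd vs zero
    __m128i hi = _mm_unpackhi_epi16(words, zero);  // punpckhwd vs zero
    acc = _mm_add_epi32(acc, lo);                  // paddd
    return _mm_add_epi32(acc, hi);                 // paddd
}
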
diff -r ef2ffe88b5f2 -r 125e4475fc1e source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Sep 16 13:18:49 2015 -0500
+++ b/source/common/x86/pixel-a.asm Wed Sep 16 13:18:52 2015 -0500
@@ -242,6 +242,12 @@
%endif
HADAMARD4_2D 4, 5, 6, 7, 3, %%n
paddw m4, m6
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
+; pxor m5, m5
+; punpcklwd m6, m4, m5
+; punpckhwd m4, m5
+; paddd m4, m6
+;%endif
SWAP %%n, 4
%endmacro
@@ -257,15 +263,39 @@
HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%2, m%6
+ punpckhwd m%2, m%6
+ paddd m%8, m%7
+ paddd m%8, m%2
+ %else
paddw m%8, m%2
+ %endif
%else
SWAP %8, %2
%endif
%if %1
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%4, m%6
+ punpckhwd m%4, m%6
+ paddd m%8, m%7
+ paddd m%8, m%4
+ %else
paddw m%8, m%4
+ %endif
%else
HADAMARD 1, max, %3, %5, %6, %7
+ %if (BIT_DEPTH == 12)
+ pxor m%6, m%6
+ punpcklwd m%7, m%3, m%6
+ punpckhwd m%3, m%6
+ paddd m%8, m%7
+ paddd m%8, m%3
+ %else
paddw m%8, m%3
+ %endif
%endif
%endmacro
@@ -281,29 +311,23 @@
%endif
pxor m%10, m%10
- mova m%9, m%2
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%2, m%10
paddd m%8, m%9
- mova m%9, m%2
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%2, m%10
paddd m%8, m%9
%if %1
pxor m%10, m%10
- mova m%9, m%4
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%4, m%10
paddd m%8, m%9
- mova m%9, m%4
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%4, m%10
paddd m%8, m%9
%else
HADAMARD 1, max, %3, %5, %6, %7
pxor m%10, m%10
- mova m%9, m%3
- punpcklwd m%9, m%10
+ punpcklwd m%9, m%3, m%10
paddd m%8, m%9
- mova m%9, m%3
- punpckhwd m%9, m%10
+ punpckhwd m%9, m%3, m%10
paddd m%8, m%9
%endif
%endmacro
@@ -326,146 +350,21 @@
movd eax, m0
and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
- RET
-%endmacro
-
-; FIXME avoid the spilling of regs to hold 3*stride.
-; for small blocks on x86_32, modify pixel pointer instead.
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 0
- paddw m0, m2
- SATD_4x4_MMX m2, 8, 0
- paddw m0, m1
- SATD_4x4_MMX m1, 12, 0
- paddw m0, m2
- paddw m0, m1
- ret
-
-cglobal pixel_satd_8x8_internal
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 1
- paddw m0, m2
- paddw m0, m1
-pixel_satd_8x4_internal_mmx2:
- SATD_4x4_MMX m2, 0, 0
- SATD_4x4_MMX m1, 4, 0
- paddw m0, m2
- paddw m0, m1
- ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_%1x%3_internal_mmx2
- HADDUW m0, m1
- movd r6d, m0
-%rep %2/%3-1
- pxor m0, m0
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_%1x%3_internal_mmx2
- movd m2, r4
- HADDUW m0, m1
- movd r4, m0
- add r6, r4
- movd r4, m2
-%endrep
- movifnidn eax, r6d
- RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4
-SATD_MxN_MMX 16, 8, 4
-SATD_MxN_MMX 8, 16, 8
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-cglobal pixel_satd_16x16, 4,6
- SATD_START_MMX
- pxor m0, m0
-%rep 3
- call pixel_satd_16x4_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
-%endrep
- call pixel_satd_16x4_internal_mmx2
- HADDUW m0, m1
- movd eax, m0
- RET
-
-cglobal pixel_satd_16x8, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_16x4_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_16x4_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_8x16, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x8_internal_mmx2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call pixel_satd_8x8_internal_mmx2
- SATD_END_MMX
-%endif ; !HIGH_BIT_DEPTH
-
-cglobal pixel_satd_8x8, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x8_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_8x4, 4,6
- SATD_START_MMX
- pxor m0, m0
- call pixel_satd_8x4_internal_mmx2
- SATD_END_MMX
-
-cglobal pixel_satd_4x16, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 1
- SATD_4x4_MMX m1, 0, 1
- paddw m0, m1
- SATD_4x4_MMX m1, 0, 1
- paddw m0, m1
- SATD_4x4_MMX m1, 0, 0
- paddw m0, m1
- SATD_END_MMX
-
-cglobal pixel_satd_4x8, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 1
- SATD_4x4_MMX m1, 0, 0
- paddw m0, m1
- SATD_END_MMX
-
-cglobal pixel_satd_4x4, 4,6
- SATD_START_MMX
- SATD_4x4_MMX m0, 0, 0
-%if HIGH_BIT_DEPTH
- HADDUW m0, m1
- movd eax, m0
-%else ; !HIGH_BIT_DEPTH
- pshufw m1, m0, q1032
- paddw m0, m1
- pshufw m1, m0, q2301
- paddw m0, m1
- movd eax, m0
- and eax, 0xffff
-%endif ; HIGH_BIT_DEPTH
EMMS
RET
+%endmacro
+
+; FIXME avoid the spilling of regs to hold 3*stride.
+; for small blocks on x86_32, modify pixel pointer instead.
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal pixel_satd_4x4, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX m0, 0, 0
+ SATD_END_MMX
%macro SATD_START_SSE2 2-3 0
FIX_STRIDES r1, r3
@@ -485,10 +384,14 @@
%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
+ %if BIT_DEPTH == 12
+ HADDD %1, xm0
+ %else ; BIT_DEPTH == 12
HADDUW %1, xm0
-%if %0 == 2
+ %endif ; BIT_DEPTH == 12
+ %if %0 == 2
paddd %1, %2
-%endif
+ %endif
%else
HADDW %1, xm7
%endif
@@ -631,7 +534,7 @@
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
- HADDW m7, m1
+ HADDUW m7, m1
movd eax, m7
RET
@@ -644,7 +547,11 @@
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
+ %if BIT_DEPTH == 12
+ HADDD m7, m1
+ %else
HADDUW m7, m1
+ %endif
movd eax, m7
RET
@@ -690,12 +597,8 @@
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_16x8, 4,6,14
@@ -757,12 +660,8 @@
%%pixel_satd_16x8_internal:
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -778,12 +677,8 @@
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -803,11 +698,7 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
+ HADDD m10, m0
movd eax, m10
RET
@@ -832,12 +723,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -865,12 +752,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -914,12 +797,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -981,12 +860,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1018,12 +893,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1072,12 +943,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1142,12 +1009,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
@@ -1228,12 +1091,8 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- pxor m9, m9
- movhlps m9, m10
- paddd m10, m9
- pshufd m9, m10, 1
- paddd m10, m9
- movd eax, m10
+ HADDD m10, m0
+ movd eax, m10
RET
%else
@@ -1250,11 +1109,7 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
+ HADDD m6, m0
movd eax, m6
RET
%else
@@ -1271,12 +1126,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
%if WIN64
@@ -1314,12 +1165,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64
@@ -1359,12 +1206,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1401,12 +1244,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64
@@ -1443,12 +1282,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1465,12 +1300,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64
@@ -1485,12 +1316,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1515,12 +1342,8 @@
mov [rsp], r2
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1565,12 +1388,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
@@ -1614,12 +1433,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
@@ -1735,12 +1550,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
@@ -1827,12 +1638,8 @@
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
@@ -1852,12 +1659,8 @@
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1880,12 +1683,8 @@
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
@@ -1909,12 +1708,8 @@
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -1941,12 +1736,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
@@ -1974,12 +1765,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2010,12 +1797,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
@@ -2047,12 +1830,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2099,12 +1878,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2152,12 +1927,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2224,12 +1995,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2299,12 +2066,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2344,12 +2107,8 @@
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
@@ -2393,12 +2152,8 @@
add r2,56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2453,12 +2208,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
@@ -2518,12 +2269,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2594,12 +2341,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m8, m8
- movhlps m8, m6
- paddd m6, m8
- pshufd m8, m6, 1
- paddd m6, m8
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
@@ -2675,12 +2418,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2767,12 +2506,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m8, m8
- movhlps m8, m6
- paddd m6, m8
- pshufd m8, m6, 1
- paddd m6, m8
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
@@ -2864,12 +2599,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -2883,12 +2614,8 @@
call %%pixel_satd_8x4_internal2
RESTORE_AND_INC_POINTERS
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2901,12 +2628,8 @@
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2921,12 +2644,8 @@
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2941,12 +2660,8 @@
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2965,12 +2680,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -2997,12 +2708,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif
@@ -3029,12 +2736,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
@@ -3060,12 +2763,8 @@
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
- pxor m1, m1
- movhlps m1, m7
- paddd m7, m1
- pshufd m1, m7, 1
- paddd m7, m1
- movd eax, m7
+ HADDD m7, m0
+ movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
@@ -3149,12 +2848,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
@@ -3179,12 +2874,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%endif ;WIN64
@@ -3201,12 +2892,8 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
%if WIN64
@@ -3217,12 +2904,8 @@
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- pxor m7, m7
- movhlps m7, m6
- paddd m6, m7
- pshufd m7, m6, 1
- paddd m6, m7
- movd eax, m6
+ HADDD m6, m0
+ movd eax, m6
RET
cglobal pixel_satd_8x8, 4,6,8
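
One note on the final reduction: the old code summed the four dword lanes by
hand (movhlps/paddd/pshufd/paddd/movd); the patch collapses every such tail into
the existing HADDD macro, and switches HADDUW (which assumes word-sized partial
sums) to HADDD wherever the accumulator now holds dwords. In intrinsics terms
the replaced sequence computes something like the following (illustrative
sketch, my naming):

// What the movhlps/pshufd/paddd tail (now the HADDD macro) computes:
// a horizontal sum of four 32-bit lanes.
#include <emmintrin.h>  // SSE2
#include <cstdint>

static inline uint32_t hsum_epi32(__m128i v)
{
    __m128i hi = _mm_unpackhi_epi64(v, v);  // high lanes to low (movhlps-style)
    v = _mm_add_epi32(v, hi);               // paddd: pairwise sums
    __m128i sh = _mm_shuffle_epi32(v, 1);   // pshufd xmm, xmm, 1
    v = _mm_add_epi32(v, sh);               // paddd: final sum in lane 0
    return (uint32_t)_mm_cvtsi128_si32(v);  // movd eax, xmm
}
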