[x265] [PATCH] asm: fix overflow due to pixel_satd asm function for 64-bit build
Yuvaraj Venkatesh
yuvaraj at multicorewareinc.com
Mon Jan 27 10:52:03 CET 2014
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1390811862 -19800
# Mon Jan 27 14:07:42 2014 +0530
# Node ID d1b8baced215bdbb8e018d66db701786bdddeef2
# Parent b59b1e579f78b4c29c0c1491e6198a63ba1d597f
asm: fix overflow due to pixel_satd asm function for 64-bit build
diff -r b59b1e579f78 -r d1b8baced215 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 27 00:10:56 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 27 14:07:42 2014 +0530
@@ -64,14 +64,30 @@
#define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
#define HEVC_SATD(cpu) \
- p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
- p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
- p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
- p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
- p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
- p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
- p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
- p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
+ p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; /* points p.satd[] at x265_pixel_satd_WxH_<cpu> for each size listed; LUMA_4x4 is not set here */ \
+ p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
+ p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
+ p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \
+ p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \
+ p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \
+ p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
+ p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
+ p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \
+ p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
+ p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \
+ p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \
+ p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \
+ p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
+ p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \
+ p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \
+ p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \
+ p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
+ p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
+ p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
+ p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \
+ p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
+ p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
+ p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu /* no trailing ';' (as before this patch): call sites write HEVC_SATD(cpu); themselves */
#define SAD_X3(cpu) \
p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
@@ -775,17 +791,8 @@
INIT8(sad, _mmx2);
INIT8(sad_x3, _mmx2);
INIT8(sad_x4, _mmx2);
- INIT8(satd, _mmx2);
+ p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
- p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
- p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
- p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
- p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
- p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
PIXEL_AVG(sse2);
@@ -916,6 +923,7 @@
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
+ HEVC_SATD(ssse3);
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
p.luma_p2s = x265_luma_p2s_ssse3;
@@ -928,11 +936,6 @@
}
if (cpuMask & X265_CPU_SSE4)
{
- p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse4;
- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse4;
- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse4;
- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse4;
- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse4;
SA8D_INTER_FROM_BLOCK(sse4);
p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
@@ -1022,17 +1025,13 @@
if (cpuMask & X265_CPU_AVX)
{
p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
- p.satd[LUMA_4x16] = x265_pixel_satd_4x16_avx;
- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_avx;
- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_avx;
- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_avx;
- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_avx;
+
SA8D_INTER_FROM_BLOCK(avx);
ASSGN_SSE(avx);
HEVC_SATD(avx);
ASSGN_SSE_SS(avx);
SAD_X3(avx);
- SAD_X3(avx);
+ SAD_X4(avx);
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
diff -r b59b1e579f78 -r d1b8baced215 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Jan 27 00:10:56 2014 -0600
+++ b/source/common/x86/pixel-a.asm Mon Jan 27 14:07:42 2014 +0530
@@ -674,197 +674,6 @@
%if vertical
mova m7, [pw_00ff]
%endif
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_16x8, 4,6,12
- SATD_START_SSE2 m10, m7
-%if vertical
- mova m7, [pw_00ff]
-%endif
- jmp %%pixel_satd_16x8_internal
-
-cglobal pixel_satd_16x12, 4,6,12
- SATD_START_SSE2 m10, m7
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- jmp %%pixel_satd_16x8_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_16x32, 4,6,12
- SATD_START_SSE2 m10, m7
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- jmp %%pixel_satd_16x8_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_16x64, 4,6,12
- SATD_START_SSE2 m10, m7
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- jmp %%pixel_satd_16x8_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_16x16, 4,6,12
- SATD_START_SSE2 m10, m7
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
-%%pixel_satd_16x8_internal:
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x8, 4,8,8 ;if WIN64 && notcpuflag(avx)
- SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x16, 4,8,8 ;if WIN64 && notcpuflag(avx)
- SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x24, 4,8,8 ;if WIN64 && notcpuflag(avx)
- SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x32, 4,8,8 ;if WIN64 && notcpuflag(avx)
- SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x64, 4,8,8 ;if WIN64 && notcpuflag(avx)
- SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
pxor m9, m9
movhlps m9, m10
@@ -874,63 +683,63 @@
movd eax, m10
RET
-cglobal pixel_satd_48x64, 4,8,8 ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_16x8, 4,6,12
SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 32]
- lea r2, [r7 + 32]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
+ jmp %%pixel_satd_16x8_internal
+
+cglobal pixel_satd_16x12, 4,6,12 ; 16x12 SATD: one helper call here plus the two in the shared tail
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is what the shared tail returns via movd eax, m10
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; first 16x4 strip (per the helper's name; helper is not part of this hunk)
+ jmp %%pixel_satd_16x8_internal ; continue inside pixel_satd_16x16: two more helper calls, then reduce and RET
+
+cglobal pixel_satd_16x32, 4,6,12 ; 16x32 SATD: six helper calls here plus the two in the shared tail
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is what the shared tail returns via movd eax, m10
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; 6 calls to the 16x4 helper (not shown here; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ jmp %%pixel_satd_16x8_internal ; continue inside pixel_satd_16x16: two more helper calls, then reduce and RET
+
+cglobal pixel_satd_16x64, 4,6,12 ; 16x64 SATD: fourteen helper calls here plus the two in the shared tail
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is what the shared tail returns via movd eax, m10
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; 14 calls to the 16x4 helper (not shown here; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ jmp %%pixel_satd_16x8_internal ; continue inside pixel_satd_16x16: two more helper calls, then reduce and RET
+
+cglobal pixel_satd_16x16, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+%%pixel_satd_16x8_internal:
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
pxor m9, m9
@@ -941,83 +750,19 @@
movd eax, m10
RET
-cglobal pixel_satd_64x16, 4,8,8 ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_32x8, 4,8,11 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- lea r0, [r6 + 32]
- lea r2, [r7 + 32]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- lea r0, [r6 + 48]
- lea r2, [r7 + 48]
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- call pixel_satd_16x4_internal
- SATD_END_SSE2 m10
-
-cglobal pixel_satd_64x32, 4,8,8 ;if WIN64 && notcpuflag(avx)
- SATD_START_SSE2 m10, m7
- mov r6, r0
- mov r7, r2
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 32]
- lea r2, [r7 + 32]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 48]
- lea r2, [r7 + 48]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
-
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
pxor m9, m9
movhlps m9, m10
paddd m10, m9
@@ -1026,7 +771,7 @@
movd eax, m10
RET
-cglobal pixel_satd_64x48, 4,8,8 ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_32x16, 4,8,11 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
@@ -1037,57 +782,12 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 32]
- lea r2, [r7 + 32]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 48]
- lea r2, [r7 + 48]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
-
pxor m9, m9
movhlps m9, m10
paddd m10, m9
@@ -1096,7 +796,7 @@
movd eax, m10
RET
-cglobal pixel_satd_64x64, 4,8,8 ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_32x24, 4,8,11 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
@@ -1109,16 +809,6 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
@@ -1127,53 +817,6 @@
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 32]
- lea r2, [r7 + 32]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- lea r0, [r6 + 48]
- lea r2, [r7 + 48]
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
- call pixel_satd_16x4_internal2
-
pxor m9, m9
movhlps m9, m10
paddd m10, m9
@@ -1182,6 +825,402 @@
movd eax, m10
RET
+cglobal pixel_satd_32x32, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 32x32 SATD as two 16-wide columns of 8 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so the next column can be addressed as r6 + 16
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 8 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
+cglobal pixel_satd_32x64, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 32x64 SATD as two 16-wide columns of 16 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so the next column can be addressed as r6 + 16
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 16 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
+cglobal pixel_satd_48x64, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 48x64 SATD as three 16-wide columns of 16 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so later columns can be addressed as r6 + 16 / + 32
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 16 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 32] ; column 2: saved pointers offset by 32
+ lea r2, [r7 + 32]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
+cglobal pixel_satd_64x16, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 64x16 SATD as four 16-wide columns of 4 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so later columns can be addressed as r6 + 16 / 32 / 48
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 4 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 32] ; column 2: saved pointers offset by 32
+ lea r2, [r7 + 32]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 48] ; column 3: saved pointers offset by 48
+ lea r2, [r7 + 48]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
+cglobal pixel_satd_64x32, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 64x32 SATD as four 16-wide columns of 8 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so later columns can be addressed as r6 + 16 / 32 / 48
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 8 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 32] ; column 2: saved pointers offset by 32
+ lea r2, [r7 + 32]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 48] ; column 3: saved pointers offset by 48
+ lea r2, [r7 + 48]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
+cglobal pixel_satd_64x48, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 64x48 SATD as four 16-wide columns of 12 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so later columns can be addressed as r6 + 16 / 32 / 48
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 12 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 32] ; column 2: saved pointers offset by 32
+ lea r2, [r7 + 32]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 48] ; column 3: saved pointers offset by 48
+ lea r2, [r7 + 48]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
+cglobal pixel_satd_64x64, 4,8,11 ;if WIN64 && notcpuflag(avx) -- 64x64 SATD as four 16-wide columns of 16 helper calls each
+ SATD_START_SSE2 m10, m7 ; macro defined elsewhere; m10 is the register reduced and returned below
+ mov r6, r0 ; remember the incoming r0 so later columns can be addressed as r6 + 16 / 32 / 48
+ mov r7, r2 ; same for r2
+%if vertical
+ mova m7, [pw_00ff] ; pw_00ff constant is loaded only for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 16 calls to the 16x4 helper (not shown; presumably it steps r0/r2 itself - TODO confirm)
+ call pixel_satd_16x4_internal2 ; NOTE(review): 11 xmm regs declared here vs 12 on the 16xN entries - confirm the helper never uses m11
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved pointers, offset by 16
+ lea r2, [r7 + 16]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 32] ; column 2: saved pointers offset by 32
+ lea r2, [r7 + 32]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 48] ; column 3: saved pointers offset by 48
+ lea r2, [r7 + 48]
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+
+ pxor m9, m9 ; clear scratch register before the dword-wide horizontal add of m10
+ movhlps m9, m10 ; m9 low half = m10 high half (dwords 2 and 3)
+ paddd m10, m9 ; dword0 += dword2, dword1 += dword3
+ pshufd m9, m10, 1 ; bring dword1 down to lane 0
+ paddd m10, m9 ; lane 0 now holds the sum of all four dwords
+ movd eax, m10 ; return that 32-bit total
+ RET
+
%else
%if WIN64
@@ -1189,17 +1228,23 @@
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -1226,21 +1271,27 @@
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
%else
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -1271,28 +1322,34 @@
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6, m7
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
%else
cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -1333,32 +1390,38 @@
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6, m7
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
%else
cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
@@ -1658,44 +1721,50 @@
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 32*SIZEOF_PIXEL]
lea r2, [r7 + 32*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 40*SIZEOF_PIXEL]
lea r2, [r7 + 40*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 48*SIZEOF_PIXEL]
lea r2, [r7 + 48*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
SATD_ACCUM m6, m0, m7
lea r0, [r6 + 56*SIZEOF_PIXEL]
lea r2, [r7 + 56*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6, m7
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
%else
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -2362,25 +2431,29 @@
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_ACCUM m6, m0, m7
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_ACCUM m6, m0, m7
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6, m7
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
SATD_START_SSE2 m6, m7
@@ -2417,22 +2490,40 @@
%if vertical
mova m7, [pw_00ff]
%endif
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7
+ movhlps m7, m6
+ paddd m6, m7
+ pshufd m7, m6, 1
+ paddd m6, m7
+ movd eax, m6
+ RET
cglobal pixel_satd_8x16, 4,6,8
SATD_START_SSE2 m6, m7
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ call pixel_satd_8x8_internal2 ; first of two helper calls; helper is defined outside this excerpt (presumably one 8x8 tile per call -- TODO confirm)
+ call pixel_satd_8x8_internal2 ; second call; the reduction below treats m6 as four dword partial sums
+ pxor m7, m7 ; clear m7 so its upper half is zero after movhlps
+ movhlps m7, m6 ; m7[63:0] = m6[127:64]
+ paddd m6, m7 ; m6 dwords 0,1 += m6 dwords 2,3
+ pshufd m7, m6, 1 ; m7 dword 0 = m6 dword 1
+ paddd m6, m7 ; m6 dword 0 = sum of all four dword lanes
+ movd eax, m6 ; return the 32-bit total in eax
+ RET ; replaces the old SATD_END_SSE2 m6 tail (RET is presumably the x86inc epilogue macro)
cglobal pixel_satd_8x8, 4,6,8
SATD_START_SSE2 m6, m7
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ call pixel_satd_8x8_internal2 ; single helper call; helper is defined outside this excerpt, and the reduction below treats m6 as four dword partial sums
+ pxor m7, m7 ; clear m7 so its upper half is zero after movhlps
+ movhlps m7, m6 ; m7[63:0] = m6[127:64]
+ paddd m6, m7 ; m6 dwords 0,1 += m6 dwords 2,3
+ pshufd m7, m6, 1 ; m7 dword 0 = m6 dword 1
+ paddd m6, m7 ; m6 dword 0 = sum of all four dword lanes
+ movd eax, m6 ; return the 32-bit total in eax
+ RET ; replaces the old SATD_END_SSE2 m6 tail (RET is presumably the x86inc epilogue macro)
cglobal pixel_satd_8x4, 4,6,8
SATD_START_SSE2 m6, m7
@@ -2440,7 +2531,6 @@
SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2
-
;=============================================================================
; SA8D
;=============================================================================
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140127/712fee88/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 5910.patch
Type: application/octet-stream
Size: 44084 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140127/712fee88/attachment-0001.obj>
More information about the x265-devel
mailing list