<div dir="ltr"><div># HG changeset patch<br></div><div># User Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;</div><div># Date 1390811862 -19800</div><div># Mon Jan 27 14:07:42 2014 +0530</div>
<div># Node ID d1b8baced215bdbb8e018d66db701786bdddeef2</div><div># Parent b59b1e579f78b4c29c0c1491e6198a63ba1d597f</div><div>asm: fix overflow due to pixel_satd asm function for 64-bit build</div><div><br></div><div>diff -r b59b1e579f78 -r d1b8baced215 source/common/x86/asm-primitives.cpp</div>
<div>--- a/source/common/x86/asm-primitives.cpp<span class="" style="white-space:pre"> </span>Mon Jan 27 00:10:56 2014 -0600</div><div>+++ b/source/common/x86/asm-primitives.cpp<span class="" style="white-space:pre"> </span>Mon Jan 27 14:07:42 2014 +0530</div>
<div>@@ -64,14 +64,30 @@</div><div> #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)</div><div> </div><div> #define HEVC_SATD(cpu) \</div><div>- p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \</div><div>- p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \</div>
<div>- p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \</div><div>- p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \</div><div>- p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \</div><div>- p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \</div>
<div>- p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \</div><div>- p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu</div><div>+ p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \</div><div>+ p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \</div>
<div>+ p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \</div><div>+ p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \</div><div>+ p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \</div><div>+ p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \</div>
<div>+ p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \</div><div>+ p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \</div><div>+ p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \</div><div>+ p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \</div>
<div>+ p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \</div><div>+ p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \</div><div>+ p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \</div><div>+ p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \</div>
<div>+ p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \</div><div>+ p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \</div><div>+ p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \</div><div>+ p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \</div>
<div>+ p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \</div><div>+ p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \</div><div>+ p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \</div><div>+ p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \</div>
<div>+ p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \</div><div>+ p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu;</div><div> </div><div> #define SAD_X3(cpu) \</div><div> p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \</div>
<div>@@ -775,17 +791,8 @@</div><div> INIT8(sad, _mmx2);</div><div> INIT8(sad_x3, _mmx2);</div><div> INIT8(sad_x4, _mmx2);</div><div>- INIT8(satd, _mmx2);</div><div>+ p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;</div>
<div> p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;</div><div>- p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;</div><div>- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;</div><div>- p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;</div>
<div>- p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;</div><div>- p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;</div><div>- p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;</div><div>- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;</div>
<div>- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;</div><div>- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;</div><div> p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;</div><div>
</div><div> PIXEL_AVG(sse2);</div><div>@@ -916,6 +923,7 @@</div><div> </div><div> p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;</div><div> p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;</div>
<div>+ HEVC_SATD(ssse3);</div><div> </div><div> p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;</div><div> p.luma_p2s = x265_luma_p2s_ssse3;</div><div>@@ -928,11 +936,6 @@</div><div> }</div>
<div> if (cpuMask & X265_CPU_SSE4)</div><div> {</div><div>- p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse4;</div><div>- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse4;</div><div>- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse4;</div>
<div>- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse4;</div><div>- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse4;</div><div> SA8D_INTER_FROM_BLOCK(sse4);</div><div> </div><div> p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;</div>
<div>@@ -1022,17 +1025,13 @@</div><div> if (cpuMask & X265_CPU_AVX)</div><div> {</div><div> p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;</div><div>- p.satd[LUMA_4x16] = x265_pixel_satd_4x16_avx;</div>
<div>- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_avx;</div><div>- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_avx;</div><div>- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_avx;</div><div>- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_avx;</div>
<div>+</div><div> SA8D_INTER_FROM_BLOCK(avx);</div><div> ASSGN_SSE(avx);</div><div> HEVC_SATD(avx);</div><div> ASSGN_SSE_SS(avx);</div><div> SAD_X3(avx);</div><div>- SAD_X3(avx);</div>
<div>+ SAD_X4(avx);</div><div> p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;</div><div> p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;</div><div> p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;</div>
<div>diff -r b59b1e579f78 -r d1b8baced215 source/common/x86/pixel-a.asm</div><div>--- a/source/common/x86/pixel-a.asm<span class="" style="white-space:pre"> </span>Mon Jan 27 00:10:56 2014 -0600</div><div>+++ b/source/common/x86/pixel-a.asm<span class="" style="white-space:pre"> </span>Mon Jan 27 14:07:42 2014 +0530</div>
<div>@@ -674,197 +674,6 @@</div><div> %if vertical</div><div> mova m7, [pw_00ff]</div><div> %endif</div><div>- call pixel_satd_16x4_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div><div>-cglobal pixel_satd_16x8, 4,6,12</div>
<div>- SATD_START_SSE2 m10, m7</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- jmp %%pixel_satd_16x8_internal</div><div>-</div><div>-cglobal pixel_satd_16x12, 4,6,12</div><div>
- SATD_START_SSE2 m10, m7</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>- jmp %%pixel_satd_16x8_internal</div><div>- SATD_END_SSE2 m10 </div>
<div>-</div><div>-cglobal pixel_satd_16x32, 4,6,12</div><div>- SATD_START_SSE2 m10, m7</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>
- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>
- jmp %%pixel_satd_16x8_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div><div>-cglobal pixel_satd_16x64, 4,6,12</div><div>- SATD_START_SSE2 m10, m7</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div>
<div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- jmp %%pixel_satd_16x8_internal</div>
<div>- SATD_END_SSE2 m10 </div><div>-</div><div>-cglobal pixel_satd_16x16, 4,6,12</div><div>- SATD_START_SSE2 m10, m7</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>-%%pixel_satd_16x8_internal:</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div><div>
-cglobal pixel_satd_32x8, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>- SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div><div>- mov r7, r2</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div>
<div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- lea r0, [r6 + 16]</div><div>- lea r2, [r7 + 16]</div><div>- call pixel_satd_16x4_internal</div><div>
- call pixel_satd_16x4_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div><div>-cglobal pixel_satd_32x16, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>- SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div>
<div>- mov r7, r2</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- lea r0, [r6 + 16]</div><div>- lea r2, [r7 + 16]</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div><div>-cglobal pixel_satd_32x24, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>- SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div>
<div>- mov r7, r2</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- lea r0, [r6 + 16]</div><div>- lea r2, [r7 + 16]</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- SATD_END_SSE2 m10</div><div>-</div><div>-cglobal pixel_satd_32x32, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>- SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div><div>- mov r7, r2</div><div>
-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- lea r0, [r6 + 16]</div><div>- lea r2, [r7 + 16]</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div><div>-cglobal pixel_satd_32x64, 4,8,8 ;if WIN64 && notcpuflag(avx)</div>
<div>- SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div><div>- mov r7, r2</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div><div>-%endif</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 16]</div><div>
- lea r2, [r7 + 16]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div> call pixel_satd_16x4_internal2</div><div> pxor m9, m9</div><div> movhlps m9, m10</div><div>@@ -874,63 +683,63 @@</div><div> movd eax, m10</div><div> RET</div><div> </div><div>-cglobal pixel_satd_48x64, 4,8,8 ;if WIN64 && notcpuflag(avx)</div>
<div>+cglobal pixel_satd_16x8, 4,6,12</div><div> SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div><div>- mov r7, r2</div><div> %if vertical</div><div> mova m7, [pw_00ff]</div><div> %endif</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- lea r0, [r6 + 16]</div><div>- lea r2, [r7 + 16]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 32]</div><div>- lea r2, [r7 + 32]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>+ jmp %%pixel_satd_16x8_internal</div><div>+</div><div>+cglobal pixel_satd_16x12, 4,6,12</div><div>+ SATD_START_SSE2 m10, m7</div>
<div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ jmp %%pixel_satd_16x8_internal</div><div>+</div><div>+cglobal pixel_satd_16x32, 4,6,12</div>
<div>+ SATD_START_SSE2 m10, m7</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ jmp %%pixel_satd_16x8_internal</div><div>+</div><div>+cglobal pixel_satd_16x64, 4,6,12</div>
<div>+ SATD_START_SSE2 m10, m7</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ jmp %%pixel_satd_16x8_internal</div><div>+</div><div>+cglobal pixel_satd_16x16, 4,6,12</div><div>+ SATD_START_SSE2 m10, m7</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div>
<div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+%%pixel_satd_16x8_internal:</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div>
<div> pxor m9, m9</div><div>@@ -941,83 +750,19 @@</div><div> movd eax, m10</div><div> RET</div><div> </div><div>-cglobal pixel_satd_64x16, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>+cglobal pixel_satd_32x8, 4,8,11 ;if WIN64 && notcpuflag(avx)</div>
<div> SATD_START_SSE2 m10, m7</div><div> mov r6, r0</div><div> mov r7, r2</div><div> %if vertical</div><div> mova m7, [pw_00ff]</div><div> %endif</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div> lea r0, [r6 + 16]</div><div> lea r2, [r7 + 16]</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- lea r0, [r6 + 32]</div><div>- lea r2, [r7 + 32]</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- lea r0, [r6 + 48]</div><div>- lea r2, [r7 + 48]</div>
<div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- call pixel_satd_16x4_internal</div><div>- SATD_END_SSE2 m10</div><div>-</div>
<div>-cglobal pixel_satd_64x32, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>- SATD_START_SSE2 m10, m7</div><div>- mov r6, r0</div><div>- mov r7, r2</div><div>-%if vertical</div><div>- mova m7, [pw_00ff]</div>
<div>-%endif</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 16]</div><div>- lea r2, [r7 + 16]</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 32]</div><div>- lea r2, [r7 + 32]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 48]</div><div>- lea r2, [r7 + 48]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>-</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div> pxor m9, m9</div><div> movhlps m9, m10</div><div> paddd m10, m9</div><div>@@ -1026,7 +771,7 @@</div>
<div> movd eax, m10</div><div> RET</div><div> </div><div>-cglobal pixel_satd_64x48, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>+cglobal pixel_satd_32x16, 4,8,11 ;if WIN64 && notcpuflag(avx)</div>
<div> SATD_START_SSE2 m10, m7</div><div> mov r6, r0</div><div> mov r7, r2</div><div>@@ -1037,57 +782,12 @@</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div><div>
call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div> lea r0, [r6 + 16]</div><div>
lea r2, [r7 + 16]</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 32]</div><div>- lea r2, [r7 + 32]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- lea r0, [r6 + 48]</div><div>- lea r2, [r7 + 48]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>-</div><div> pxor m9, m9</div><div> movhlps m9, m10</div><div> paddd m10, m9</div>
<div>@@ -1096,7 +796,7 @@</div><div> movd eax, m10</div><div> RET</div><div> </div><div>-cglobal pixel_satd_64x64, 4,8,8 ;if WIN64 && notcpuflag(avx)</div><div>+cglobal pixel_satd_32x24, 4,8,11 ;if WIN64 && notcpuflag(avx)</div>
<div> SATD_START_SSE2 m10, m7</div><div> mov r6, r0</div><div> mov r7, r2</div><div>@@ -1109,16 +809,6 @@</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div> lea r0, [r6 + 16]</div><div> lea r2, [r7 + 16]</div><div> call pixel_satd_16x4_internal2</div><div>@@ -1127,53 +817,6 @@</div><div> call pixel_satd_16x4_internal2</div><div> call pixel_satd_16x4_internal2</div>
<div> call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 32]</div><div>- lea r2, [r7 + 32]</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- lea r0, [r6 + 48]</div><div>- lea r2, [r7 + 48]</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div><div>- call pixel_satd_16x4_internal2</div>
<div>-</div><div> pxor m9, m9</div><div> movhlps m9, m10</div><div> paddd m10, m9</div><div>@@ -1182,6 +825,402 @@</div><div> movd eax, m10</div><div> RET</div><div> </div><div>+cglobal pixel_satd_32x32, 4,8,11 ;if WIN64 && notcpuflag(avx)</div>
<div>+ SATD_START_SSE2 m10, m7</div><div>+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 16]</div><div>+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div><div>+ pshufd m9, m10, 1</div><div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div><div>+</div>
<div>+cglobal pixel_satd_32x64, 4,8,11 ;if WIN64 && notcpuflag(avx)</div><div>+ SATD_START_SSE2 m10, m7</div><div>+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div>
<div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 16]</div><div>+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div>
<div>+ pshufd m9, m10, 1</div><div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div><div>+</div><div>+cglobal pixel_satd_48x64, 4,8,11 ;if WIN64 && notcpuflag(avx)</div><div>+ SATD_START_SSE2 m10, m7</div>
<div>+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>
+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 16]</div><div>
+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 32]</div><div>+ lea r2, [r7 + 32]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div>
<div>+ pshufd m9, m10, 1</div><div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div><div>+</div><div>+cglobal pixel_satd_64x16, 4,8,11 ;if WIN64 && notcpuflag(avx)</div><div>+ SATD_START_SSE2 m10, m7</div>
<div>+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>
+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 16]</div><div>+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 32]</div><div>+ lea r2, [r7 + 32]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 48]</div><div>+ lea r2, [r7 + 48]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div><div>+ pshufd m9, m10, 1</div>
<div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div><div>+</div><div>+cglobal pixel_satd_64x32, 4,8,11 ;if WIN64 && notcpuflag(avx)</div><div>+ SATD_START_SSE2 m10, m7</div><div>
+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ lea r0, [r6 + 16]</div><div>+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 32]</div><div>
+ lea r2, [r7 + 32]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 48]</div><div>+ lea r2, [r7 + 48]</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+</div><div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div><div>+ pshufd m9, m10, 1</div>
<div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div><div>+</div><div>+cglobal pixel_satd_64x48, 4,8,11 ;if WIN64 && notcpuflag(avx)</div><div>+ SATD_START_SSE2 m10, m7</div><div>
+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 16]</div><div>
+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 32]</div><div>+ lea r2, [r7 + 32]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ lea r0, [r6 + 48]</div><div>+ lea r2, [r7 + 48]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+</div><div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div>
<div>+ pshufd m9, m10, 1</div><div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div><div>+</div><div>+cglobal pixel_satd_64x64, 4,8,11 ;if WIN64 && notcpuflag(avx)</div><div>+ SATD_START_SSE2 m10, m7</div>
<div>+ mov r6, r0</div><div>+ mov r7, r2</div><div>+%if vertical</div><div>+ mova m7, [pw_00ff]</div><div>+%endif</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>
+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 16]</div><div>
+ lea r2, [r7 + 16]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 32]</div><div>+ lea r2, [r7 + 32]</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ lea r0, [r6 + 48]</div><div>+ lea r2, [r7 + 48]</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div><div>+ call pixel_satd_16x4_internal2</div>
<div>+</div><div>+ pxor m9, m9</div><div>+ movhlps m9, m10</div><div>+ paddd m10, m9</div><div>+ pshufd m9, m10, 1</div><div>+ paddd m10, m9</div><div>+ movd eax, m10</div><div>+ RET</div>
<div>+</div><div> %else</div><div> </div><div> %if WIN64</div><div>@@ -1189,17 +1228,23 @@</div><div> SATD_START_SSE2 m6, m7</div><div> mov r6, r0</div><div> mov r7, r2</div><div>- call pixel_satd_8x8_internal</div>
<div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 8*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 8*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div>
<div> lea r0, [r6 + 16*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 16*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 24*SIZEOF_PIXEL]</div>
<div> lea r2, [r7 + 24*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6</div><div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div>
<div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div><div>+ RET</div><div> %else</div><div> cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64</div>
<div> SATD_START_SSE2 m6, m7</div><div>@@ -1226,21 +1271,27 @@</div><div> SATD_START_SSE2 m6, m7</div><div> mov r6, r0</div><div> mov r7, r2</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 8*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 8*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div>
<div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 16*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 16*SIZEOF_PIXEL]</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 24*SIZEOF_PIXEL]</div>
<div> lea r2, [r7 + 24*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div><div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div><div>+ RET</div><div> %else</div><div>
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64</div><div> SATD_START_SSE2 m6, m7</div><div>@@ -1271,28 +1322,34 @@</div><div> SATD_START_SSE2 m6, m7</div><div> mov r6, r0</div><div> mov r7, r2</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 8*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 8*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>
- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>
SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 16*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 16*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 24*SIZEOF_PIXEL]</div><div>
lea r2, [r7 + 24*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6, m7</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div><div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div>
<div>+ movd eax, m6</div><div>+ RET</div><div> %else</div><div> cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64</div><div> SATD_START_SSE2 m6, m7</div><div>@@ -1333,32 +1390,38 @@</div><div> SATD_START_SSE2 m6, m7</div>
<div> mov r6, r0</div><div> mov r7, r2</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div>
lea r0, [r6 + 8*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 8*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>
- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>
SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 16*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 16*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 24*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 24*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>
- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6, m7</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div><div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div>
<div>+ RET</div><div> %else</div><div> cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64</div><div> </div><div>@@ -1658,44 +1721,50 @@</div><div> SATD_START_SSE2 m6, m7</div><div> mov r6, r0</div><div> mov r7, r2</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 8*SIZEOF_PIXEL]</div>
<div> lea r2, [r7 + 8*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 16*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 16*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>
+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 24*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 24*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div>
<div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 32*SIZEOF_PIXEL]</div><div>
lea r2, [r7 + 32*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>
SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 40*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 40*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 48*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 48*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>
- call pixel_satd_8x8_internal</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div> SATD_ACCUM m6, m0, m7</div><div> lea r0, [r6 + 56*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 56*SIZEOF_PIXEL]</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6, m7</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div>
<div>+ movhlps m7, m6</div><div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div><div>+ RET</div><div> %else</div><div> cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64</div>
<div> SATD_START_SSE2 m6, m7</div><div>@@ -2362,25 +2431,29 @@</div><div> SATD_START_SSE2 m6, m7</div><div> mov r6, r0</div><div> mov r7, r2</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_ACCUM m6, m0, m7</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 8*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 8*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_ACCUM m6, m0, m7</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div> lea r0, [r6 + 16*SIZEOF_PIXEL]</div><div> lea r2, [r7 + 16*SIZEOF_PIXEL]</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6, m7</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div><div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div>
<div>+ RET</div><div> %else</div><div> cglobal pixel_satd_24x32, 4,7,8,0-gprsize</div><div> SATD_START_SSE2 m6, m7</div><div>@@ -2417,22 +2490,40 @@</div><div> %if vertical</div><div> mova m7, [pw_00ff]</div><div>
%endif</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6</div>
<div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div>
<div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div><div>+ RET</div><div> </div><div> cglobal pixel_satd_8x16, 4,6,8</div><div> SATD_START_SSE2 m6, m7</div>
<div>- call pixel_satd_8x8_internal</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6</div><div>+ call pixel_satd_8x8_internal2</div><div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div>
<div>+ movhlps m7, m6</div><div>+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div><div>+ RET</div><div> </div><div> cglobal pixel_satd_8x8, 4,6,8</div>
<div> SATD_START_SSE2 m6, m7</div><div>- call pixel_satd_8x8_internal</div><div>- SATD_END_SSE2 m6</div><div>+ call pixel_satd_8x8_internal2</div><div>+ pxor m7, m7</div><div>+ movhlps m7, m6</div><div>
+ paddd m6, m7</div><div>+ pshufd m7, m6, 1</div><div>+ paddd m6, m7</div><div>+ movd eax, m6</div><div>+ RET</div><div> </div><div> cglobal pixel_satd_8x4, 4,6,8</div><div> SATD_START_SSE2 m6, m7</div>
<div>@@ -2440,7 +2531,6 @@</div><div> SATD_END_SSE2 m6</div><div> %endmacro ; SATDS_SSE2</div><div> </div><div>-</div><div> ;=============================================================================</div><div> ; SA8D</div>
<div> ;=============================================================================</div><div><br></div></div>