[x265] [PATCH] asm: fix overflow due to pixel_satd asm function for 64-bit build
Steve Borho
steve at borho.org
Tue Jan 28 16:00:36 CET 2014
It doesn't apply on either branch.
On Jan 28, 2014, at 5:18 AM, chen <chenm003 at 163.com> wrote:
> right
>
> At 2014-01-28 18:06:56,"Yuvaraj Venkatesh" <yuvaraj at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
> # Date 1390811862 -19800
> # Mon Jan 27 14:07:42 2014 +0530
> # Node ID ab4c9712a79bd504ff9ed8f95ce6aabcefdf54f7
> # Parent 3568c1b19947f8641504c69bca6ab859ed11825e
> asm: fix overflow due to pixel_satd asm function for 64-bit build
>
> diff -r 3568c1b19947 -r ab4c9712a79b source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Jan 28 01:49:03 2014 -0600
> +++ b/source/common/x86/asm-primitives.cpp Mon Jan 27 14:07:42 2014 +0530
> @@ -64,14 +64,30 @@
> #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
>
> #define HEVC_SATD(cpu) \
> - p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
> - p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
> - p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
> - p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
> - p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
> - p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
> - p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
> - p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
> + p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
> + p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
> + p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
> + p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \
> + p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \
> + p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \
> + p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
> + p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
> + p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \
> + p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
> + p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \
> + p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \
> + p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \
> + p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
> + p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \
> + p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \
> + p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \
> + p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
> + p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
> + p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
> + p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \
> + p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
> + p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
> + p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu;
>
> #define SAD_X3(cpu) \
> p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
> @@ -775,17 +791,8 @@
> INIT8(sad, _mmx2);
> INIT8(sad_x3, _mmx2);
> INIT8(sad_x4, _mmx2);
> - INIT8(satd, _mmx2);
> + p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
> p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
> - p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
> - p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
> - p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
> - p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
> - p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
> - p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
> - p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
> - p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
> - p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
> p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
>
> PIXEL_AVG(sse2);
> @@ -916,6 +923,7 @@
>
> p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
> p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
> + HEVC_SATD(ssse3);
>
> p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
> p.luma_p2s = x265_luma_p2s_ssse3;
> @@ -928,11 +936,6 @@
> }
> if (cpuMask & X265_CPU_SSE4)
> {
> - p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse4;
> - p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse4;
> - p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse4;
> - p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse4;
> - p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse4;
> SA8D_INTER_FROM_BLOCK(sse4);
>
> p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
> @@ -1022,17 +1025,13 @@
> if (cpuMask & X265_CPU_AVX)
> {
> p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
> - p.satd[LUMA_4x16] = x265_pixel_satd_4x16_avx;
> - p.satd[LUMA_12x16] = x265_pixel_satd_12x16_avx;
> - p.satd[LUMA_32x8] = x265_pixel_satd_32x8_avx;
> - p.satd[LUMA_32x16] = x265_pixel_satd_32x16_avx;
> - p.satd[LUMA_32x24] = x265_pixel_satd_32x24_avx;
> +
> SA8D_INTER_FROM_BLOCK(avx);
> ASSGN_SSE(avx);
> HEVC_SATD(avx);
> ASSGN_SSE_SS(avx);
> SAD_X3(avx);
> - SAD_X3(avx);
> + SAD_X4(avx);
> p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
> p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
> p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
> diff -r 3568c1b19947 -r ab4c9712a79b source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Tue Jan 28 01:49:03 2014 -0600
> +++ b/source/common/x86/pixel-a.asm Mon Jan 27 14:07:42 2014 +0530
> @@ -674,197 +674,6 @@
> %if vertical
> mova m7, [pw_00ff]
> %endif
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_16x8, 4,6,12
> - SATD_START_SSE2 m10, m7
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - jmp %%pixel_satd_16x8_internal
> -
> -cglobal pixel_satd_16x12, 4,6,12
> - SATD_START_SSE2 m10, m7
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - jmp %%pixel_satd_16x8_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_16x32, 4,6,12
> - SATD_START_SSE2 m10, m7
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - jmp %%pixel_satd_16x8_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_16x64, 4,6,12
> - SATD_START_SSE2 m10, m7
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - jmp %%pixel_satd_16x8_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_16x16, 4,6,12
> - SATD_START_SSE2 m10, m7
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> -%%pixel_satd_16x8_internal:
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_32x8, 4,8,8 ;if WIN64 && notcpuflag(avx)
> - SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_32x16, 4,8,8 ;if WIN64 && notcpuflag(avx)
> - SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_32x24, 4,8,8 ;if WIN64 && notcpuflag(avx)
> - SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_32x32, 4,8,8 ;if WIN64 && notcpuflag(avx)
> - SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_32x64, 4,8,8 ;if WIN64 && notcpuflag(avx)
> - SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> pxor m9, m9
> movhlps m9, m10
> @@ -874,63 +683,63 @@
> movd eax, m10
> RET
>
> -cglobal pixel_satd_48x64, 4,8,8 ;if WIN64 && notcpuflag(avx)
> +cglobal pixel_satd_16x8, 4,6,12
> SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> %if vertical
> mova m7, [pw_00ff]
> %endif
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 32]
> - lea r2, [r7 + 32]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> + jmp %%pixel_satd_16x8_internal
> +
> +cglobal pixel_satd_16x12, 4,6,12
> + SATD_START_SSE2 m10, m7
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + jmp %%pixel_satd_16x8_internal
> +
> +cglobal pixel_satd_16x32, 4,6,12
> + SATD_START_SSE2 m10, m7
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + jmp %%pixel_satd_16x8_internal
> +
> +cglobal pixel_satd_16x64, 4,6,12
> + SATD_START_SSE2 m10, m7
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + jmp %%pixel_satd_16x8_internal
> +
> +cglobal pixel_satd_16x16, 4,6,12
> + SATD_START_SSE2 m10, m7
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> +%%pixel_satd_16x8_internal:
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> pxor m9, m9
> @@ -941,83 +750,19 @@
> movd eax, m10
> RET
>
> -cglobal pixel_satd_64x16, 4,8,8 ;if WIN64 && notcpuflag(avx)
> +cglobal pixel_satd_32x8, 4,8,11 ;if WIN64 && notcpuflag(avx)
> SATD_START_SSE2 m10, m7
> mov r6, r0
> mov r7, r2
> %if vertical
> mova m7, [pw_00ff]
> %endif
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> lea r0, [r6 + 16]
> lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - lea r0, [r6 + 32]
> - lea r2, [r7 + 32]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - lea r0, [r6 + 48]
> - lea r2, [r7 + 48]
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - call pixel_satd_16x4_internal
> - SATD_END_SSE2 m10
> -
> -cglobal pixel_satd_64x32, 4,8,8 ;if WIN64 && notcpuflag(avx)
> - SATD_START_SSE2 m10, m7
> - mov r6, r0
> - mov r7, r2
> -%if vertical
> - mova m7, [pw_00ff]
> -%endif
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 16]
> - lea r2, [r7 + 16]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 32]
> - lea r2, [r7 + 32]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 48]
> - lea r2, [r7 + 48]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> -
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> pxor m9, m9
> movhlps m9, m10
> paddd m10, m9
> @@ -1026,7 +771,7 @@
> movd eax, m10
> RET
>
> -cglobal pixel_satd_64x48, 4,8,8 ;if WIN64 && notcpuflag(avx)
> +cglobal pixel_satd_32x16, 4,8,11 ;if WIN64 && notcpuflag(avx)
> SATD_START_SSE2 m10, m7
> mov r6, r0
> mov r7, r2
> @@ -1037,57 +782,12 @@
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> lea r0, [r6 + 16]
> lea r2, [r7 + 16]
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 32]
> - lea r2, [r7 + 32]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 48]
> - lea r2, [r7 + 48]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> -
> pxor m9, m9
> movhlps m9, m10
> paddd m10, m9
> @@ -1096,7 +796,7 @@
> movd eax, m10
> RET
>
> -cglobal pixel_satd_64x64, 4,8,8 ;if WIN64 && notcpuflag(avx)
> +cglobal pixel_satd_32x24, 4,8,11 ;if WIN64 && notcpuflag(avx)
> SATD_START_SSE2 m10, m7
> mov r6, r0
> mov r7, r2
> @@ -1109,16 +809,6 @@
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> lea r0, [r6 + 16]
> lea r2, [r7 + 16]
> call pixel_satd_16x4_internal2
> @@ -1127,53 +817,6 @@
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 32]
> - lea r2, [r7 + 32]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - lea r0, [r6 + 48]
> - lea r2, [r7 + 48]
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> - call pixel_satd_16x4_internal2
> -
> pxor m9, m9
> movhlps m9, m10
> paddd m10, m9
> @@ -1182,6 +825,402 @@
> movd eax, m10
> RET
>
> +cglobal pixel_satd_32x32, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> +cglobal pixel_satd_32x64, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> +cglobal pixel_satd_48x64, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> +cglobal pixel_satd_64x16, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> +cglobal pixel_satd_64x32, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> +
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> +cglobal pixel_satd_64x48, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> +
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> +cglobal pixel_satd_64x64, 4,8,11 ;if WIN64 && notcpuflag(avx)
> + SATD_START_SSE2 m10, m7
> + mov r6, r0
> + mov r7, r2
> +%if vertical
> + mova m7, [pw_00ff]
> +%endif
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> + call pixel_satd_16x4_internal2
> +
> + pxor m9, m9
> + movhlps m9, m10
> + paddd m10, m9
> + pshufd m9, m10, 1
> + paddd m10, m9
> + movd eax, m10
> + RET
> +
> %else
>
> %if WIN64
> @@ -1189,17 +1228,23 @@
> SATD_START_SSE2 m6, m7
> mov r6, r0
> mov r7, r2
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 16*SIZEOF_PIXEL]
> lea r2, [r7 + 16*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 24*SIZEOF_PIXEL]
> lea r2, [r7 + 24*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
> %else
> cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
> SATD_START_SSE2 m6, m7
> @@ -1226,21 +1271,27 @@
> SATD_START_SSE2 m6, m7
> mov r6, r0
> mov r7, r2
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 16*SIZEOF_PIXEL]
> lea r2, [r7 + 16*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 24*SIZEOF_PIXEL]
> lea r2, [r7 + 24*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
> %else
> cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
> SATD_START_SSE2 m6, m7
> @@ -1271,28 +1322,34 @@
> SATD_START_SSE2 m6, m7
> mov r6, r0
> mov r7, r2
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 16*SIZEOF_PIXEL]
> lea r2, [r7 + 16*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 24*SIZEOF_PIXEL]
> lea r2, [r7 + 24*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6, m7
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
> %else
> cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
> SATD_START_SSE2 m6, m7
> @@ -1333,32 +1390,38 @@
> SATD_START_SSE2 m6, m7
> mov r6, r0
> mov r7, r2
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 16*SIZEOF_PIXEL]
> lea r2, [r7 + 16*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 24*SIZEOF_PIXEL]
> lea r2, [r7 + 24*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6, m7
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
> %else
> cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
> SATD_START_SSE2 m6, m7
> @@ -1656,44 +1719,50 @@
> SATD_START_SSE2 m6, m7
> mov r6, r0
> mov r7, r2
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 16*SIZEOF_PIXEL]
> lea r2, [r7 + 16*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 24*SIZEOF_PIXEL]
> lea r2, [r7 + 24*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 32*SIZEOF_PIXEL]
> lea r2, [r7 + 32*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 40*SIZEOF_PIXEL]
> lea r2, [r7 + 40*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 48*SIZEOF_PIXEL]
> lea r2, [r7 + 48*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> SATD_ACCUM m6, m0, m7
> lea r0, [r6 + 56*SIZEOF_PIXEL]
> lea r2, [r7 + 56*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6, m7
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
> %else
> cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
> SATD_START_SSE2 m6, m7
> @@ -2360,25 +2429,29 @@
> SATD_START_SSE2 m6, m7
> mov r6, r0
> mov r7, r2
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_ACCUM m6, m0, m7
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_ACCUM m6, m0, m7
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> lea r0, [r6 + 16*SIZEOF_PIXEL]
> lea r2, [r7 + 16*SIZEOF_PIXEL]
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6, m7
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
> %else
> cglobal pixel_satd_24x32, 4,7,8,0-gprsize
> SATD_START_SSE2 m6, m7
> @@ -2415,22 +2488,40 @@
> %if vertical
> mova m7, [pw_00ff]
> %endif
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
>
> cglobal pixel_satd_8x16, 4,6,8
> SATD_START_SSE2 m6, m7
> - call pixel_satd_8x8_internal
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6
> + call pixel_satd_8x8_internal2
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
>
> cglobal pixel_satd_8x8, 4,6,8
> SATD_START_SSE2 m6, m7
> - call pixel_satd_8x8_internal
> - SATD_END_SSE2 m6
> + call pixel_satd_8x8_internal2
> + pxor m7, m7
> + movhlps m7, m6
> + paddd m6, m7
> + pshufd m7, m6, 1
> + paddd m6, m7
> + movd eax, m6
> + RET
>
> cglobal pixel_satd_8x4, 4,6,8
> SATD_START_SSE2 m6, m7
> @@ -2438,7 +2529,6 @@
> SATD_END_SSE2 m6
> %endmacro ; SATDS_SSE2
>
> -
> ;=============================================================================
> ; SA8D
> ;=============================================================================
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140128/1e842031/attachment-0001.html>
More information about the x265-devel
mailing list