[x265] [PATCH] asm: fix overflow in pixel_satd asm functions for 64-bit build

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Wed Jan 29 07:35:48 CET 2014


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1390977306 -19800
#      Wed Jan 29 12:05:06 2014 +0530
# Branch stable
# Node ID a2f0f9a5e6b91ef71517915200cdee42383cebc1
# Parent  4ec459e04f9efc311bb567fe98969a2e2c5ac92a
asm: fix overflow in pixel_satd asm functions for 64-bit build

diff -r 4ec459e04f9e -r a2f0f9a5e6b9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jan 28 13:53:13 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Wed Jan 29 12:05:06 2014 +0530
@@ -64,14 +64,30 @@
 #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
 
 #define HEVC_SATD(cpu) \
-    p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
-    p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
-    p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
-    p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
-    p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
-    p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
-    p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
-    p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
+    p.satd[LUMA_4x8]   = x265_pixel_satd_4x8_ ## cpu; \
+    p.satd[LUMA_4x16]   = x265_pixel_satd_4x16_ ## cpu; \
+    p.satd[LUMA_8x4]   = x265_pixel_satd_8x4_ ## cpu; \
+    p.satd[LUMA_8x8]   = x265_pixel_satd_8x8_ ## cpu; \
+    p.satd[LUMA_8x16]   = x265_pixel_satd_8x16_ ## cpu; \
+    p.satd[LUMA_8x32]   = x265_pixel_satd_8x32_ ## cpu; \
+    p.satd[LUMA_12x16]   = x265_pixel_satd_12x16_ ## cpu; \
+    p.satd[LUMA_16x4]   = x265_pixel_satd_16x4_ ## cpu; \
+    p.satd[LUMA_16x8]   = x265_pixel_satd_16x8_ ## cpu; \
+    p.satd[LUMA_16x12]   = x265_pixel_satd_16x12_ ## cpu; \
+    p.satd[LUMA_16x16]   = x265_pixel_satd_16x16_ ## cpu; \
+    p.satd[LUMA_16x32]   = x265_pixel_satd_16x32_ ## cpu; \
+    p.satd[LUMA_16x64]   = x265_pixel_satd_16x64_ ## cpu; \
+    p.satd[LUMA_24x32]   = x265_pixel_satd_24x32_ ## cpu; \
+    p.satd[LUMA_32x8]   = x265_pixel_satd_32x8_ ## cpu; \
+    p.satd[LUMA_32x16]   = x265_pixel_satd_32x16_ ## cpu; \
+    p.satd[LUMA_32x24]   = x265_pixel_satd_32x24_ ## cpu; \
+    p.satd[LUMA_32x32]   = x265_pixel_satd_32x32_ ## cpu; \
+    p.satd[LUMA_32x64]   = x265_pixel_satd_32x64_ ## cpu; \
+    p.satd[LUMA_48x64]   = x265_pixel_satd_48x64_ ## cpu; \
+    p.satd[LUMA_64x16]   = x265_pixel_satd_64x16_ ## cpu; \
+    p.satd[LUMA_64x32]   = x265_pixel_satd_64x32_ ## cpu; \
+    p.satd[LUMA_64x48]   = x265_pixel_satd_64x48_ ## cpu; \
+    p.satd[LUMA_64x64]   = x265_pixel_satd_64x64_ ## cpu;
 
 #define SAD_X3(cpu) \
     p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
@@ -767,17 +783,8 @@
         INIT8(sad, _mmx2);
         INIT8(sad_x3, _mmx2);
         INIT8(sad_x4, _mmx2);
-        INIT8(satd, _mmx2);
+        p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
         p.sa8d_inter[LUMA_4x4]  = x265_pixel_satd_4x4_mmx2;
-        p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
-        p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
-        p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
-        p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
-        p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
-        p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
-        p.satd[LUMA_32x8]  = x265_pixel_satd_32x8_sse2;
-        p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
-        p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
         p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
 
         PIXEL_AVG(sse2);
@@ -810,7 +817,6 @@
         INIT2(sad, _sse2);
         INIT2(sad_x3, _sse2);
         INIT2(sad_x4, _sse2);
-        INIT6(satd, _sse2);
         HEVC_SATD(sse2);
 
         CHROMA_BLOCKCOPY(_sse2);
@@ -916,11 +922,7 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
-        p.satd[LUMA_4x16]   = x265_pixel_satd_4x16_sse4;
-        p.satd[LUMA_12x16]  = x265_pixel_satd_12x16_sse4;
-        p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse4;
-        p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse4;
-        p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse4;
+        HEVC_SATD(sse4);
         SA8D_INTER_FROM_BLOCK(sse4);
 
         p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
@@ -938,7 +940,6 @@
 
         CHROMA_FILTERS(_sse4);
         LUMA_FILTERS(_sse4);
-        HEVC_SATD(sse4);
         ASSGN_SSE_SS(sse4);
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
@@ -1005,17 +1006,13 @@
     if (cpuMask & X265_CPU_AVX)
     {
         p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
-        p.satd[LUMA_4x16]   = x265_pixel_satd_4x16_avx;
-        p.satd[LUMA_12x16]  = x265_pixel_satd_12x16_avx;
-        p.satd[LUMA_32x8] = x265_pixel_satd_32x8_avx;
-        p.satd[LUMA_32x16] = x265_pixel_satd_32x16_avx;
-        p.satd[LUMA_32x24] = x265_pixel_satd_32x24_avx;
+        HEVC_SATD(avx);
         SA8D_INTER_FROM_BLOCK(avx);
         ASSGN_SSE(avx);
-        HEVC_SATD(avx);
+        
         ASSGN_SSE_SS(avx);
         SAD_X3(avx);
-        SAD_X3(avx);
+        SAD_X4(avx);
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
         p.sad_x3[LUMA_16x4]  = x265_pixel_sad_x3_16x4_avx;
diff -r 4ec459e04f9e -r a2f0f9a5e6b9 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Jan 28 13:53:13 2014 -0600
+++ b/source/common/x86/pixel-a.asm	Wed Jan 29 12:05:06 2014 +0530
@@ -669,163 +669,196 @@
     SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
     ret
 
-cglobal pixel_satd_16x4, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_16x8, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    jmp %%pixel_satd_16x8_internal
-
-cglobal pixel_satd_16x12, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    jmp %%pixel_satd_16x8_internal
-    SATD_END_SSE2 m10  
-
-cglobal pixel_satd_16x32, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    jmp %%pixel_satd_16x8_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_16x64, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    jmp %%pixel_satd_16x8_internal
-    SATD_END_SSE2 m10    
-
-cglobal pixel_satd_16x16, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-%%pixel_satd_16x8_internal:
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x8, 4,8,8    ;if WIN64 && notcpuflag(avx)
-    SATD_START_SSE2 m10, m7
-    mov r6, r0
-    mov r7, r2
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x16, 4,8,8    ;if WIN64 && notcpuflag(avx)
-    SATD_START_SSE2 m10, m7
-    mov r6, r0
-    mov r7, r2
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x24, 4,8,8    ;if WIN64 && notcpuflag(avx)
-    SATD_START_SSE2 m10, m7
-    mov r6, r0
-    mov r7, r2
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x32, 4,8,8    ;if WIN64 && notcpuflag(avx)
-    SATD_START_SSE2 m10, m7
-    mov r6, r0
-    mov r7, r2
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_32x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_16x4, 4,6,12
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_16x8, 4,6,12
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    jmp %%pixel_satd_16x8_internal
+
+cglobal pixel_satd_16x12, 4,6,12
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    jmp %%pixel_satd_16x8_internal
+
+cglobal pixel_satd_16x32, 4,6,12
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    jmp %%pixel_satd_16x8_internal
+
+cglobal pixel_satd_16x64, 4,6,12
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    jmp %%pixel_satd_16x8_internal
+
+cglobal pixel_satd_16x16, 4,6,12
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+%%pixel_satd_16x8_internal:
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_32x8, 4,8,11    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_32x16, 4,8,11    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_32x24, 4,8,11    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_32x32, 4,8,11    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_32x64, 4,8,11    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
@@ -874,7 +907,7 @@
     movd    eax, m10
     RET
 
-cglobal pixel_satd_48x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_48x64, 4,8,11    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
@@ -941,38 +974,44 @@
     movd    eax, m10
     RET
 
-cglobal pixel_satd_64x16, 4,8,8    ;if WIN64 && notcpuflag(avx)
-    SATD_START_SSE2 m10, m7
-    mov r6, r0
-    mov r7, r2
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 32]
-    lea r2, [r7 + 32]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 48]
-    lea r2, [r7 + 48]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_64x32, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_64x16, 4,8,11    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 32]
+    lea r2, [r7 + 32]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 48]
+    lea r2, [r7 + 48]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    pxor     m9, m9
+    movhlps  m9, m10
+    paddd   m10, m9
+    pshufd   m9, m10, 1
+    paddd   m10, m9
+    movd    eax, m10
+    RET
+
+cglobal pixel_satd_64x32, 4,8,11    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
@@ -1026,7 +1065,7 @@
     movd    eax, m10
     RET
 
-cglobal pixel_satd_64x48, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_64x48, 4,8,11    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
@@ -1096,7 +1135,7 @@
     movd    eax, m10
     RET
 
-cglobal pixel_satd_64x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_64x64, 4,8,11    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
@@ -1185,21 +1224,27 @@
 %else
 
 %if WIN64
-cglobal pixel_satd_32x8, 4,8,8    ;if WIN64 && cpuflag(avx)
-    SATD_START_SSE2 m6, m7
-    mov r6, r0
-    mov r7, r2
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 8*SIZEOF_PIXEL]
-    lea r2, [r7 + 8*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 16*SIZEOF_PIXEL]
-    lea r2, [r7 + 16*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 24*SIZEOF_PIXEL]
-    lea r2, [r7 + 24*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+cglobal pixel_satd_32x8, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7
+    mov r6, r0
+    mov r7, r2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %else
 cglobal pixel_satd_32x8, 4,7,8,0-gprsize    ;if !WIN64
     SATD_START_SSE2 m6, m7
@@ -1222,25 +1267,31 @@
 %endif
 
 %if WIN64
-cglobal pixel_satd_32x16, 4,8,8    ;if WIN64 && cpuflag(avx)
-    SATD_START_SSE2 m6, m7
-    mov r6, r0
-    mov r7, r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 8*SIZEOF_PIXEL]
-    lea r2, [r7 + 8*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 16*SIZEOF_PIXEL]
-    lea r2, [r7 + 16*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 24*SIZEOF_PIXEL]
-    lea r2, [r7 + 24*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+cglobal pixel_satd_32x16, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7
+    mov r6, r0
+    mov r7, r2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %else
 cglobal pixel_satd_32x16, 4,7,8,0-gprsize   ;if !WIN64
     SATD_START_SSE2 m6, m7
@@ -1267,32 +1318,35 @@
 %endif
 
 %if WIN64
-cglobal pixel_satd_32x24, 4,8,8    ;if WIN64 && cpuflag(avx)
-    SATD_START_SSE2 m6, m7
-    mov r6, r0
-    mov r7, r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 8*SIZEOF_PIXEL]
-    lea r2, [r7 + 8*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 16*SIZEOF_PIXEL]
-    lea r2, [r7 + 16*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 24*SIZEOF_PIXEL]
-    lea r2, [r7 + 24*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+cglobal pixel_satd_32x24, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7
+    mov r6, r0
+    mov r7, r2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %else
 cglobal pixel_satd_32x24, 4,7,8,0-gprsize   ;if !WIN64
     SATD_START_SSE2 m6, m7
@@ -1329,36 +1383,39 @@
 %endif
 
 %if WIN64
-cglobal pixel_satd_32x32, 4,8,8    ;if WIN64 && cpuflag(avx)
-    SATD_START_SSE2 m6, m7
-    mov r6, r0
-    mov r7, r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 8*SIZEOF_PIXEL]
-    lea r2, [r7 + 8*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 16*SIZEOF_PIXEL]
-    lea r2, [r7 + 16*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 24*SIZEOF_PIXEL]
-    lea r2, [r7 + 24*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+cglobal pixel_satd_32x32, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7
+    mov r6, r0
+    mov r7, r2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %else
 cglobal pixel_satd_32x32, 4,7,8,0-gprsize   ;if !WIN64
     SATD_START_SSE2 m6, m7
@@ -1652,48 +1709,47 @@
 
 
 %if WIN64
-cglobal pixel_satd_64x16, 4,8,8    ;if WIN64 && cpuflag(avx)
-    SATD_START_SSE2 m6, m7
-    mov r6, r0
-    mov r7, r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 8*SIZEOF_PIXEL]
-    lea r2, [r7 + 8*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 16*SIZEOF_PIXEL]
-    lea r2, [r7 + 16*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 24*SIZEOF_PIXEL]
-    lea r2, [r7 + 24*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 32*SIZEOF_PIXEL]
-    lea r2, [r7 + 32*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 40*SIZEOF_PIXEL]
-    lea r2, [r7 + 40*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 48*SIZEOF_PIXEL]
-    lea r2, [r7 + 48*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    lea r0, [r6 + 56*SIZEOF_PIXEL]
-    lea r2, [r7 + 56*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+cglobal pixel_satd_64x16, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7
+    mov r6, r0
+    mov r7, r2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 32*SIZEOF_PIXEL]
+    lea r2, [r7 + 32*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 40*SIZEOF_PIXEL]
+    lea r2, [r7 + 40*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 48*SIZEOF_PIXEL]
+    lea r2, [r7 + 48*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 56*SIZEOF_PIXEL]
+    lea r2, [r7 + 56*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %else
 cglobal pixel_satd_64x16, 4,7,8,0-gprsize   ;if !WIN64
     SATD_START_SSE2 m6, m7
@@ -2360,25 +2416,29 @@
     SATD_START_SSE2 m6, m7
     mov r6, r0
     mov r7, r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     lea r2, [r7 + 8*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 16*SIZEOF_PIXEL]
     lea r2, [r7 + 16*SIZEOF_PIXEL]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %else
 cglobal pixel_satd_24x32, 4,7,8,0-gprsize
     SATD_START_SSE2 m6, m7
@@ -2410,31 +2470,49 @@
     SATD_END_SSE2 m6, m7
 %endif    ;WIN64
 
-cglobal pixel_satd_8x32, 4,6,8
-    SATD_START_SSE2 m6, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
-cglobal pixel_satd_8x16, 4,6,8
-    SATD_START_SSE2 m6, m7
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
-cglobal pixel_satd_8x8, 4,6,8
-    SATD_START_SSE2 m6, m7
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
-cglobal pixel_satd_8x4, 4,6,8
-    SATD_START_SSE2 m6, m7
-    call %%pixel_satd_8x4_internal
+cglobal pixel_satd_8x32, 4,6,8
+    SATD_START_SSE2 m6, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
+
+cglobal pixel_satd_8x16, 4,6,8
+    SATD_START_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
+
+cglobal pixel_satd_8x8, 4,6,8
+    SATD_START_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
+
+cglobal pixel_satd_8x4, 4,6,8
+    SATD_START_SSE2 m6, m7
+    call %%pixel_satd_8x4_internal
     SATD_END_SSE2 m6
 %endmacro ; SATDS_SSE2
 


More information about the x265-devel mailing list