[x265] [PATCH] asm: assembly code for pixel_satd_32x64 and pixel_satd_48x64

Yuvaraj Venkatesh yuvaraj at multicorewareinc.com
Thu Nov 14 14:41:56 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1384436305 -19800
#      Thu Nov 14 19:08:25 2013 +0530
# Node ID 95ed43fcd3f3f94c8bfe2f0ccb4e4b0fe38e2731
# Parent  cb15dab6333f3ce23083274718754ca588596547
asm: assembly code for pixel_satd_32x64 and pixel_satd_48x64

diff -r cb15dab6333f -r 95ed43fcd3f3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 14 17:34:34 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 14 19:08:25 2013 +0530
@@ -63,9 +63,9 @@
     p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
     p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
     p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
-    p.satd[LUMA_32x64] = cmp<32, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
+    p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
     p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
-    p.satd[LUMA_48x64] = cmp<48, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
+    p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
     p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
 
 #define ASSGN_SSE(cpu) \
diff -r cb15dab6333f -r 95ed43fcd3f3 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Nov 14 17:34:34 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Nov 14 19:08:25 2013 +0530
@@ -1880,83 +1880,47 @@
     call pixel_satd_16x4_internal
     SATD_END_SSE2 m10
 
-cglobal pixel_satd_64x16, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_32x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
 %if vertical
     mova m7, [pw_00ff]
 %endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
     lea r0, [r6 + 16]
     lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 32]
-    lea r2, [r7 + 32]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    lea r0, [r6 + 48]
-    lea r2, [r7 + 48]
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-
-cglobal pixel_satd_64x32, 4,8,8    ;if WIN64 && notcpuflag(avx)
-    SATD_START_SSE2 m10, m7
-    mov r6, r0
-    mov r7, r2
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    lea r0, [r6 + 32]
-    lea r2, [r7 + 32]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    lea r0, [r6 + 48]
-    lea r2, [r7 + 48]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
     pxor     m9, m9
     movhlps  m9, m10
     paddd   m10, m9
@@ -1965,7 +1929,7 @@
     movd    eax, m10
     RET
 
-cglobal pixel_satd_64x48, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_48x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
@@ -1984,6 +1948,10 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
     lea r0, [r6 + 16]
     lea r2, [r7 + 16]
     call pixel_satd_16x4_internal2
@@ -1998,6 +1966,10 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
     lea r0, [r6 + 32]
     lea r2, [r7 + 32]
     call pixel_satd_16x4_internal2
@@ -2012,21 +1984,10 @@
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
-    lea r0, [r6 + 48]
-    lea r2, [r7 + 48]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
     pxor     m9, m9
     movhlps  m9, m10
     paddd   m10, m9
@@ -2035,75 +1996,74 @@
     movd    eax, m10
     RET
 
-cglobal pixel_satd_64x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
+cglobal pixel_satd_64x16, 4,8,8    ;if WIN64 && notcpuflag(avx)
     SATD_START_SSE2 m10, m7
     mov r6, r0
     mov r7, r2
 %if vertical
     mova m7, [pw_00ff]
 %endif
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
     lea r0, [r6 + 16]
     lea r2, [r7 + 16]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
     lea r0, [r6 + 32]
     lea r2, [r7 + 32]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
     lea r0, [r6 + 48]
     lea r2, [r7 + 48]
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
-    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    SATD_END_SSE2 m10
+
+cglobal pixel_satd_64x32, 4,8,8    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 32]
+    lea r2, [r7 + 32]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 48]
+    lea r2, [r7 + 48]
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
     call pixel_satd_16x4_internal2
@@ -2121,6 +2081,162 @@
     movd    eax, m10
     RET
 
+cglobal pixel_satd_64x48, 4,8,8    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7    ; setup macro defined outside this patch; m10 is the register reduced into eax at the end
+    mov r6, r0    ; keep the incoming r0 so every column pointer can be rebuilt from it
+    mov r7, r2    ; same for r2
+%if vertical
+    mova m7, [pw_00ff]    ; m7 = pw_00ff constant, loaded only when `vertical` is set
+%endif
+    call pixel_satd_16x4_internal2    ; column at offset 0: 12 helper calls (presumably 4 rows each = 48 rows; confirm against the helper)
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]    ; column at offset 16: r0 = saved r0 + 16
+    lea r2, [r7 + 16]    ; r2 = saved r2 + 16
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 32]    ; column at offset 32
+    lea r2, [r7 + 32]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 48]    ; column at offset 48 (last of the four)
+    lea r2, [r7 + 48]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+
+    pxor     m9, m9    ; clear the scratch register
+    movhlps  m9, m10    ; m9 low 64 bits = m10 high 64 bits
+    paddd   m10, m9    ; dword lanes 0/1 of m10 = lane0+lane2, lane1+lane3
+    pshufd   m9, m10, 1    ; m9 lane 0 = m10 lane 1
+    paddd   m10, m9    ; m10 lane 0 = sum of the four dwords m10 held before the reduction
+    movd    eax, m10    ; return that sum in eax
+    RET
+
+cglobal pixel_satd_64x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7    ; setup macro defined outside this patch; m10 is the register reduced into eax at the end
+    mov r6, r0    ; keep the incoming r0 so every column pointer can be rebuilt from it
+    mov r7, r2    ; same for r2
+%if vertical
+    mova m7, [pw_00ff]    ; m7 = pw_00ff constant, loaded only when `vertical` is set
+%endif
+    call pixel_satd_16x4_internal2    ; column at offset 0: 16 helper calls (presumably 4 rows each = 64 rows; confirm against the helper)
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]    ; column at offset 16: r0 = saved r0 + 16
+    lea r2, [r7 + 16]    ; r2 = saved r2 + 16
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 32]    ; column at offset 32
+    lea r2, [r7 + 32]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 48]    ; column at offset 48 (last of the four)
+    lea r2, [r7 + 48]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+
+    pxor     m9, m9    ; clear the scratch register
+    movhlps  m9, m10    ; m9 low 64 bits = m10 high 64 bits
+    paddd   m10, m9    ; dword lanes 0/1 of m10 = lane0+lane2, lane1+lane3
+    pshufd   m9, m10, 1    ; m9 lane 0 = m10 lane 1
+    paddd   m10, m9    ; m10 lane 0 = sum of the four dwords m10 held before the reduction
+    movd    eax, m10    ; return that sum in eax
+    RET
+
 %else
 
 cglobal pixel_satd_32x8, 4,6,8    ;if !WIN64
@@ -2308,6 +2424,263 @@
 %endif
 
 %if WIN64
+cglobal pixel_satd_32x64, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7    ; setup macro defined outside this patch; m6 is the register reduced into eax at the end
+    mov r6, r0    ; keep the incoming r0 so every column pointer can be rebuilt from it
+    mov r7, r2    ; same for r2
+    call pixel_satd_8x8_internal2    ; column at offset 0: 8 helper calls (presumably 8 rows each = 64 rows; confirm against the helper)
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8]    ; column at offset 8: r0 = saved r0 + 8
+    lea r2, [r7 + 8]    ; r2 = saved r2 + 8
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16]    ; column at offset 16
+    lea r2, [r7 + 16]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24]    ; column at offset 24 (last of the four)
+    lea r2, [r7 + 24]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7    ; clear m7 for use as scratch
+    movhlps m7, m6    ; m7 low 64 bits = m6 high 64 bits
+    paddd   m6, m7    ; dword lanes 0/1 of m6 = lane0+lane2, lane1+lane3
+    pshufd  m7, m6, 1    ; m7 lane 0 = m6 lane 1
+    paddd   m6, m7    ; m6 lane 0 = sum of the four dwords m6 held before the reduction
+    movd   eax, m6    ; return that sum in eax
+    RET
+%else
+cglobal pixel_satd_32x64, 4,6,8    ;if !WIN64
+    SATD_START_SSE2 m6, m7    ; setup macro defined outside this patch; m6 is the register reduced into eax at the end
+    call pixel_satd_8x8_internal2    ; column at offset 0: 8 helper calls (presumably 8 rows each = 64 rows; confirm against the helper)
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp    ; reload r0 from r0mp (defined outside this patch; presumably the original argument) - NOTE(review): confirm r0mp still holds the unmodified pointer on every non-WIN64 target
+    mov r2, r2mp    ; same reload for r2
+    add r0, 8    ; column at offset 8
+    add r2, 8
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp    ; reload both pointers again before applying the next offset
+    mov r2, r2mp
+    add r0, 16    ; column at offset 16
+    add r2, 16
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 24    ; column at offset 24 (last of the four)
+    add r2, 24
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7    ; clear m7 for use as scratch
+    movhlps m7, m6    ; m7 low 64 bits = m6 high 64 bits
+    paddd   m6, m7    ; dword lanes 0/1 of m6 = lane0+lane2, lane1+lane3
+    pshufd  m7, m6, 1    ; m7 lane 0 = m6 lane 1
+    paddd   m6, m7    ; m6 lane 0 = sum of the four dwords m6 held before the reduction
+    movd   eax, m6    ; return that sum in eax
+    RET
+%endif
+
+%if WIN64
+cglobal pixel_satd_48x64, 4,8,8    ;if WIN64 && cpuflag(avx)
+    SATD_START_SSE2 m6, m7    ; setup macro defined outside this patch; m6 is the register reduced into eax at the end
+    mov r6, r0    ; keep the incoming r0 so every column pointer can be rebuilt from it
+    mov r7, r2    ; same for r2
+    call pixel_satd_8x8_internal2    ; column at offset 0: 8 helper calls (presumably 8 rows each = 64 rows; confirm against the helper)
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8]    ; column at offset 8: r0 = saved r0 + 8
+    lea r2, [r7 + 8]    ; r2 = saved r2 + 8
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16]    ; column at offset 16
+    lea r2, [r7 + 16]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24]    ; column at offset 24
+    lea r2, [r7 + 24]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 32]    ; column at offset 32
+    lea r2, [r7 + 32]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 40]    ; column at offset 40 (last of the six)
+    lea r2, [r7 + 40]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7    ; clear m7 for use as scratch
+    movhlps m7, m6    ; m7 low 64 bits = m6 high 64 bits
+    paddd   m6, m7    ; dword lanes 0/1 of m6 = lane0+lane2, lane1+lane3
+    pshufd  m7, m6, 1    ; m7 lane 0 = m6 lane 1
+    paddd   m6, m7    ; m6 lane 0 = sum of the four dwords m6 held before the reduction
+    movd   eax, m6    ; return that sum in eax
+    RET
+%else
+cglobal pixel_satd_48x64, 4,6,8    ;if !WIN64
+    SATD_START_SSE2 m6, m7    ; setup macro defined outside this patch; m6 is the register reduced into eax at the end
+    call pixel_satd_8x8_internal2    ; column at offset 0: 8 helper calls (presumably 8 rows each = 64 rows; confirm against the helper)
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp    ; reload r0 from r0mp (defined outside this patch; presumably the original argument) - NOTE(review): confirm r0mp still holds the unmodified pointer on every non-WIN64 target
+    mov r2, r2mp    ; same reload for r2
+    add r0, 8    ; column at offset 8
+    add r2, 8
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp    ; reload both pointers again before applying the next offset
+    mov r2, r2mp
+    add r0, 16    ; column at offset 16
+    add r2, 16
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 24    ; column at offset 24
+    add r2, 24
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 32    ; column at offset 32
+    add r2, 32
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 40    ; column at offset 40 (last of the six)
+    add r2, 40
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7    ; clear m7 for use as scratch
+    movhlps m7, m6    ; m7 low 64 bits = m6 high 64 bits
+    paddd   m6, m7    ; dword lanes 0/1 of m6 = lane0+lane2, lane1+lane3
+    pshufd  m7, m6, 1    ; m7 lane 0 = m6 lane 1
+    paddd   m6, m7    ; m6 lane 0 = sum of the four dwords m6 held before the reduction
+    movd   eax, m6    ; return that sum in eax
+    RET
+%endif
+
+
+%if WIN64
 cglobal pixel_satd_64x16, 4,8,8    ;if WIN64 && cpuflag(avx)
     SATD_START_SSE2 m6, m7
     mov r6, r0


More information about the x265-devel mailing list