[x265] [PATCH] asm: assembly code for pixel_satd_64x64
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Thu Nov 14 13:07:12 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1384430674 -19800
# Thu Nov 14 17:34:34 2013 +0530
# Node ID cb15dab6333f3ce23083274718754ca588596547
# Parent 125f9c97e57737fbcf0bc616e1337265a5090440
asm: assembly code for pixel_satd_64x64
diff -r 125f9c97e577 -r cb15dab6333f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 14 16:42:05 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 14 17:34:34 2013 +0530
@@ -61,7 +61,7 @@
#define HEVC_SATD(cpu) \
p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
- p.satd[LUMA_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
+ p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
p.satd[LUMA_32x64] = cmp<32, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
diff -r 125f9c97e577 -r cb15dab6333f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Nov 14 16:42:05 2013 +0530
+++ b/source/common/x86/pixel-a.asm Thu Nov 14 17:34:34 2013 +0530
@@ -2035,6 +2035,92 @@
movd eax, m10
RET
+cglobal pixel_satd_64x64, 4,8,11 ;if WIN64 && notcpuflag(avx); 11 XMM regs declared because m9/m10 are used below and xmm6-xmm15 are callee-saved on Win64 (NOTE(review): raise further if pixel_satd_16x4_internal2 touches higher registers)
+ SATD_START_SSE2 m10, m7 ; project macro (defined elsewhere); m10 is the register reduced to the return value at the end
+ mov r6, r0 ; keep the original r0 so later columns can be addressed as r6 + offset
+ mov r7, r2 ; keep the original r2 likewise
+%if vertical
+ mova m7, [pw_00ff] ; load the pw_00ff constant into m7 for the vertical variant
+%endif
+ call pixel_satd_16x4_internal2 ; column 0: 16 back-to-back helper calls (presumably 4 rows each -> 64 rows; helper not shown here)
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 16] ; column 1: restart from the saved r0, 16 bytes to the right
+ lea r2, [r7 + 16] ; same offset from the saved r2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 32] ; column 2: saved r0 + 32
+ lea r2, [r7 + 32] ; saved r2 + 32
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ lea r0, [r6 + 48] ; column 3: saved r0 + 48
+ lea r2, [r7 + 48] ; saved r2 + 48
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+ call pixel_satd_16x4_internal2
+
+ pxor m9, m9 ; clear the scratch register before the horizontal add
+ movhlps m9, m10 ; low qword of m9 = high qword of m10
+ paddd m10, m9 ; dwords 0 and 1 of m10 now hold lane0+lane2 and lane1+lane3
+ pshufd m9, m10, 1 ; dword 0 of m9 = dword 1 of m10
+ paddd m10, m9 ; dword 0 of m10 = sum of all four 32-bit lanes
+ movd eax, m10 ; return that 32-bit total in eax
+ RET
+
%else
cglobal pixel_satd_32x8, 4,6,8 ;if !WIN64
@@ -2599,6 +2685,200 @@
RET
%endif
+%if WIN64
+cglobal pixel_satd_64x64, 4,8,9 ;if WIN64 && cpuflag(avx); x86inc cglobal: 4 arguments, 8 GPRs, 9 XMM registers (m6-m8 are referenced in this body)
+ SATD_START_SSE2 m6, m7 ; project macro (defined elsewhere); m6 is the register reduced to the return value at the end
+ mov r6, r0 ; keep the original r0 so each later column can be addressed as r6 + offset
+ mov r7, r2 ; keep the original r2 likewise
+ call pixel_satd_8x8_internal2 ; column 0: 8 back-to-back helper calls (presumably 8 rows each -> 64 rows; helper not shown here)
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 8] ; column 1: restart from the saved r0, 8 bytes to the right
+ lea r2, [r7 + 8] ; same offset from the saved r2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 16] ; column 2: saved r0 + 16
+ lea r2, [r7 + 16] ; saved r2 + 16
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 24] ; column 3: saved r0 + 24
+ lea r2, [r7 + 24] ; saved r2 + 24
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 32] ; column 4: saved r0 + 32
+ lea r2, [r7 + 32] ; saved r2 + 32
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 40] ; column 5: saved r0 + 40
+ lea r2, [r7 + 40] ; saved r2 + 40
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 48] ; column 6: saved r0 + 48
+ lea r2, [r7 + 48] ; saved r2 + 48
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 56] ; column 7: saved r0 + 56
+ lea r2, [r7 + 56] ; saved r2 + 56
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m8, m8 ; clear the scratch register before the horizontal add
+ movhlps m8, m6 ; low qword of m8 = high qword of m6
+ paddd m6, m8 ; dwords 0 and 1 of m6 now hold lane0+lane2 and lane1+lane3
+ pshufd m8, m6, 1 ; dword 0 of m8 = dword 1 of m6
+ paddd m6, m8 ; dword 0 of m6 = sum of all four 32-bit lanes
+ movd eax, m6 ; return that 32-bit total in eax
+ RET
+%else
+cglobal pixel_satd_64x64, 4,6,8 ;if !WIN64; x86inc cglobal: 4 arguments, 6 GPRs, 8 XMM registers (only m6/m7 are referenced in this body)
+ SATD_START_SSE2 m6, m7 ; project macro (defined elsewhere); m6 is the register reduced to the return value at the end
+ call pixel_satd_8x8_internal2 ; column 0: 8 back-to-back helper calls (presumably 8 rows each -> 64 rows; helper not shown here)
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 from its x86inc home location (no spare GPRs hold a copy in this variant). NOTE(review): where the argument arrives in a register, r0mp presumably aliases r0 itself and this is a no-op - confirm which targets assemble this branch
+ mov r2, r2mp ; reload argument 2 the same way
+ add r0, 8 ; column 1: 8 bytes to the right of the reloaded r0
+ add r2, 8 ; same offset for r2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 for the next column
+ mov r2, r2mp ; reload argument 2 for the next column
+ add r0, 16 ; column 2: offset 16
+ add r2, 16
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 for the next column
+ mov r2, r2mp ; reload argument 2 for the next column
+ add r0, 24 ; column 3: offset 24
+ add r2, 24
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 for the next column
+ mov r2, r2mp ; reload argument 2 for the next column
+ add r0, 32 ; column 4: offset 32
+ add r2, 32
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 for the next column
+ mov r2, r2mp ; reload argument 2 for the next column
+ add r0, 40 ; column 5: offset 40
+ add r2, 40
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 for the next column
+ mov r2, r2mp ; reload argument 2 for the next column
+ add r0, 48 ; column 6: offset 48
+ add r2, 48
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ mov r0, r0mp ; reload argument 0 for the next column
+ mov r2, r2mp ; reload argument 2 for the next column
+ add r0, 56 ; column 7: offset 56
+ add r2, 56
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ pxor m7, m7 ; m7 is reused as scratch here: clear it before the horizontal add
+ movhlps m7, m6 ; low qword of m7 = high qword of m6
+ paddd m6, m7 ; dwords 0 and 1 of m6 now hold lane0+lane2 and lane1+lane3
+ pshufd m7, m6, 1 ; dword 0 of m7 = dword 1 of m6
+ paddd m6, m7 ; dword 0 of m6 = sum of all four 32-bit lanes
+ movd eax, m6 ; return that 32-bit total in eax
+ RET
+%endif
+
cglobal pixel_satd_16x4, 4,6,8
SATD_START_SSE2 m6, m7
BACKUP_POINTERS
More information about the x265-devel
mailing list