[x265] [PATCH] asm: assembly code for pixel_sa8d_64x48
yuvaraj at multicorewareinc.com
Thu Nov 21 08:20:46 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385017172 -19800
# Thu Nov 21 12:29:32 2013 +0530
# Node ID 4821c41544b3ff487ef137914f3dfb509e6368e8
# Parent db1151bb4974f1288745ba39dfd6e1838113feb7
asm: assembly code for pixel_sa8d_64x48
diff -r db1151bb4974 -r 4821c41544b3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Nov 20 18:36:04 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 21 12:29:32 2013 +0530
@@ -104,7 +104,7 @@
p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \
p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \
- p.sa8d_inter[LUMA_64x48] = cmp<64, 48, 16, 16, x265_pixel_sa8d_16x16_ ## cpu>; \
+ p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \
p.sa8d_inter[LUMA_48x64] = cmp<48, 64, 16, 16, x265_pixel_sa8d_16x16_ ## cpu>; \
p.sa8d_inter[LUMA_64x16] = cmp<64, 16, 16, 16, x265_pixel_sa8d_16x16_ ## cpu>; \
p.sa8d_inter[LUMA_16x64] = cmp<16, 64, 16, 16, x265_pixel_sa8d_16x16_ ## cpu>
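
This hunk replaces the generic cmp<> tiling wrapper for LUMA_64x48 with the dedicated assembly routine added below. For reference, a minimal C++ sketch of what the replaced template computes, assuming x265's pixelcmp_t-style signature (sa8d_16x16 here is a hypothetical stand-in for x265_pixel_sa8d_16x16_<cpu> and is not part of this patch):

    #include <cstdint>

    typedef uint8_t pixel;   // 8-bit build assumed

    // Hypothetical stand-in for the optimized 16x16 SA8D primitive.
    int sa8d_16x16(const pixel* pix1, intptr_t stride1,
                   const pixel* pix2, intptr_t stride2);

    // Equivalent of cmp<64, 48, 16, 16, sa8d_16x16>: sum the 16x16 SA8D
    // over the 4x3 grid of tiles covering the 64x48 partition.
    int sa8d_64x48_ref(const pixel* pix1, intptr_t stride1,
                       const pixel* pix2, intptr_t stride2)
    {
        int sum = 0;
        for (int y = 0; y < 48; y += 16)
            for (int x = 0; x < 64; x += 16)
                sum += sa8d_16x16(pix1 + y * stride1 + x, stride1,
                                  pix2 + y * stride2 + x, stride2);
        return sum;
    }

The assembly versions below produce the same totals but visit the twelve tiles with incremental pointer updates instead of recomputing every tile address.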
diff -r db1151bb4974 -r 4821c41544b3 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Nov 20 18:36:04 2013 -0600
+++ b/source/common/x86/pixel-a.asm Thu Nov 21 12:29:32 2013 +0530
@@ -3778,6 +3778,105 @@
movd eax, m12
RET
+cglobal pixel_sa8d_64x48, 4,8,12
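+ ; 64-bit path: the 4x3 grid of 16x16 tiles is visited in a serpentine
+ ; order (left-to-right, down, right-to-left, ...), so only small pointer
+ ; adjustments are needed between SA8D_16x16 calls; m12 accumulates the
+ ; rounded per-16x16 sums and is returned at the end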
+ FIX_STRIDES r1, r3
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ pxor m12, m12
+%if vertical == 0
+ mova m7, [hmul_8p]
+%endif
+ SA8D_16x16
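+ ; SA8D_16x16 leaves r0/r2 advanced by 8 lines; rewind them and step
+ ; 16 bytes right to the next 16x16 tile in this row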
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ add r2, 16
+ add r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ add r2, 16
+ add r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ add r2, 16
+ add r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
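+ ; first tile row done: drop down 8 more lines (SA8D_16x16 already
+ ; advanced 8) to reach the second tile row, then walk it right-to-left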
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ sub r2, 16
+ sub r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ sub r2, 16
+ sub r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ sub r2, 16
+ sub r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ add r2, 16
+ add r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ add r2, 16
+ add r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ lea r4, [8*r1]
+ lea r5, [8*r3]
+ sub r2, r4
+ sub r0, r5
+ add r2, 16
+ add r0, 16
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SA8D_16x16
+ movd eax, m12
+ RET
+
cglobal pixel_sa8d_64x64, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
@@ -4690,6 +4789,284 @@
mov esp, r6
RET
+cglobal pixel_sa8d_64x48, 4,7,8
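+ ; 32-bit path: with too few GPRs to keep tile cursors, the incoming
+ ; pix1/pix2 arguments are reloaded from the caller's stack frame
+ ; ([r6+20] and [r6+28]) whenever a new 8-pixel-wide column is started;
+ ; [esp+36] keeps the rounded running total between AVG_16x16 invocations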
+ FIX_STRIDES r1, r3
+ mov r6, esp
+ and esp, ~15
+ sub esp, 64
+
+ lea r4, [r1 + 2*r1]
+ lea r5, [r3 + 2*r3]
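+ ; each pixel_sa8d_8x8_internal2 call covers one 8x8 block and leaves
+ ; r0/r2 advanced by 8 lines, so two back-to-back calls cover an 8x16
+ ; column whose halves SA8D_INTER folds together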
+ call pixel_sa8d_8x8_internal2
+ mova [rsp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ paddusw m0, [esp+48]
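+ ; first full 16x16 block: reduce to a scalar, apply the (sum + 1) >> 1
+ ; rounding, and seed the running total in [esp+36]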
+ HADDUW m0, m1
+ movd r4d, m0
+ add r4d, 1
+ shr r4d, 1
+ mov dword [esp+36], r4d
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 16*SIZEOF_PIXEL
+ add r2, 16*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 24*SIZEOF_PIXEL
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 32*SIZEOF_PIXEL
+ add r2, 32*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 40*SIZEOF_PIXEL
+ add r2, 40*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 48*SIZEOF_PIXEL
+ add r2, 48*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 56*SIZEOF_PIXEL
+ add r2, 56*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
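+ ; first 16x16 row complete: advance the saved base pointers by 16
+ ; lines and write them back for the next row of blocks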
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ lea r0, [r0 + r1*8]
+ lea r2, [r2 + r3*8]
+ lea r0, [r0 + r1*8]
+ lea r2, [r2 + r3*8]
+ mov [r6+20], r0
+ mov [r6+28], r2
+
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 16*SIZEOF_PIXEL
+ add r2, 16*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 24*SIZEOF_PIXEL
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 32*SIZEOF_PIXEL
+ add r2, 32*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 40*SIZEOF_PIXEL
+ add r2, 40*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 48*SIZEOF_PIXEL
+ add r2, 48*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 56*SIZEOF_PIXEL
+ add r2, 56*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ lea r0, [r0 + r1*8]
+ lea r2, [r2 + r3*8]
+ lea r0, [r0 + r1*8]
+ lea r2, [r2 + r3*8]
+ mov [r6+20], r0
+ mov [r6+28], r2
+
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 16*SIZEOF_PIXEL
+ add r2, 16*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 24*SIZEOF_PIXEL
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 32*SIZEOF_PIXEL
+ add r2, 32*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 40*SIZEOF_PIXEL
+ add r2, 40*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
+ AVG_16x16
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 48*SIZEOF_PIXEL
+ add r2, 48*SIZEOF_PIXEL
+ lea r4, [r1 + 2*r1]
+ call pixel_sa8d_8x8_internal2
+ mova [esp+48], m0
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+48], m0
+
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 56*SIZEOF_PIXEL
+ add r2, 56*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal2
+ SA8D_INTER
+ mova [esp+64-mmsize], m0
+ call pixel_sa8d_8x8_internal2
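+ ; last block: fold in the partial sum, reduce and round, then add the
+ ; running total from [esp+36] to form the return value in eax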
+ paddusw m0, [esp+48]
+ HADDUW m0, m1
+ movd r4d, m0
+ add r4d, 1
+ shr r4d, 1
+ add r4d, dword [esp+36]
+ mov eax, r4d
+ mov esp, r6
+ RET
+
cglobal pixel_sa8d_64x64, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp