[x265] [PATCH] asm: 10bpp code for transpose 32x32
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Wed Dec 4 08:03:27 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386140597 -19800
# Wed Dec 04 12:33:17 2013 +0530
# Node ID ee1221fac033355129128ba5f847910e3ed49047
# Parent 8b73b22d90e1a0d70495e8b5f009a9c4fc37f258
asm: 10bpp code for transpose 32x32
diff -r 8b73b22d90e1 -r ee1221fac033 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 12:06:19 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:33:17 2013 +0530
@@ -529,6 +529,7 @@
p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
+ p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
PIXEL_AVG(sse2);
diff -r 8b73b22d90e1 -r ee1221fac033 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Dec 04 12:06:19 2013 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Dec 04 12:33:17 2013 +0530
@@ -1039,8 +1039,76 @@
; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal transpose32, 3, 7, 4, dest, src, stride ; HIGH_BIT_DEPTH variant: 16 calls to transpose8_internal (defined elsewhere), laid out as 4 groups of 4
+ add r2, r2 ; r2 = 2 * stride; presumably pixel stride -> byte stride for 16-bit pixels (TODO confirm)
+ mov r3, r0 ; r3 = dest of the current call; refreshed from r0 before every call below
+ mov r4, r1 ; r4 = src base, reused when starting groups 2..4
+ mov r5, 64 ; constant 64, same unit as the 8 * 64 dest steps below; presumably dest row pitch in bytes consumed by the helper (TODO confirm)
+ mov r6, r0 ; r6 = dest base; all later dest addresses are r6 + offset
+ call transpose8_internal ; group 1, call 1: dest = r6, src = r4
+ lea r1, [r1 - 8 + 2 * r2] ; step src for the next call; relies on where transpose8_internal leaves r1 (helper not visible here)
+ lea r0, [r6 + 16] ; dest = base + 16 bytes
+ mov r3, r0
+ call transpose8_internal ; group 1, call 2
+ lea r1, [r1 - 8 + 2 * r2] ; same src step as above
+ lea r0, [r6 + 32] ; dest = base + 32 bytes
+ mov r3, r0
+ call transpose8_internal ; group 1, call 3
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 48] ; dest = base + 48 bytes
+ mov r3, r0
+ call transpose8_internal ; group 1, call 4
+ lea r1, [r4 + 16] ; group 2: restart src at base + 16 bytes
+ lea r0, [r6 + 8 * 64] ; dest = base + 8 * 64 bytes
+ mov r3, r0
+ call transpose8_internal ; group 2, call 1
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 8 * 64 + 16]
+ mov r3, r0
+ call transpose8_internal ; group 2, call 2
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 8 * 64 + 32]
+ mov r3, r0
+ call transpose8_internal ; group 2, call 3
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 8 * 64 + 48]
+ mov r3, r0
+ call transpose8_internal ; group 2, call 4
+ lea r1, [r4 + 32] ; group 3: restart src at base + 32 bytes
+ lea r0, [r6 + 16 * 64] ; dest = base + 16 * 64 bytes
+ mov r3, r0
+ call transpose8_internal ; group 3, call 1
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 16 * 64 + 16]
+ mov r3, r0
+ call transpose8_internal ; group 3, call 2
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 16 * 64 + 32]
+ mov r3, r0
+ call transpose8_internal ; group 3, call 3
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 16 * 64 + 48]
+ mov r3, r0
+ call transpose8_internal ; group 3, call 4
+ lea r1, [r4 + 48] ; group 4: restart src at base + 48 bytes
+ lea r0, [r6 + 24 * 64] ; dest = base + 24 * 64 bytes
+ mov r3, r0
+ call transpose8_internal ; group 4, call 1
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 24 * 64 + 16]
+ mov r3, r0
+ call transpose8_internal ; group 4, call 2
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 24 * 64 + 32]
+ mov r3, r0
+ call transpose8_internal ; group 4, call 3
+ lea r1, [r1 - 8 + 2 * r2]
+ lea r0, [r6 + 24 * 64 + 48]
+ mov r3, r0
+ call transpose8_internal ; group 4, call 4 (last call of this branch)
+%else
cglobal transpose32, 3, 7, 8, dest, src, stride
-
mov r3, r0
mov r4, r1
mov r5, r0
@@ -1058,7 +1126,7 @@
lea r0, [r3 + 16 * 32 + 16]
mov r5, r0
call transpose16_internal
-
+%endif
RET
;-----------------------------------------------------------------
More information about the x265-devel
mailing list