[x265] [PATCH Review only] asm: code for transpose_32x32 routine
chen
chenm003 at 163.com
Wed Nov 20 13:52:30 CET 2013
At 2013-11-20 20:21:41,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1384950080 -19800
># Wed Nov 20 17:51:20 2013 +0530
># Node ID 2ea432a36a03a7ee7e0f788181ff35ac8db20337
># Parent a059cd5c08133486fb0499aeb77ac34677759d9b
>asm: code for transpose_32x32 routine
>
>diff -r a059cd5c0813 -r 2ea432a36a03 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Wed Nov 20 15:13:48 2013 +0530
>+++ b/source/common/x86/asm-primitives.cpp Wed Nov 20 17:51:20 2013 +0530
>@@ -547,6 +547,7 @@
> p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
> p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
> p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
>+ p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
>diff -r a059cd5c0813 -r 2ea432a36a03 source/common/x86/pixel-a.asm
>--- a/source/common/x86/pixel-a.asm Wed Nov 20 15:13:48 2013 +0530
>+++ b/source/common/x86/pixel-a.asm Wed Nov 20 17:51:20 2013 +0530
>@@ -7079,6 +7079,48 @@
>
> RET
>
>+%macro TRANSPOSE_16x16 1
>+ TRANSPOSE_8x8 %1
>+ lea r1, [r1 + 2 * r2]
>+ lea r0, [r5 + 8]
>+ TRANSPOSE_8x8 %1
>+ lea r1, [r6 + 8]
>+ lea r0, [r5 + 8 * %1]
>+ TRANSPOSE_8x8 %1
>+ lea r1, [r1 + 2 * r2]
>+ lea r0, [r5 + 8 * %1 + 8]
>+ TRANSPOSE_8x8 %1
>+%endmacro
>+
>+;-----------------------------------------------------------------
>+; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal transpose32, 3, 7, 8, dest, src, stride
>+
>+ mov r3, r0
>+ mov r4, r1
>+ mov r5, r0
>+ mov r6, r1
>+ TRANSPOSE_16x16 32
>+ lea r1, [r1 - 8 + 2 * r2]
>+ lea r0, [r3 + 16]
>+ mov r5, r0
>+ mov r6, r1
>+ TRANSPOSE_16x16 32
>+ lea r1, [r4 + 16]
>+ lea r0, [r3 + 16 * 32]
>+ mov r5, r0
>+ mov r6, r1
>+ TRANSPOSE_16x16 32
>+ lea r1, [r1 - 8 + 2 * r2]
>+ lea r0, [r3 + 16 * 32 + 16]
>+ mov r5, r0
>+ mov r6, r1
>+ TRANSPOSE_16x16 32
>+
>+ RET
The code is correct, but every macro is fully expanded inline, so the object code is large and cache performance is lower.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131120/5b6b9eae/attachment-0001.html>
More information about the x265-devel
mailing list