[x265] [PATCH Review only] asm: code for transpose_8x8 routine
chen
chenm003 at 163.com
Tue Nov 19 10:16:31 CET 2013
At 2013-11-19 14:23:41,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1384842189 -19800
># Tue Nov 19 11:53:09 2013 +0530
># Node ID 3a94cc365533bf7def255dc5b28e6a6a1d1bfa50
># Parent f6a050b79cfa400aa432f49ee8a4c2b9f20cf930
>asm: code for transpose_8x8 routine
>
>diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Nov 19 11:25:00 2013 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Nov 19 11:53:09 2013 +0530
>@@ -546,6 +546,7 @@
> p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
> p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
> p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
>+ p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
>diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel-a.asm
>--- a/source/common/x86/pixel-a.asm Tue Nov 19 11:25:00 2013 +0530
>+++ b/source/common/x86/pixel-a.asm Tue Nov 19 11:53:09 2013 +0530
>@@ -8359,3 +8359,45 @@
> movu [r0], m0
>
> RET
>+
>+;-----------------------------------------------------------------
>+; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal transpose8, 3, 3, 8, dest, src, stride
>+
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh m2, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m3, [r1 + r2]
>+ movh m4, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m5, [r1 + r2]
>+ movh m6, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m7, [r1 + r2]
>+
>+ punpcklbw m0, m1
>+ punpcklbw m2, m3
>+ punpcklbw m4, m5
>+ punpcklbw m6, m7
>+ movu m1, m0
For a register-to-register copy, mova is better than movu; better still, use the three-operand form "punpckhwd m1, m0, m2", which makes the separate copy unnecessary.
>+ punpcklwd m0, m2
>+ punpckhwd m1, m2
>+ movu m5, m4
>+ punpcklwd m4, m6
>+ punpckhwd m5, m6
>+ movu m2, m0
>+ punpckldq m0, m4
>+ punpckhdq m2, m4
>+ movu m3, m1
>+ punpckldq m1, m5
>+ punpckhdq m3, m5
>+
>+ movu [r0], m0
>+ movu [r0 + 16], m2
>+ movu [r0 + 32], m1
>+ movu [r0 + 48], m3
>+
>+ RET
>diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Tue Nov 19 11:25:00 2013 +0530
>+++ b/source/common/x86/pixel.h Tue Nov 19 11:53:09 2013 +0530
>@@ -366,5 +366,6 @@
> void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
>+void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
>
> #endif // ifndef X265_I386_PIXEL_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131119/a3f65be1/attachment.html>
More information about the x265-devel
mailing list