[x265] [PATCH Review only] asm: code for transpose_16x16 routine
chen
chenm003 at 163.com
Wed Nov 20 06:00:53 CET 2013
>+%macro transpose_8x8 0
The macro name should be in upper case.
>+
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh m2, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m3, [r1 + r2]
>+ movh m4, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m5, [r1 + r2]
>+ movh m6, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m7, [r1 + r2]
>+
>+ punpcklbw m0, m1
>+ punpcklbw m2, m3
>+ punpcklbw m4, m5
>+ punpcklbw m6, m7
>+
>+ punpckhwd m1, m0, m2
>+ punpcklwd m0, m2
>+ punpckhwd m5, m4, m6
>+ punpcklwd m4, m6
>+ punpckhdq m2, m0, m4
>+ punpckldq m0, m4
>+ punpckhdq m3, m1, m5
>+ punpckldq m1, m5
>+
>+ movlps [r0], m0
>+ movhps [r0 + r3], m0
>+ movlps [r0 + 2 * r3], m2
>+ lea r0, [r0 + 2 * r3]
>+ movhps [r0 + r3], m2
>+ movlps [r0 + 2 * r3], m1
>+ lea r0, [r0 + 2 * r3]
>+ movhps [r0 + r3], m1
>+ movlps [r0 + 2 * r3], m3
>+ lea r0, [r0 + 2 * r3]
>+ movhps [r0 + r3], m3
>+
>+%endmacro
This macro is correct, but it needs some modification; see below.
>+
>+;-----------------------------------------------------------------
>+; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal transpose16, 3, 5, 8, dest, src, stride
>+
>+ mov r4, r0
>+ mov r5, r1
You declared that you use only r0-r4 (cglobal requests 5 registers), but this line uses r5.
>+ mov r3, 16
When the stride is a constant, it is better to inline it than to load it into r3, so you have to modify the 8x8 macro and the code below.
>+ transpose_8x8
>+ lea r1, [r1 + 2 * r2]
>+ lea r0, [r4 + 8]
>+ transpose_8x8
>+ lea r1, [r5 + 8]
>+ lea r0, [r4 + r3 * 8]
>+ transpose_8x8
>+ lea r1, [r1 + 2 * r2]
>+ lea r0, [r4 + r3 * 8 +8]
>+ transpose_8x8
>+
>+ RET
>diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Tue Nov 19 11:53:09 2013 +0530
>+++ b/source/common/x86/pixel.h Tue Nov 19 19:19:30 2013 +0530
>@@ -367,5 +367,6 @@
> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
> void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
>+void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
>
> #endif // ifndef X265_I386_PIXEL_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131120/9f670214/attachment.html>
More information about the x265-devel
mailing list