[x265] [PATCH Review only] asm: code for transpose_16x16 routine

chen chenm003 at 163.com
Wed Nov 20 06:00:53 CET 2013


>+%macro transpose_8x8 0
The macro name should be uppercase (e.g. TRANSPOSE_8x8).

>+
>+    movh         m0,    [r1]
>+    movh         m1,    [r1 + r2]
>+    movh         m2,    [r1 + 2 * r2]
>+    lea          r1,    [r1 + 2 * r2]
>+    movh         m3,    [r1 + r2]
>+    movh         m4,    [r1 + 2 * r2]
>+    lea          r1,    [r1 + 2 * r2]
>+    movh         m5,    [r1 + r2]
>+    movh         m6,    [r1 + 2 * r2]
>+    lea          r1,    [r1 + 2 * r2]
>+    movh         m7,    [r1 + r2]
>+
>+    punpcklbw    m0,    m1
>+    punpcklbw    m2,    m3
>+    punpcklbw    m4,    m5
>+    punpcklbw    m6,    m7
>+
>+    punpckhwd    m1,    m0,    m2
>+    punpcklwd    m0,    m2
>+    punpckhwd    m5,    m4,    m6
>+    punpcklwd    m4,    m6
>+    punpckhdq    m2,    m0,    m4
>+    punpckldq    m0,    m4
>+    punpckhdq    m3,    m1,    m5
>+    punpckldq    m1,    m5
>+
>+    movlps         [r0],             m0
>+    movhps         [r0 + r3],        m0
>+    movlps         [r0 + 2 * r3],    m2
>+    lea            r0,               [r0 + 2 * r3]
>+    movhps         [r0 + r3],        m2
>+    movlps         [r0 + 2 * r3],    m1
>+    lea            r0,               [r0 + 2 * r3]
>+    movhps         [r0 + r3],        m1
>+    movlps         [r0 + 2 * r3],    m3
>+    lea            r0,               [r0 + 2 * r3]
>+    movhps         [r0 + r3],        m3
>+
>+%endmacro
This macro is correct, but it needs some modification; see below.
 

>+
>+;-----------------------------------------------------------------
>+; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal transpose16, 3, 5, 8, dest, src, stride
>+
>+    mov    r4,    r0
>+    mov    r5,    r1
Your cglobal line declares that you use only r0-r4 (5 GPRs), but here you write to r5.

>+    mov    r3,    16
When the stride is constant, it is better to inline it as an immediate instead of loading it into r3, so you will have to modify the 8x8 macro and the code below accordingly.

>+    transpose_8x8
>+    lea    r1,    [r1 + 2 * r2]
>+    lea    r0,    [r4 + 8]
>+    transpose_8x8
>+    lea    r1,    [r5 + 8]
>+    lea    r0,    [r4 + r3 * 8]
>+    transpose_8x8
>+    lea    r1,    [r1 + 2 * r2]
>+    lea    r0,    [r4 + r3 * 8 +8]
>+    transpose_8x8
>+
>+    RET
>diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Tue Nov 19 11:53:09 2013 +0530
>+++ b/source/common/x86/pixel.h Tue Nov 19 19:19:30 2013 +0530
>@@ -367,5 +367,6 @@
> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
> void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
>+void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
> 
> #endif // ifndef X265_I386_PIXEL_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131120/9f670214/attachment.html>


More information about the x265-devel mailing list