[x265] [PATCH Review only] asm: code for transpose_8x8 routine

chen chenm003 at 163.com
Tue Nov 19 10:16:31 CET 2013


At 2013-11-19 14:23:41,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1384842189 -19800
>#      Tue Nov 19 11:53:09 2013 +0530
># Node ID 3a94cc365533bf7def255dc5b28e6a6a1d1bfa50
># Parent  f6a050b79cfa400aa432f49ee8a4c2b9f20cf930
>asm: code for transpose_8x8 routine
>
>diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Nov 19 11:25:00 2013 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Nov 19 11:53:09 2013 +0530
>@@ -546,6 +546,7 @@
>         p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
>         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
>         p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
>+        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
>     }
>     if (cpuMask & X265_CPU_SSSE3)
>     {
>diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel-a.asm
>--- a/source/common/x86/pixel-a.asm Tue Nov 19 11:25:00 2013 +0530
>+++ b/source/common/x86/pixel-a.asm Tue Nov 19 11:53:09 2013 +0530
>@@ -8359,3 +8359,45 @@
>     movu         [r0],    m0
> 
>     RET
>+
>+;-----------------------------------------------------------------
>+; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal transpose8, 3, 3, 8, dest, src, stride
>+
>+    movh         m0,    [r1]
>+    movh         m1,    [r1 + r2]
>+    movh         m2,    [r1 + 2 * r2]
>+    lea          r1,    [r1 + 2 * r2]
>+    movh         m3,    [r1 + r2]
>+    movh         m4,    [r1 + 2 * r2]
>+    lea          r1,    [r1 + 2 * r2]
>+    movh         m5,    [r1 + r2]
>+    movh         m6,    [r1 + 2 * r2]
>+    lea          r1,    [r1 + 2 * r2]
>+    movh         m7,    [r1 + r2]
>+
>+    punpcklbw    m0,    m1
>+    punpcklbw    m2,    m3
>+    punpcklbw    m4,    m5
>+    punpcklbw    m6,    m7
>+    movu         m1,    m0
For a register-to-register copy, mova is better than movu; of course, the best way is to use the three-operand form "punpckhwd m1, m0, m2", which makes the separate copy unnecessary.

>+    punpcklwd    m0,    m2
>+    punpckhwd    m1,    m2
>+    movu         m5,    m4
>+    punpcklwd    m4,    m6
>+    punpckhwd    m5,    m6
>+    movu         m2,    m0
>+    punpckldq    m0,    m4
>+    punpckhdq    m2,    m4
>+    movu         m3,    m1
>+    punpckldq    m1,    m5
>+    punpckhdq    m3,    m5
>+
>+    movu         [r0],         m0
>+    movu         [r0 + 16],    m2
>+    movu         [r0 + 32],    m1
>+    movu         [r0 + 48],    m3
>+
>+    RET
>diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Tue Nov 19 11:25:00 2013 +0530
>+++ b/source/common/x86/pixel.h Tue Nov 19 11:53:09 2013 +0530
>@@ -366,5 +366,6 @@
> void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
>+void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
> 
> #endif // ifndef X265_I386_PIXEL_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131119/a3f65be1/attachment.html>


More information about the x265-devel mailing list