[x265] [PATCH Review only] asm: code for transpose_8x8 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Nov 19 07:23:41 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384842189 -19800
#      Tue Nov 19 11:53:09 2013 +0530
# Node ID 3a94cc365533bf7def255dc5b28e6a6a1d1bfa50
# Parent  f6a050b79cfa400aa432f49ee8a4c2b9f20cf930
asm: code for transpose_8x8 routine

diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 19 11:25:00 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 19 11:53:09 2013 +0530
@@ -546,6 +546,7 @@
         p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
         p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
+        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Nov 19 11:25:00 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Nov 19 11:53:09 2013 +0530
@@ -8359,3 +8359,45 @@
     movu         [r0],    m0
 
     RET
+
+;-----------------------------------------------------------------
+; void transpose8(pixel *dest, pixel *src, intptr_t stride)
+;-----------------------------------------------------------------
+INIT_XMM sse2                                    ; SSE2 version: transposes an 8x8 block, treating each pixel as one byte
+cglobal transpose8, 3, 3, 8, dest, src, stride   ; r0 = dest, r1 = src, r2 = stride (added to src as a byte offset); uses m0-m7
+
+    movh         m0,    [r1]                     ; src row 0 (a0..a7) -> low half of m0
+    movh         m1,    [r1 + r2]                ; src row 1 (b0..b7)
+    movh         m2,    [r1 + 2 * r2]            ; src row 2 (c0..c7)
+    lea          r1,    [r1 + 2 * r2]            ; r1 now points at row 2
+    movh         m3,    [r1 + r2]                ; src row 3 (d0..d7)
+    movh         m4,    [r1 + 2 * r2]            ; src row 4 (e0..e7)
+    lea          r1,    [r1 + 2 * r2]            ; r1 now points at row 4
+    movh         m5,    [r1 + r2]                ; src row 5 (f0..f7)
+    movh         m6,    [r1 + 2 * r2]            ; src row 6 (g0..g7)
+    lea          r1,    [r1 + 2 * r2]            ; r1 now points at row 6
+    movh         m7,    [r1 + r2]                ; src row 7 (h0..h7)
+
+    punpcklbw    m0,    m1                       ; m0 = a0 b0 a1 b1 ... a7 b7 (byte interleave of rows 0,1)
+    punpcklbw    m2,    m3                       ; m2 = c0 d0 c1 d1 ... c7 d7
+    punpcklbw    m4,    m5                       ; m4 = e0 f0 e1 f1 ... e7 f7
+    punpcklbw    m6,    m7                       ; m6 = g0 h0 g1 h1 ... g7 h7
+    movu         m1,    m0                       ; keep a copy: the unpack below overwrites m0 (NOTE(review): reg-reg copy; mova is the usual spelling in x86inc code -- confirm)
+    punpcklwd    m0,    m2                       ; m0 = a0b0c0d0 a1b1c1d1 a2b2c2d2 a3b3c3d3
+    punpckhwd    m1,    m2                       ; m1 = a4b4c4d4 a5b5c5d5 a6b6c6d6 a7b7c7d7
+    movu         m5,    m4                       ; copy of rows 4,5 pairs before m4 is overwritten
+    punpcklwd    m4,    m6                       ; m4 = e0f0g0h0 e1f1g1h1 e2f2g2h2 e3f3g3h3
+    punpckhwd    m5,    m6                       ; m5 = e4f4g4h4 e5f5g5h5 e6f6g6h6 e7f7g7h7
+    movu         m2,    m0                       ; copy of columns 0-3 (top half) before m0 is overwritten
+    punpckldq    m0,    m4                       ; m0 = a0..h0 a1..h1  (src columns 0 and 1)
+    punpckhdq    m2,    m4                       ; m2 = a2..h2 a3..h3  (src columns 2 and 3)
+    movu         m3,    m1                       ; copy of columns 4-7 (top half) before m1 is overwritten
+    punpckldq    m1,    m5                       ; m1 = a4..h4 a5..h5  (src columns 4 and 5)
+    punpckhdq    m3,    m5                       ; m3 = a6..h6 a7..h7  (src columns 6 and 7)
+
+    movu         [r0],         m0                ; dest bytes  0-15: src columns 0,1 (unaligned store; dest is written contiguously, stride is not applied to it)
+    movu         [r0 + 16],    m2                ; dest bytes 16-31: src columns 2,3
+    movu         [r0 + 32],    m1                ; dest bytes 32-47: src columns 4,5
+    movu         [r0 + 48],    m3                ; dest bytes 48-63: src columns 6,7
+
+    RET
diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Nov 19 11:25:00 2013 +0530
+++ b/source/common/x86/pixel.h	Tue Nov 19 11:53:09 2013 +0530
@@ -366,5 +366,6 @@
 void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
+void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride); // 8x8 transpose; body is "cglobal transpose8" (INIT_XMM sse2) in pixel-a.asm
 
 #endif // ifndef X265_I386_PIXEL_H


More information about the x265-devel mailing list