[x265] [PATCH Review only] asm: code for transpose_16x16 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Nov 19 14:50:13 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384868970 -19800
#      Tue Nov 19 19:19:30 2013 +0530
# Node ID 435c48eb30e1789cd1271a35fe48fe7bef49ab56
# Parent  3a94cc365533bf7def255dc5b28e6a6a1d1bfa50
asm: code for transpose_16x16 routine

diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 19 11:53:09 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 19 19:19:30 2013 +0530
@@ -547,6 +547,7 @@
         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
         p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
         p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
+        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Nov 19 11:53:09 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Nov 19 19:19:30 2013 +0530
@@ -8401,3 +8401,68 @@
     movu         [r0 + 48],    m3
 
     RET
+
+%macro transpose_8x8 0
+    ; Transpose the 8x8 byte block at r1 (row stride r2) into r0 (row stride r3).
+    movh         m0,    [r1]                    ; source row 0
+    movh         m1,    [r1 + r2]               ; source row 1
+    movh         m2,    [r1 + 2 * r2]           ; source row 2
+    lea          r1,    [r1 + 2 * r2]           ; r1 -> source row 2
+    movh         m3,    [r1 + r2]               ; source row 3
+    movh         m4,    [r1 + 2 * r2]           ; source row 4
+    lea          r1,    [r1 + 2 * r2]           ; r1 -> source row 4
+    movh         m5,    [r1 + r2]               ; source row 5
+    movh         m6,    [r1 + 2 * r2]           ; source row 6
+    lea          r1,    [r1 + 2 * r2]           ; r1 -> source row 6
+    movh         m7,    [r1 + r2]               ; source row 7
+    ; Interleave bytes of adjacent rows so each word holds one column of a row pair.
+    punpcklbw    m0,    m1                      ; m0 = rows 0/1 interleaved
+    punpcklbw    m2,    m3                      ; m2 = rows 2/3 interleaved
+    punpcklbw    m4,    m5                      ; m4 = rows 4/5 interleaved
+    punpcklbw    m6,    m7                      ; m6 = rows 6/7 interleaved
+    ; Merge words, then dwords: each 8-byte half ends up holding one source column.
+    punpckhwd    m1,    m0,    m2               ; m1 = columns 4-7 of rows 0-3
+    punpcklwd    m0,    m2                      ; m0 = columns 0-3 of rows 0-3
+    punpckhwd    m5,    m4,    m6               ; m5 = columns 4-7 of rows 4-7
+    punpcklwd    m4,    m6                      ; m4 = columns 0-3 of rows 4-7
+    punpckhdq    m2,    m0,    m4               ; m2 = column 2 (low), column 3 (high)
+    punpckldq    m0,    m4                      ; m0 = column 0 (low), column 1 (high)
+    punpckhdq    m3,    m1,    m5               ; m3 = column 6 (low), column 7 (high)
+    punpckldq    m1,    m5                      ; m1 = column 4 (low), column 5 (high)
+    ; Store each source column as one 8-byte output row; r0 steps two rows at a time.
+    movlps         [r0],             m0         ; output row 0 = column 0
+    movhps         [r0 + r3],        m0         ; output row 1 = column 1
+    movlps         [r0 + 2 * r3],    m2         ; output row 2 = column 2
+    lea            r0,               [r0 + 2 * r3]  ; r0 -> output row 2
+    movhps         [r0 + r3],        m2         ; output row 3 = column 3
+    movlps         [r0 + 2 * r3],    m1         ; output row 4 = column 4
+    lea            r0,               [r0 + 2 * r3]  ; r0 -> output row 4
+    movhps         [r0 + r3],        m1         ; output row 5 = column 5
+    movlps         [r0 + 2 * r3],    m3         ; output row 6 = column 6
+    lea            r0,               [r0 + 2 * r3]  ; r0 -> output row 6
+    movhps         [r0 + r3],        m3         ; output row 7 = column 7
+    ; Clobbers m0-m7. On exit r0 and r1 each point 6 rows past their entry value.
+%endmacro
+
+
+;-----------------------------------------------------------------
+; void transpose16(pixel *dest, pixel *src, intptr_t stride)
+;-----------------------------------------------------------------
+INIT_XMM sse2
+cglobal transpose16, 3, 6, 8, dest, src, stride
+    ; r0-r5 are all used, so 6 GPRs are declared; x86inc then saves r5 where needed.
+    mov    r4,    r0                    ; r4 = dest base (the macro advances r0)
+    mov    r5,    r1                    ; r5 = src base (the macro advances r1)
+    mov    r3,    16                    ; dest row pitch in bytes, read by the macro
+    transpose_8x8
+    lea    r1,    [r1 + 2 * r2]         ; macro left r1 on src row 6 -> src row 8, col 0
+    lea    r0,    [r4 + 8]              ; dest row 0, col 8
+    transpose_8x8
+    lea    r1,    [r5 + 8]              ; src row 0, col 8
+    lea    r0,    [r4 + r3 * 8]         ; dest row 8, col 0
+    transpose_8x8
+    lea    r1,    [r1 + 2 * r2]         ; macro left r1 on src row 6 -> src row 8, col 8
+    lea    r0,    [r4 + r3 * 8 +8]      ; dest row 8, col 8
+    transpose_8x8
+    ; Four 8x8 quadrants done: the stride argument applies to src only, dest pitch is 16.
+    RET
diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Nov 19 11:53:09 2013 +0530
+++ b/source/common/x86/pixel.h	Tue Nov 19 19:19:30 2013 +0530
@@ -367,5 +367,6 @@
 void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
+void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
 
 #endif // ifndef X265_I386_PIXEL_H


More information about the x265-devel mailing list