[x265] [PATCH] asm: code for transpose_64x64 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Thu Nov 21 10:14:27 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385025246 -19800
#      Thu Nov 21 14:44:06 2013 +0530
# Node ID 44d15f8ce9403cc8c8d97bffee355c1e24ad1271
# Parent  e4e6d522248ece211ace5eb35f6cd9b1f6ca078c
asm: code for transpose_64x64 routine

diff -r e4e6d522248e -r 44d15f8ce940 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 21 14:30:27 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 21 14:44:06 2013 +0530
@@ -550,6 +550,7 @@
         p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
         p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
         p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
+        p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r e4e6d522248e -r 44d15f8ce940 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Nov 21 14:30:27 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Nov 21 14:44:06 2013 +0530
@@ -8502,6 +8502,83 @@
 
     RET
 
+;-----------------------------------------------------------------
+; void transpose64(pixel *dest, pixel *src, intptr_t stride) -- built from 16 calls to transpose16_internal
+;-----------------------------------------------------------------
+INIT_XMM sse2
+cglobal transpose64, 3, 7, 8, dest, src, stride        ; r0 = dest, r1 = src, r2 = stride (x86inc argument order)
+
+    mov    r3,    r0                    ; r3 = dest base, kept to form every destination address below
+    mov    r4,    r1                    ; r4 = src base, kept to restart r1 for each group of four calls
+    mov    r5,    r0                    ; r5 = copy of the destination pointer; refreshed before every call
+    mov    r6,    64                    ; set once, never changed here; NOTE(review): presumably the dest row pitch read by the helper -- confirm
+    call   transpose16_internal         ; call 1 of 16; NOTE(review): helper body is not in this hunk -- presumably transposes a 16x16 tile from r1 to r0
+    lea    r1,    [r1 - 8 + 2 * r2]     ; adjust r1 relative to where the helper left it; NOTE(review): presumably the next 16 source rows -- confirm
+    lea    r0,    [r3 + 16]             ; destination = dest base + 16
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]     ; same source adjustment after each call within a group
+    lea    r0,    [r3 + 32]             ; destination = dest base + 32
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 48]             ; destination = dest base + 48
+    mov    r5,    r0
+    call   transpose16_internal
+
+    lea    r1,    [r4 + 16]             ; second group: restart the source at src base + 16
+    lea    r0,    [r3 + 16 * 64]        ; destination = dest base + 16*64
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 16 * 64 + 16]
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 16 * 64 + 32]
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 16 * 64 + 48]
+    mov    r5,    r0
+    call   transpose16_internal
+
+    lea    r1,    [r4 + 32]             ; third group: restart the source at src base + 32
+    lea    r0,    [r3 + 32 * 64]        ; destination = dest base + 32*64
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 32 * 64 + 16]
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 32 * 64 + 32]
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 32 * 64 + 48]
+    mov    r5,    r0
+    call   transpose16_internal
+
+    lea    r1,    [r4 + 48]             ; fourth group: restart the source at src base + 48
+    lea    r0,    [r3 + 48 * 64]        ; destination = dest base + 48*64
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 48 * 64 + 16]
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 48 * 64 + 32]
+    mov    r5,    r0
+    call   transpose16_internal
+    lea    r1,    [r1 - 8 + 2 * r2]
+    lea    r0,    [r3 + 48 * 64 + 48]
+    mov    r5,    r0
+    call   transpose16_internal         ; call 16 of 16
+
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
diff -r e4e6d522248e -r 44d15f8ce940 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Thu Nov 21 14:30:27 2013 +0530
+++ b/source/common/x86/pixel.h	Thu Nov 21 14:44:06 2013 +0530
@@ -371,5 +371,6 @@
 void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
 void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
+void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
 
 #endif // ifndef X265_I386_PIXEL_H


More information about the x265-devel mailing list