[x265] [PATCH] asm: Fix for transpose_32x32 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Thu Nov 21 10:03:27 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385024427 -19800
#      Thu Nov 21 14:30:27 2013 +0530
# Node ID e4e6d522248ece211ace5eb35f6cd9b1f6ca078c
# Parent  db1151bb4974f1288745ba39dfd6e1838113feb7
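asm: Fix for transpose_32x32 routine

Replace the TRANSPOSE_16x16 macro with a callable transpose16_internal
routine. transpose_32x32 now loads 32 (the value previously passed as
the macro argument) into r6 once and calls transpose16_internal four
times instead of expanding the macro inline. Because r6 no longer holds
a saved copy of r1, the routine rewinds r1 by 16 * r2 and adds 8 before
the third 8x8 block rather than reloading it from r6.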

diff -r db1151bb4974 -r e4e6d522248e source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Nov 20 18:36:04 2013 -0600
+++ b/source/common/x86/pixel-a.asm	Thu Nov 21 14:30:27 2013 +0530
@@ -8459,18 +8459,22 @@
 
     RET
 
-%macro TRANSPOSE_16x16 1
-    TRANSPOSE_8x8 %1
+cglobal transpose16_internal
+    TRANSPOSE_8x8 r6
     lea    r1,    [r1 + 2 * r2]
     lea    r0,    [r5 + 8]
-    TRANSPOSE_8x8 %1
-    lea    r1,    [r6 + 8]
-    lea    r0,    [r5 + 8 * %1]
-    TRANSPOSE_8x8 %1
+    TRANSPOSE_8x8 r6
     lea    r1,    [r1 + 2 * r2]
-    lea    r0,    [r5 + 8 * %1 + 8]
-    TRANSPOSE_8x8 %1
-%endmacro
+    neg    r2
+    lea    r1,    [r1 + r2 * 8]
+    lea    r1,    [r1 + r2 * 8 + 8]
+    neg    r2
+    lea    r0,    [r5 + 8 * r6]
+    TRANSPOSE_8x8 r6
+    lea    r1,    [r1 + 2 * r2]
+    lea    r0,    [r5 + 8 * r6 + 8]
+    TRANSPOSE_8x8 r6
+    ret
 
 ;-----------------------------------------------------------------
 ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
@@ -8481,23 +8485,20 @@
     mov    r3,    r0
     mov    r4,    r1
     mov    r5,    r0
-    mov    r6,    r1
-    TRANSPOSE_16x16 32
+    mov    r6,    32
+    call   transpose16_internal
     lea    r1,    [r1 - 8 + 2 * r2]
     lea    r0,    [r3 + 16]
     mov    r5,    r0
-    mov    r6,    r1
-    TRANSPOSE_16x16 32
+    call   transpose16_internal
     lea    r1,    [r4 + 16]
     lea    r0,    [r3 + 16 * 32]
     mov    r5,    r0
-    mov    r6,    r1
-    TRANSPOSE_16x16 32
+    call   transpose16_internal
     lea    r1,    [r1 - 8 + 2 * r2]
     lea    r0,    [r3 + 16 * 32 + 16]
     mov    r5,    r0
-    mov    r6,    r1
-    TRANSPOSE_16x16 32
+    call   transpose16_internal
 
     RET
 


More information about the x265-devel mailing list