[x265] [PATCH] asm: cleanup routines of transpose module for 4x4, 8x8 and 16x16
Murugan Vairavel <murugan at multicorewareinc.com>
Wed Nov 20 10:46:01 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384940628 -19800
# Wed Nov 20 15:13:48 2013 +0530
# Node ID a059cd5c08133486fb0499aeb77ac34677759d9b
# Parent 108ddc9e5c6b15e758ccbf08a0e923cbb7b28b5e
asm: cleanup routines of transpose module for 4x4, 8x8 and 16x16
diff -r 108ddc9e5c6b -r a059cd5c0813 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Nov 19 23:45:52 2013 -0600
+++ b/source/common/x86/pixel-a.asm Wed Nov 20 15:13:48 2013 +0530
@@ -6958,6 +6958,25 @@
RET
;-----------------------------------------------------------------
+; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
+;-----------------------------------------------------------------
+INIT_XMM sse2
+cglobal transpose4, 3, 3, 4, dest, src, stride
+
+ movd m0, [r1]
+ movd m1, [r1 + r2]
+ movd m2, [r1 + 2 * r2]
+ lea r1, [r1 + 2 * r2]
+ movd m3, [r1 + r2]
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklwd m0, m2
+ movu [r0], m0
+
+ RET
+
+;-----------------------------------------------------------------
; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
INIT_XMM sse2
@@ -6979,18 +6998,15 @@
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
- movu m1, m0
+
+ punpckhwd m1, m0, m2
punpcklwd m0, m2
- punpckhwd m1, m2
- movu m5, m4
+ punpckhwd m5, m4, m6
punpcklwd m4, m6
- punpckhwd m5, m6
- movu m2, m0
+ punpckhdq m2, m0, m4
punpckldq m0, m4
- punpckhdq m2, m4
- movu m3, m1
+ punpckhdq m3, m1, m5
punpckldq m1, m5
- punpckhdq m3, m5
movu [r0], m0
movu [r0 + 16], m2
@@ -6999,7 +7015,7 @@
RET
-%macro transpose_8x8 0
+%macro TRANSPOSE_8x8 1
movh m0, [r1]
movh m1, [r1 + r2]
@@ -7028,16 +7044,16 @@
punpckldq m1, m5
movlps [r0], m0
- movhps [r0 + r3], m0
- movlps [r0 + 2 * r3], m2
- lea r0, [r0 + 2 * r3]
- movhps [r0 + r3], m2
- movlps [r0 + 2 * r3], m1
- lea r0, [r0 + 2 * r3]
- movhps [r0 + r3], m1
- movlps [r0 + 2 * r3], m3
- lea r0, [r0 + 2 * r3]
- movhps [r0 + r3], m3
+ movhps [r0 + %1], m0
+ movlps [r0 + 2 * %1], m2
+ lea r0, [r0 + 2 * %1]
+ movhps [r0 + %1], m2
+ movlps [r0 + 2 * %1], m1
+ lea r0, [r0 + 2 * %1]
+ movhps [r0 + %1], m1
+ movlps [r0 + 2 * %1], m3
+ lea r0, [r0 + 2 * %1]
+ movhps [r0 + %1], m3
%endmacro
@@ -7048,19 +7064,18 @@
INIT_XMM sse2
cglobal transpose16, 3, 5, 8, dest, src, stride
- mov r4, r0
- mov r5, r1
- mov r3, 16
- transpose_8x8
+ mov r3, r0
+ mov r4, r1
+ TRANSPOSE_8x8 16
lea r1, [r1 + 2 * r2]
- lea r0, [r4 + 8]
- transpose_8x8
- lea r1, [r5 + 8]
- lea r0, [r4 + r3 * 8]
- transpose_8x8
+ lea r0, [r3 + 8]
+ TRANSPOSE_8x8 16
+ lea r1, [r4 + 8]
+ lea r0, [r3 + 8 * 16]
+ TRANSPOSE_8x8 16
lea r1, [r1 + 2 * r2]
- lea r0, [r4 + r3 * 8 +8]
- transpose_8x8
+ lea r0, [r3 + 8 * 16 + 8]
+ TRANSPOSE_8x8 16
RET
@@ -8329,25 +8344,3 @@
movu [r0 + 48], m4
RET
-
-;-----------------------------------------------------------------
-; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
-;-----------------------------------------------------------------
-INIT_XMM sse2
-cglobal transpose4, 3, 3, 4, dest, src, stride
-
- movd m0, [r1]
- movd m1, [r1 + r2]
- movd m2, [r1 + 2 * r2]
-
- lea r1, [r1 + 2 * r2]
-
- movd m3, [r1 + r2]
-
- punpcklbw m0, m1
- punpcklbw m2, m3
- punpcklwd m0, m2
-
- movu [r0], m0
-
-RET
More information about the x265-devel mailing list