[x265] [PATCH] blockcopy_pp_6x8 sse2 asm code optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Feb 3 10:00:54 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1422954042 -19800
# Node ID d212ce9fa3705b9e7d4d23f14412bd28fe3bbfde
# Parent 059892f65db3e4c70017241ea847717e11be0124
blockcopy_pp_6x8 sse2 asm code optimization
improved, 248.67c -> 212.56c
diff -r 059892f65db3 -r d212ce9fa370 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Feb 03 11:58:18 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Feb 03 14:30:42 2015 +0530
@@ -224,65 +224,51 @@
; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_6x8, 4, 7, 8
-
- movd m0, [r2]
- movd m1, [r2 + r3]
- movd m2, [r2 + 2 * r3]
- lea r5, [r2 + 2 * r3]
- movd m3, [r5 + r3]
-
- movd m4, [r5 + 2 * r3]
- lea r5, [r5 + 2 * r3]
- movd m5, [r5 + r3]
- movd m6, [r5 + 2 * r3]
- lea r5, [r5 + 2 * r3]
- movd m7, [r5 + r3]
-
- movd [r0], m0
- movd [r0 + r1], m1
- movd [r0 + 2 * r1], m2
- lea r6, [r0 + 2 * r1]
- movd [r6 + r1], m3
-
- movd [r6 + 2 * r1], m4
- lea r6, [r6 + 2 * r1]
- movd [r6 + r1], m5
- movd [r6 + 2 * r1], m6
- lea r6, [r6 + 2 * r1]
- movd [r6 + r1], m7
-
- mov r4w, [r2 + 4]
- mov r5w, [r2 + r3 + 4]
- mov r6w, [r2 + 2 * r3 + 4]
-
- mov [r0 + 4], r4w
- mov [r0 + r1 + 4], r5w
- mov [r0 + 2 * r1 + 4], r6w
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
-
- mov r4w, [r2 + r3 + 4]
- mov r5w, [r2 + 2 * r3 + 4]
-
- mov [r0 + r1 + 4], r4w
- mov [r0 + 2 * r1 + 4], r5w
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
-
- mov r4w, [r2 + r3 + 4]
- mov r5w, [r2 + 2 * r3 + 4]
-
- mov [r0 + r1 + 4], r4w
- mov [r0 + 2 * r1 + 4], r5w
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
-
- mov r4w, [r2 + r3 + 4]
- mov [r0 + r1 + 4], r4w
+cglobal blockcopy_pp_6x8, 4, 7, 3 ; x86inc cglobal: 4 args (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 3 XMM regs
+    ; Rows 0-2, load: each row is 6 bytes = 4 bytes via movd (m#) + 2 bytes via a 16-bit mov (r#w).
+ movd m0, [r2] ; row 0, bytes 0-3
+ mov r4w, [r2 + 4] ; row 0, bytes 4-5
+ movd m1, [r2 + r3] ; row 1, bytes 0-3
+ mov r5w, [r2 + r3 + 4] ; row 1, bytes 4-5
+ movd m2, [r2 + 2 * r3] ; row 2, bytes 0-3
+ mov r6w, [r2 + 2 * r3 + 4] ; row 2, bytes 4-5
+    ; Rows 0-2, store: same offsets on the destination side.
+ movd [r0], m0 ; row 0, bytes 0-3
+ mov [r0 + 4], r4w ; row 0, bytes 4-5
+ movd [r0 + r1], m1 ; row 1, bytes 0-3
+ mov [r0 + r1 + 4], r5w ; row 1, bytes 4-5
+ movd [r0 + 2 * r1], m2 ; row 2, bytes 0-3
+ mov [r0 + 2 * r1 + 4], r6w ; row 2, bytes 4-5
+    ; Rows 3-5, load: m0-m2 and r4w-r6w are free again and get reused; src advances two strides per lea.
+ lea r2, [r2 + 2 * r3] ; r2 -> source row 2
+ movd m0, [r2 + r3] ; row 3, bytes 0-3
+ mov r4w, [r2 + r3 + 4] ; row 3, bytes 4-5
+ movd m1, [r2 + 2 * r3] ; row 4, bytes 0-3
+ mov r5w, [r2 + 2 * r3 + 4] ; row 4, bytes 4-5
+ lea r2, [r2 + 2 * r3] ; r2 -> source row 4
+ movd m2, [r2 + r3] ; row 5, bytes 0-3
+ mov r6w, [r2 + r3 + 4] ; row 5, bytes 4-5
+    ; Rows 3-5, store: dst advances in the same two-stride steps as src did.
+ lea r0, [r0 + 2 * r1] ; r0 -> destination row 2
+ movd [r0 + r1], m0 ; row 3, bytes 0-3
+ mov [r0 + r1 + 4], r4w ; row 3, bytes 4-5
+ movd [r0 + 2 * r1], m1 ; row 4, bytes 0-3
+ mov [r0 + 2 * r1 + 4], r5w ; row 4, bytes 4-5
+ lea r0, [r0 + 2 * r1] ; r0 -> destination row 4
+ movd [r0 + r1], m2 ; row 5, bytes 0-3
+ mov [r0 + r1 + 4], r6w ; row 5, bytes 4-5
+    ; Rows 6-7, load: only two rows remain, so m2 / r6w are not needed here.
+ lea r2, [r2 + 2 * r3] ; r2 -> source row 6
+ movd m0, [r2] ; row 6, bytes 0-3
+ mov r4w, [r2 + 4] ; row 6, bytes 4-5
+ movd m1, [r2 + r3] ; row 7, bytes 0-3
+ mov r5w, [r2 + r3 + 4] ; row 7, bytes 4-5
+    ; Rows 6-7, store.
+ lea r0, [r0 + 2 * r1] ; r0 -> destination row 6
+ movd [r0], m0 ; row 6, bytes 0-3
+ mov [r0 + 4], r4w ; row 6, bytes 4-5
+ movd [r0 + r1], m1 ; row 7, bytes 0-3
+ mov [r0 + r1 + 4], r5w ; row 7, bytes 4-5
RET
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list