[x265] [PATCH] blockcopy_pp_2x16 SSE2 asm code: optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Feb 5 10:25:05 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1423128295 -19800
# Node ID 1265aafe5af1d66b8151ffd9bdc6fe595f7e6343
# Parent 6843cdeae82b7429eedee297c33b0eb6b49401a2
blockcopy_pp_2x16 SSE2 asm code: optimization
Unrolled the loop: reduced the number of LEA instructions and eliminated the branch instructions.
Performance improved: 310.94c -> 268.89c
diff -r 6843cdeae82b -r 1265aafe5af1 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Feb 05 14:46:54 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Feb 05 14:54:55 2015 +0530
@@ -93,16 +93,30 @@
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
- mov r6d, 16/2
-.loop:
- mov r4w, [r2]
- mov r5w, [r2 + r3]
- dec r6d
- lea r2, [r2 + r3 * 2]
- mov [r0], r4w
- mov [r0 + r1], r5w
- lea r0, [r0 + r1 * 2]
- jnz .loop
+ lea r5, [3 * r1] ; r5 = 3 * r1: offset of the 4th row relative to r0 (r1 is the step between stored rows)
+ lea r6, [3 * r3] ; r6 = 3 * r3: offset of the 4th row relative to r2 (r3 is the step between loaded rows)
+
+ mov r4w, [r2] ; rows 0-3: each row is one 16-bit load from the r2 side into r4w ...
+ mov [r0], r4w ; ... followed by one 16-bit store to the r0 side
+ mov r4w, [r2 + r3] ; row 1
+ mov [r0 + r1], r4w
+ mov r4w, [r2 + 2 * r3] ; row 2
+ mov [r0 + 2 * r1], r4w
+ mov r4w, [r2 + r6] ; row 3, using the precomputed 3 * r3
+ mov [r0 + r5], r4w ; row 3, using the precomputed 3 * r1
+
+%rep 3 ; three more groups of four rows: 4 + 3 * 4 = 16 rows, no loop counter or branch
+ lea r2, [r2 + 4 * r3] ; advance the load pointer by four rows
+ mov r4w, [r2] ; first row of this group
+ lea r0, [r0 + 4 * r1] ; advance the store pointer by four rows
+ mov [r0], r4w
+ mov r4w, [r2 + r3] ; second row of this group
+ mov [r0 + r1], r4w
+ mov r4w, [r2 + 2 * r3] ; third row of this group
+ mov [r0 + 2 * r1], r4w
+ mov r4w, [r2 + r6] ; fourth row of this group (r6 = 3 * r3)
+ mov [r0 + r5], r4w ; fourth row of this group (r5 = 3 * r1)
+%endrep
RET
More information about the x265-devel
mailing list