[x265] [PATCH] blockcopy_pp_8x12: sse2 asm code optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Feb 3 13:25:48 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1422966338 -19800
# Node ID 3fc854e9e1b07e490c1422635dffea7b62e911c9
# Parent bfc9a2d99e20568cb43d9fba0133735009793b00
blockcopy_pp_8x12: sse2 asm code optimization
improved, 235.05c -> 158.79c
diff -r bfc9a2d99e20 -r 3fc854e9e1b0 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Feb 03 17:14:55 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Feb 03 17:55:38 2015 +0530
@@ -351,17 +351,34 @@
; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_8x12, 4, 5, 2
- mov r4d, 12/2
-.loop:
- movh m0, [r2]
- movh m1, [r2 + r3]
- movh [r0], m0
- movh [r0 + r1], m1
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
+cglobal blockcopy_pp_8x12, 4, 6, 4
+    ; 4 args in r0-r3 (dst, dstStride, src, srcStride); 6 GPRs declared so that r4 and r5 are both covered
+    lea         r4, [3 * r3]            ; r4 = 3 * srcStride
+    lea         r5, [3 * r1]            ; r5 = 3 * dstStride
+    ; rows 0-3: load four source rows (movh transfers one 8-byte row via the low half of an xmm register)
+    movh        m0, [r2]
+    movh        m1, [r2 + r3]
+    movh        m2, [r2 + 2 * r3]
+    movh        m3, [r2 + r4]
+    ; rows 0-3: store to the destination
+    movh        [r0], m0
+    movh        [r0 + r1], m1
+    movh        [r0 + 2 * r1], m2
+    movh        [r0 + r5], m3
+    ; rows 4-7 and 8-11: each repetition advances src and dst by four rows, then copies four rows
+    %rep 2
+    lea         r2, [r2 + 4 * r3]       ; src += 4 * srcStride
+    movh        m0, [r2]
+    movh        m1, [r2 + r3]
+    movh        m2, [r2 + 2 * r3]
+    movh        m3, [r2 + r4]
+
+    lea         r0, [r0 + 4 * r1]       ; dst += 4 * dstStride
+    movh        [r0], m0
+    movh        [r0 + r1], m1
+    movh        [r0 + 2 * r1], m2
+    movh        [r0 + r5], m3
+    %endrep
+    RET
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list