[x265] [PATCH] blockcopy_pp_8x64: sse2 asm code optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Feb 3 14:07:25 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1422968834 -19800
# Node ID 6aba648bfada606d14f20e0a7cdb667d043069ae
# Parent a7dff1040961c2c17254c2e2bb0bf5b7857c8187
blockcopy_pp_8x64: sse2 asm code optimization
Performance improved: 800.38c -> 752.18c
diff -r a7dff1040961 -r 6aba648bfada source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Feb 03 18:26:22 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Feb 03 18:37:14 2015 +0530
@@ -482,48 +482,38 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_PP_W8_H8 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6
- mov r4d, %2/8
-
-.loop:
- movh m0, [r2]
- movh m1, [r2 + r3]
- lea r2, [r2 + 2 * r3]
- movh m2, [r2]
- movh m3, [r2 + r3]
- lea r2, [r2 + 2 * r3]
- movh m4, [r2]
- movh m5, [r2 + r3]
-
- movh [r0], m0
- movh [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movh [r0], m2
- movh [r0 + r1], m3
- lea r0, [r0 + 2 * r1]
- movh [r0], m4
- movh [r0 + r1], m5
-
- lea r2, [r2 + 2 * r3]
- movh m4, [r2]
- movh m5, [r2 + r3]
- lea r0, [r0 + 2 * r1]
- movh [r0], m4
- movh [r0 + r1], m5
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-RET
-%endmacro
-
-
-BLOCKCOPY_PP_W8_H8 8, 64
+cglobal blockcopy_pp_8x64, 4, 6, 4    ; presumably x86inc cglobal: 4 args, 6 GPRs (r0-r5), 4 vector regs (m0-m3) -- TODO confirm
+
+ lea r4, [3 * r3]    ; r4 = 3 * r3, the offset of the 4th row relative to the source pointer r2
+ lea r5, [3 * r1]    ; r5 = 3 * r1, the offset of the 4th row relative to the destination pointer r0
+
+ movh m0, [r2]    ; first group: load 4 consecutive source rows (movh presumably moves the low 8 bytes -- confirm in x86inc)
+ movh m1, [r2 + r3]    ; row 1 = r2 + 1 * r3
+ movh m2, [r2 + 2 * r3]    ; row 2 = r2 + 2 * r3
+ movh m3, [r2 + r4]    ; row 3 = r2 + 3 * r3 (via precomputed r4)
+
+ movh [r0], m0    ; first group: store the 4 rows to the destination, row pitch r1
+ movh [r0 + r1], m1    ; row 1 = r0 + 1 * r1
+ movh [r0 + 2 * r1], m2    ; row 2 = r0 + 2 * r1
+ movh [r0 + r5], m3    ; row 3 = r0 + 3 * r1 (via precomputed r5)
+
+ %rep 15    ; 15 more unrolled groups of 4 rows: 4 + 15 * 4 = 64 rows in total, no loop counter or branch
+ lea r2, [r2 + 4 * r3]    ; advance the source pointer by 4 rows
+ movh m0, [r2]    ; load the next 4 source rows
+ movh m1, [r2 + r3]    ; row 1 of this group
+ movh m2, [r2 + 2 * r3]    ; row 2 of this group
+ movh m3, [r2 + r4]    ; row 3 of this group
+
+ lea r0, [r0 + 4 * r1]    ; advance the destination pointer by 4 rows
+ movh [r0], m0    ; store the 4 rows just loaded
+ movh [r0 + r1], m1    ; row 1 of this group
+ movh [r0 + 2 * r1], m2    ; row 2 of this group
+ movh [r0 + r5], m3    ; row 3 of this group
+ %endrep
+ RET    ; RET is presumably the x86inc epilogue/return macro paired with cglobal -- confirm
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
More information about the x265-devel mailing list