[x265] [PATCH] blockcopy_pp_12x16: SSE2 asm code optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Feb 5 13:08:49 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1423138113 -19800
# Node ID 7173414b198e8f0d23ae3680ac739387978755e9
# Parent b10384b8c8a9a60fe37f4e5f3506673dcf00c004
blockcopy_pp_12x16: SSE2 asm code optimization
Performance improved: 381.50c -> 368.40c (cycle counts, before -> after)
diff -r b10384b8c8a9 -r 7173414b198e source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Feb 05 17:29:03 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Feb 05 17:38:33 2015 +0530
@@ -545,14 +545,12 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+; void blockcopy_pp_12x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 4
- mov r4d, %2/4
-
-.loop:
+cglobal blockcopy_pp_12x16, 4, 4, 4
+
+%rep 4
movh m0, [r2]
movd m1, [r2 + 8]
movh m2, [r2 + r3]
@@ -575,14 +573,10 @@
movh [r0 + r1], m2
movd [r0 + r1 + 8], m3
- dec r4d
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
- jnz .loop
+%endrep
RET
-%endmacro
-
-BLOCKCOPY_PP_W12_H4 12, 16
;-----------------------------------------------------------------------------
; void blockcopy_pp_12x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
More information about the x265-devel
mailing list