[x265] [PATCH] blockcopy_pp_12x16: SSE2 asm code optimization

Praveen Tiwari (praveen at multicorewareinc.com)
Thu Feb 5 13:08:49 CET 2015


# HG changeset patch
# User Praveen Tiwari
# Date 1423138113 -19800
# Node ID 7173414b198e8f0d23ae3680ac739387978755e9
# Parent  b10384b8c8a9a60fe37f4e5f3506673dcf00c004
blockcopy_pp_12x16: SSE2 asm code optimization

Performance improved by unrolling the 4-iteration loop with %rep 4 (drops the r4d loop counter): 381.50c -> 368.40c

diff -r b10384b8c8a9 -r 7173414b198e source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Feb 05 17:29:03 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Feb 05 17:38:33 2015 +0530
@@ -545,14 +545,12 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+; void blockcopy_pp_12x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_PP_W12_H4 2
 INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 4
-    mov         r4d,       %2/4
-
-.loop:
+cglobal blockcopy_pp_12x16, 4, 4, 4
+
+%rep 4
     movh    m0,     [r2]
     movd    m1,     [r2 + 8]
     movh    m2,     [r2 + r3]
@@ -575,14 +573,10 @@
     movh    [r0 + r1],        m2
     movd    [r0 + r1 + 8],    m3
 
-    dec     r4d
     lea     r0,               [r0 + 2 * r1]
     lea     r2,               [r2 + 2 * r3]
-    jnz     .loop
+%endrep
     RET
-%endmacro
-
-BLOCKCOPY_PP_W12_H4 12, 16
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_12x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)


More information about the x265-devel mailing list