[x265] [PATCH] blockcopy_pp_8x64: sse2 asm code optimization

praveen at multicorewareinc.com praveen at multicorewareinc.com
Tue Feb 3 14:07:25 CET 2015


# HG changeset patch
# User Praveen Tiwari
# Date 1422968834 -19800
# Node ID 6aba648bfada606d14f20e0a7cdb667d043069ae
# Parent  a7dff1040961c2c17254c2e2bb0bf5b7857c8187
blockcopy_pp_8x64: sse2 asm code optimization

Performance improved: 800.38c -> 752.18c

diff -r a7dff1040961 -r 6aba648bfada source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Feb 03 18:26:22 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Feb 03 18:37:14 2015 +0530
@@ -482,48 +482,38 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_PP_W8_H8 2
 INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6
-    mov         r4d,       %2/8
-
-.loop:
-     movh    m0,     [r2]
-     movh    m1,     [r2 + r3]
-     lea     r2,     [r2 + 2 * r3]
-     movh    m2,     [r2]
-     movh    m3,     [r2 + r3]
-     lea     r2,     [r2 + 2 * r3]
-     movh    m4,     [r2]
-     movh    m5,     [r2 + r3]
-
-     movh    [r0],         m0
-     movh    [r0 + r1],    m1
-     lea     r0,           [r0 + 2 * r1]
-     movh    [r0],         m2
-     movh    [r0 + r1],    m3
-     lea     r0,           [r0 + 2 * r1]
-     movh    [r0],         m4
-     movh    [r0 + r1],    m5
-
-     lea     r2,           [r2 + 2 * r3]
-     movh    m4,           [r2]
-     movh    m5,           [r2 + r3]
-     lea     r0,           [r0 + 2 * r1]
-     movh    [r0],         m4
-     movh    [r0 + r1],    m5
-
-     dec     r4d
-     lea     r0,           [r0 + 2 * r1]
-     lea     r2,           [r2 + 2 * r3]
-     jnz    .loop
-RET
-%endmacro
-
-
-BLOCKCOPY_PP_W8_H8 8, 64
+cglobal blockcopy_pp_8x64, 4, 6, 4     ; x86inc cglobal: 4 args, 6 GPRs, 4 XMM regs; r0=dst, r1=dstStride, r2=src, r3=srcStride
+
+    lea      r4, [3 * r3]               ; r4 = 3 * srcStride (offset of 4th row in a source group)
+    lea      r5, [3 * r1]               ; r5 = 3 * dstStride (offset of 4th row in a dest group)
+
+    movh     m0, [r2]                   ; load source rows 0..3 (movh: low half of the XMM register)
+    movh     m1, [r2 + r3]
+    movh     m2, [r2 + 2 * r3]
+    movh     m3, [r2 + r4]
+
+    movh     [r0],          m0          ; store rows 0..3 to the destination
+    movh     [r0 + r1],     m1
+    movh     [r0 + 2 * r1], m2
+    movh     [r0 + r5],     m3
+
+    %rep 15                             ; 15 more unrolled 4-row groups: 4 + 15 * 4 = 64 rows, no loop counter or branch
+    lea      r2, [r2 + 4 * r3]          ; advance src by 4 rows
+    movh     m0, [r2]
+    movh     m1, [r2 + r3]
+    movh     m2, [r2 + 2 * r3]
+    movh     m3, [r2 + r4]
+
+    lea      r0,            [r0 + 4 * r1] ; advance dst by 4 rows
+    movh     [r0],          m0
+    movh     [r0 + r1],     m1
+    movh     [r0 + 2 * r1], m2
+    movh     [r0 + r5],     m3
+    %endrep
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)


More information about the x265-devel mailing list