[x265] [PATCH] blockcopy_pp_2x8 SSE2 asm code: optimize LEA instruction

praveen at multicorewareinc.com praveen at multicorewareinc.com
Thu Feb 5 10:17:03 CET 2015


# HG changeset patch
# User Praveen Tiwari
# Date 1423127814 -19800
# Node ID 6843cdeae82b7429eedee297c33b0eb6b49401a2
# Parent  cd4117a34a19a76d0462c9a644ecc728d8e1c0ee
blockcopy_pp_2x8 SSE2 asm code: optimize LEA instruction

diff -r cd4117a34a19 -r 6843cdeae82b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Feb 05 12:27:23 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Feb 05 14:46:54 2015 +0530
@@ -63,37 +63,29 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x8, 4, 7, 0
-    mov     r4w,     [r2]
-    mov     r5w,     [r2 + r3]
-    mov     r6w,     [r2 + 2 * r3]
-
-    mov     [r0],            r4w
-    mov     [r0 + r1],       r5w
-    mov     [r0 + 2 * r1],   r6w
-
-    lea     r0,             [r0 + 2 * r1]
-    lea     r2,             [r2 + 2 * r3]
-
-    mov     r4w,             [r2 + r3]
-    mov     r5w,             [r2 + 2 * r3]
-
-    mov     [r0 + r1],       r4w
-    mov     [r0 + 2 * r1],   r5w
-
-    lea     r0,              [r0 + 2 * r1]
-    lea     r2,              [r2 + 2 * r3]
-
-    mov     r4w,             [r2 + r3]
-    mov     r5w,             [r2 + 2 * r3]
-
-    mov     [r0 + r1],       r4w
-    mov     [r0 + 2 * r1],   r5w
-
-    lea     r0,              [r0 + 2 * r1]
-    lea     r2,              [r2 + 2 * r3]
-
-    mov     r4w,             [r2 + r3]
-    mov     [r0 + r1],       r4w
+    lea     r5,      [3 * r1]           ; r5 = 3 * r1: offset of the 4th row from the store base r0
+    lea     r6,      [3 * r3]           ; r6 = 3 * r3: offset of the 4th row from the load base r2
+
+    mov     r4w,           [r2]         ; row 0: 16-bit load; presumably r2/r3 = src/srcStride, r0/r1 = dst/dstStride (confirm against the C prototype)
+    mov     [r0],          r4w          ; row 0: 16-bit store
+    mov     r4w,           [r2 + r3]    ; row 1 (load offset r3)
+    mov     [r0 + r1],     r4w          ; row 1 (store offset r1)
+    mov     r4w,           [r2 + 2 * r3] ; row 2 (load offset 2 * r3)
+    mov     [r0 + 2 * r1], r4w          ; row 2 (store offset 2 * r1)
+    mov     r4w,           [r2 + r6]    ; row 3 (load offset 3 * r3, precomputed in r6)
+    mov     [r0 + r5],     r4w          ; row 3 (store offset 3 * r1, precomputed in r5)
+
+    lea     r2,            [r2 + 4 * r3] ; advance the load base by four rows; r6 stays valid as a relative offset
+    mov     r4w,           [r2]         ; row 4: load from the new base
+    lea     r0,            [r0 + 4 * r1] ; advance the store base by four rows; r5 stays valid as a relative offset
+    mov     [r0],          r4w          ; row 4: store to the new base
+
+    mov     r4w,           [r2 + r3]    ; row 5
+    mov     [r0 + r1],     r4w          ; row 5
+    mov     r4w,           [r2 + 2 * r3] ; row 6
+    mov     [r0 + 2 * r1], r4w          ; row 6
+    mov     r4w,           [r2 + r6]    ; row 7 (reuses the 3 * r3 offset)
+    mov     [r0 + r5],     r4w          ; row 7 (reuses the 3 * r1 offset)
     RET
 
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list