[x265] [PATCH] blockcopy_pp_2x16 SSE2 asm code: optimization

praveen at multicorewareinc.com praveen at multicorewareinc.com
Thu Feb 5 10:25:05 CET 2015


# HG changeset patch
# User Praveen Tiwari
# Date 1423128295 -19800
# Node ID 1265aafe5af1d66b8151ffd9bdc6fe595f7e6343
# Parent  6843cdeae82b7429eedee297c33b0eb6b49401a2
blockcopy_pp_2x16 SSE2 asm code: optimization

Reduced the number of LEA instructions and eliminated the branch
instructions by fully unrolling the copy loop.
Performance improved: 310.94c -> 268.89c

diff -r 6843cdeae82b -r 1265aafe5af1 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Feb 05 14:46:54 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Feb 05 14:54:55 2015 +0530
@@ -93,16 +93,30 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x16, 4, 7, 0
-    mov     r6d,    16/2
-.loop:
-    mov     r4w,    [r2]
-    mov     r5w,    [r2 + r3]
-    dec     r6d
-    lea     r2,     [r2 + r3 * 2]
-    mov     [r0],       r4w
-    mov     [r0 + r1],  r5w
-    lea     r0,     [r0 + r1 * 2]
-    jnz     .loop
+    lea     r5,      [3 * r1]               ; r5 = 3 * r1, offset used with the r0 (store) pointer
+    lea     r6,      [3 * r3]               ; r6 = 3 * r3, offset used with the r2 (load) pointer
+
+    mov     r4w,           [r2]             ; copy #1: one 16-bit word from [r2] to [r0] via r4w
+    mov     [r0],          r4w              ; presumably one 2-pixel row of 8-bit pixels - TODO confirm pixel size
+    mov     r4w,           [r2 + r3]        ; copy #2: one r3 step from r2, one r1 step from r0
+    mov     [r0 + r1],     r4w
+    mov     r4w,           [r2 + 2 * r3]    ; copy #3: two steps
+    mov     [r0 + 2 * r1], r4w
+    mov     r4w,           [r2 + r6]        ; copy #4: three steps, using the precomputed r6 / r5
+    mov     [r0 + r5],     r4w
+
+%rep 3
+    lea     r2,            [r2 + 4 * r3]    ; advance load pointer by 4 * r3 (this body is assembled 3 times)
+    mov     r4w,           [r2]             ; same four-word copy pattern as above
+    lea     r0,            [r0 + 4 * r1]    ; advance store pointer by 4 * r1
+    mov     [r0],          r4w
+    mov     r4w,           [r2 + r3]
+    mov     [r0 + r1],     r4w
+    mov     r4w,           [r2 + 2 * r3]
+    mov     [r0 + 2 * r1], r4w
+    mov     r4w,           [r2 + r6]
+    mov     [r0 + r5],     r4w              ; after the last repetition: 4 + 3 * 4 = 16 words copied in total
+%endrep
     RET
 
 


More information about the x265-devel mailing list