[x265] [PATCH] blockcopy_sp_2x4, optimized asm code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Nov 8 17:29:09 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1383928138 -19800
# Node ID 12f0499974995e8beb0fb9b9019ef8d799aa4fb9
# Parent  1f96f7e693f551be88259cc5f8dc156ae1690cd0
blockcopy_sp_2x4, optimized asm code

diff -r 1f96f7e693f5 -r 12f049997499 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Nov 08 21:50:23 2013 +0530
+++ b/source/common/x86/blockcopy8.asm	Fri Nov 08 21:58:58 2013 +0530
@@ -803,35 +803,31 @@
 ; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_sp_2x4, 4, 6, 5, dest, destStride, src, srcStride
+cglobal blockcopy_sp_2x4, 4, 5, 4, dest, destStride, src, srcStride ; new body touches only r0-r4 and m0-m3
 
 add        r3,     r3
 
-mova       m0,     [tab_Vm]
+movd       m0,     [r2]                 ; row 0: 32-bit load = two int16 samples
+movd       m1,     [r2 + r3]            ; row 1 (r3 was doubled above to step over int16 elements)
+movd       m2,     [r2 + 2 * r3]        ; row 2
+lea        r2,     [r2 + 2 * r3]        ; advance src two rows so row 3 is addressable as [r2 + r3]
+movd       m3,     [r2 + r3]            ; row 3
 
-movd       m1,     [r2]
-movd       m2,     [r2 + r3]
-movd       m3,     [r2 + 2 * r3]
-lea        r4,     [r2 + 2 * r3]
-movd       m4,     [r4 + r3]
+packuswb   m0,            m1            ; words -> bytes: row 0 lands in bytes 0-1, row 1 in bytes 8-9
+packuswb   m2,            m3            ; rows 2/3 likewise. NOTE(review): packuswb saturates to 0..255; the removed pshufb/tab_Vm path presumably truncated -- confirm inputs are in range
 
-pshufb     m1,     m0
-pshufb     m2,     m0
-pshufb     m3,     m0
-pshufb     m4,     m0
+pextrw     r4,            m0,          0 ; word 0 = the two packed pixels of row 0
+mov        [r0],          r4w           ; 2-byte store to dest row 0
 
-pextrw     r5,            m1,          0
-mov        [r0],          r5w
+pextrw     r4,            m0,          4 ; word 4 (bytes 8-9) = the two packed pixels of row 1
+mov        [r0 + r1],     r4w           ; dest row 1
 
-pextrw     r5,            m2,          0
-mov        [r0 + r1],     r5w
+pextrw     r4,            m2,          0 ; row 2 pixels
+mov        [r0 + 2 * r1], r4w           ; dest row 2
 
-pextrw     r5,            m3,          0
-mov        [r0 + 2 * r1], r5w
-
-lea        r4,            [r0 + 2 * r1]
-pextrw     r5,            m4,          0
-mov        [r4 + r1],     r5w
+lea        r0,            [r0 + 2 * r1] ; advance dest two rows in place instead of using a scratch pointer
+pextrw     r4,            m2,          4 ; row 3 pixels
+mov        [r0 + r1],     r4w           ; dest row 3
 
 RET
 


More information about the x265-devel mailing list