[x265] [PATCH] blockcopy_sp_2x8, optimized asm code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Nov 8 17:44:35 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1383929065 -19800
# Node ID 15cc9030934b809150ad4c3e007f737456ea78e6
# Parent  12f0499974995e8beb0fb9b9019ef8d799aa4fb9
blockcopy_sp_2x8, optimized asm code

diff -r 12f049997499 -r 15cc9030934b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Nov 08 21:58:58 2013 +0530
+++ b/source/common/x86/blockcopy8.asm	Fri Nov 08 22:14:25 2013 +0530
@@ -836,61 +836,53 @@
 ; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_sp_2x8, 4, 7, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_2x8, 4, 5, 8, dest, destStride, src, srcStride ; only r0-r4 and m0-m7 are referenced below
 
 add        r3,      r3
 
-mova       m0,      [tab_Vm]
+movd       m0,      [r2]                ; src row 0: one 32-bit load = two int16_t samples (r3 was doubled above)
+movd       m1,      [r2 + r3]           ; src row 1
+movd       m2,      [r2 + 2 * r3]       ; src row 2
+lea        r2,      [r2 + 2 * r3]       ; r2 -> src row 2
+movd       m3,      [r2 + r3]           ; src row 3
+movd       m4,      [r2 + 2 * r3]       ; src row 4
+lea        r2,      [r2 + 2 * r3]       ; r2 -> src row 4
+movd       m5,      [r2 + r3]           ; src row 5
+movd       m6,      [r2 + 2 * r3]       ; src row 6
+lea        r2,      [r2 + 2 * r3]       ; r2 -> src row 6
+movd       m7,      [r2 + r3]           ; src row 7
 
-movd       m1,      [r2]
-movd       m2,      [r2 + r3]
-movd       m3,      [r2 + 2 * r3]
-lea        r4,      [r2 + 2 * r3]
-movd       m4,      [r4 + r3]
-movd       m5,      [r4 + 2 * r3]
-lea        r4,      [r4 + 2 * r3]
-movd       m6,      [r4 + r3]
-movd       m7,      [r4 + 2 * r3]
-lea        r5,      [r4 + 2 * r3]
+packuswb   m0,            m1            ; words -> unsigned-saturated bytes: row 0 in bytes 0-1, row 1 in bytes 8-9
+packuswb   m2,            m3            ; row 2 in bytes 0-1, row 3 in bytes 8-9
+packuswb   m4,            m5            ; row 4 in bytes 0-1, row 5 in bytes 8-9
+packuswb   m6,            m7            ; row 6 in bytes 0-1, row 7 in bytes 8-9
 
-pshufb     m1,      m0
-pshufb     m2,      m0
-pshufb     m3,      m0
-pshufb     m4,      m0
-pshufb     m5,      m0
-pshufb     m6,      m0
-pshufb     m7,      m0
+pextrw     r4,            m0,          0 ; word 0 = the two packed pixels of row 0
+mov        [r0],          r4w           ; 16-bit store to dest row 0
 
-pextrw     r6,            m1,          0
-mov        [r0],          r6w
+pextrw     r4,            m0,          4 ; word 4 (bytes 8-9) = the two packed pixels of row 1
+mov        [r0 + r1],     r4w           ; dest row 1
 
-pextrw     r6,            m2,          0
-mov        [r0 + r1],     r6w
+pextrw     r4,            m2,          0 ; row 2 pixels
+mov        [r0 + 2 * r1], r4w           ; dest row 2
 
-pextrw     r6,            m3,          0
-mov        [r0 + 2 * r1], r6w
+lea        r0,            [r0 + 2 * r1] ; r0 -> dest row 2
+pextrw     r4,            m2,          4 ; row 3 pixels
+mov        [r0 + r1],     r4w           ; dest row 3
 
-lea        r4,            [r0 + 2 * r1]
-pextrw     r6,            m4,          0
-mov        [r4 + r1],     r6w
+pextrw     r4,            m4,          0 ; row 4 pixels
+mov        [r0 + 2 * r1], r4w           ; dest row 4
 
-pextrw     r6,            m5,          0
-mov        [r4 + 2 * r1], r6w
+lea        r0,            [r0 + 2 * r1] ; r0 -> dest row 4
+pextrw     r4,            m4,          4 ; row 5 pixels
+mov        [r0 + r1],     r4w           ; dest row 5
 
+pextrw     r4,            m6,          0 ; row 6 pixels
+mov        [r0 + 2 * r1], r4w           ; dest row 6
 
-lea        r4,            [r4 + 2 * r1]
-pextrw     r6,            m6,          0
-mov        [r4 + r1],     r6w
-
-pextrw     r6,            m7,          0
-mov        [r4 + 2 * r1], r6w
-
-movd       m1,            [r5 + r3]
-pshufb     m1,            m0
-
-lea        r4,            [r4 + 2 * r1]
-pextrw     r6,            m1,          0
-mov        [r4 + r1],     r6w
+lea        r0,            [r0 + 2 * r1] ; r0 -> dest row 6
+pextrw     r4,            m6,          4 ; row 7 pixels
+mov        [r0 + r1],     r4w           ; dest row 7 (last of the 8 rows)
 
 RET
 


More information about the x265-devel mailing list