[x265] [PATCH] asm: Optimisations in blockcopy_sp asm routines (2x4, 2x8, 6x8)

Nabajit Deka (nabajit at multicorewareinc.com)
Tue Feb 11 10:51:09 CET 2014


# HG changeset patch
# User Nabajit Deka
# Date 1392112254 -19800
#      Tue Feb 11 15:20:54 2014 +0530
# Node ID e6e9310bc545a84fd30533fc7739912c55179d17
# Parent  07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)

diff -r 07b5d6b82f5f -r e6e9310bc545 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Mon Feb 10 15:05:04 2014 -0600
+++ b/source/common/x86/blockcopy8.asm	Tue Feb 11 15:20:54 2014 +0530
@@ -1246,31 +1246,27 @@
 ; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_sp_2x4, 4, 5, 4, dest, destStride, src, srcStride
-
-add        r3,     r3
-
-movd       m0,     [r2]
-movd       m1,     [r2 + r3]
-movd       m2,     [r2 + 2 * r3]
-lea        r2,     [r2 + 2 * r3]
-movd       m3,     [r2 + r3]
-
-packuswb   m0,            m1
-packuswb   m2,            m3
-
-pextrw     r4,            m0,          0
-mov        [r0],          r4w
-
-pextrw     r4,            m0,          4
-mov        [r0 + r1],     r4w
-
-pextrw     r4,            m2,          0
+cglobal blockcopy_sp_2x4, 4, 5, 2
+
+add        r3, r3
+
+;Row 0-1
+movd       m0, [r2]
+movd       m1, [r2 + r3]
+packuswb   m0, m1
+movd       r4d, m0
+mov        [r0], r4w
+pextrw     [r0 + r1], m0, 4
+
+;Row 2-3
+movd       m0, [r2 + 2 * r3]
+lea        r2, [r2 + 2 * r3]
+movd       m1, [r2 + r3]
+packuswb   m0, m1
+movd       r4d, m0
 mov        [r0 + 2 * r1], r4w
-
-lea        r0,            [r0 + 2 * r1]
-pextrw     r4,            m2,          4
-mov        [r0 + r1],     r4w
+lea        r0, [r0 + 2 * r1]
+pextrw     [r0 + r1], m0, 4
 
 RET
 
@@ -1279,53 +1275,47 @@
 ; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_sp_2x8, 4, 5, 8, dest, destStride, src, srcStride
-
-add        r3,      r3
-
-movd       m0,      [r2]
-movd       m1,      [r2 + r3]
-movd       m2,      [r2 + 2 * r3]
-lea        r2,      [r2 + 2 * r3]
-movd       m3,      [r2 + r3]
-movd       m4,      [r2 + 2 * r3]
-lea        r2,      [r2 + 2 * r3]
-movd       m5,      [r2 + r3]
-movd       m6,      [r2 + 2 * r3]
-lea        r2,      [r2 + 2 * r3]
-movd       m7,      [r2 + r3]
-
-packuswb   m0,            m1
-packuswb   m2,            m3
-packuswb   m4,            m5
-packuswb   m6,            m7
-
-pextrw     r4,            m0,          0
-mov        [r0],          r4w
-
-pextrw     r4,            m0,          4
-mov        [r0 + r1],     r4w
-
-pextrw     r4,            m2,          0
+cglobal blockcopy_sp_2x8, 4, 5, 2
+
+add        r3, r3
+
+;Row 0-1
+movd       m0, [r2]
+movd       m1, [r2 + r3]
+packuswb   m0, m1
+movd       r4d, m0
+mov        [r0], r4w
+pextrw     [r0 + r1], m0, 4
+
+;Row 2-3
+movd       m0, [r2 + 2 * r3]
+lea        r2, [r2 + 2 * r3]
+movd       m1, [r2 + r3]
+packuswb   m0, m1
+movd       r4d, m0
 mov        [r0 + 2 * r1], r4w
-
-lea        r0,            [r0 + 2 * r1]
-pextrw     r4,            m2,          4
-mov        [r0 + r1],     r4w
-
-pextrw     r4,            m4,          0
+lea        r0, [r0 + 2 * r1]
+pextrw     [r0 + r1], m0, 4
+
+;Row 4-5
+movd       m0, [r2 + 2 * r3]
+lea        r2, [r2 + 2 * r3]
+movd       m1, [r2 + r3]
+packuswb   m0, m1
+movd       r4d, m0
 mov        [r0 + 2 * r1], r4w
-
-lea        r0,            [r0 + 2 * r1]
-pextrw     r4,            m4,          4
-mov        [r0 + r1],     r4w
-
-pextrw     r4,            m6,          0
+lea        r0, [r0 + 2 * r1]
+pextrw     [r0 + r1], m0, 4
+
+;Row 6-7
+movd       m0, [r2 + 2 * r3]
+lea        r2, [r2 + 2 * r3]
+movd       m1, [r2 + r3]
+packuswb   m0, m1
+movd       r4d, m0
 mov        [r0 + 2 * r1], r4w
-
-lea        r0,            [r0 + 2 * r1]
-pextrw     r4,            m6,          4
-mov        [r0 + r1],     r4w
+lea        r0, [r0 + 2 * r1]
+pextrw     [r0 + r1], m0, 4
 
 RET
 
@@ -1477,40 +1467,65 @@
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_SP_W6_H4 2
 INIT_XMM sse4
-cglobal blockcopy_sp_6x8, 4, 6, 2, dest, destStride, src, srcStride
-
-mov            r4d,           %2/2
-
-add            r3,            r3
-
-.loop
-     movu       m0,           [r2]
-     movu       m1,           [r2 + r3]
-
-
-     packuswb   m0,           m1
-
-     movd      [r0],          m0
-     pextrw    r5,            m0,    2
-     mov       [r0 + 4],      r5w
-
-     pextrw    r5,            m0,    6
-     pshufd     m0,           m0,    2
-     movd      [r0 + r1],     m0
-     mov       [r0 + r1 + 4], r5w
-
-     lea        r0,           [r0 + 2 * r1]
-     lea        r2,           [r2 + 2 * r3]
-
-     dec        r4d
-     jnz        .loop
-
-RET
-%endmacro
-
-BLOCKCOPY_SP_W6_H4 6, 8
+cglobal blockcopy_sp_6x8, 4, 4, 2
+
+    add       r3, r3
+
+    movu      m0, [r2]
+    movu      m1, [r2 + r3]
+    packuswb  m0, m1
+
+    movd      [r0], m0
+    pextrw    [r0 + 4], m0, 2
+
+    movhlps   m0, m0
+    movd      [r0 + r1], m0
+    pextrw    [r0 + r1 + 4], m0, 2
+
+    lea       r0, [r0 + 2 * r1]
+    lea       r2, [r2 + 2 * r3]
+
+    movu      m0, [r2]
+    movu      m1, [r2 + r3]
+    packuswb  m0, m1
+
+    movd      [r0], m0
+    pextrw    [r0 + 4], m0, 2
+
+    movhlps   m0, m0
+    movd      [r0 + r1], m0
+    pextrw    [r0 + r1 + 4], m0, 2
+
+    lea       r0, [r0 + 2 * r1]
+    lea       r2, [r2 + 2 * r3]
+
+    movu      m0, [r2]
+    movu      m1, [r2 + r3]
+    packuswb  m0, m1
+
+    movd      [r0], m0
+    pextrw    [r0 + 4], m0, 2
+
+    movhlps   m0, m0
+    movd      [r0 + r1], m0
+    pextrw    [r0 + r1 + 4], m0, 2
+
+    lea       r0, [r0 + 2 * r1]
+    lea       r2, [r2 + 2 * r3]
+
+    movu      m0, [r2]
+    movu      m1, [r2 + r3]
+    packuswb  m0, m1
+
+    movd      [r0], m0
+    pextrw    [r0 + 4], m0, 2
+
+    movhlps   m0, m0
+    movd      [r0 + r1], m0
+    pextrw    [r0 + r1 + 4], m0, 2
+
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)


More information about the x265-devel mailing list