[x265] [PATCH] asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)

Nabajit Deka nabajit at multicorewareinc.com
Tue Feb 11 11:50:37 CET 2014


Yes, it gives a small improvement.


On Tue, Feb 11, 2014 at 4:06 PM, chen <chenm003 at 163.com> wrote:

> Similar code; it only reduces the number of MOV instructions.
>
> At 2014-02-11 17:51:09,nabajit at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Nabajit Deka
> ># Date 1392112254 -19800
> >#      Tue Feb 11 15:20:54 2014 +0530
> ># Node ID e6e9310bc545a84fd30533fc7739912c55179d17
> ># Parent  07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
> >asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)
> >
> >diff -r 07b5d6b82f5f -r e6e9310bc545 source/common/x86/blockcopy8.asm
> >--- a/source/common/x86/blockcopy8.asm	Mon Feb 10 15:05:04 2014 -0600
> >+++ b/source/common/x86/blockcopy8.asm	Tue Feb 11 15:20:54 2014 +0530
> >@@ -1246,31 +1246,27 @@
> > ; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> >-cglobal blockcopy_sp_2x4, 4, 5, 4, dest, destStride, src, srcStride
> >-
> >-add        r3,     r3
> >-
> >-movd       m0,     [r2]
> >-movd       m1,     [r2 + r3]
> >-movd       m2,     [r2 + 2 * r3]
> >-lea        r2,     [r2 + 2 * r3]
> >-movd       m3,     [r2 + r3]
> >-
> >-packuswb   m0,            m1
> >-packuswb   m2,            m3
> >-
> >-pextrw     r4,            m0,          0
> >-mov        [r0],          r4w
> >-
> >-pextrw     r4,            m0,          4
> >-mov        [r0 + r1],     r4w
> >-
> >-pextrw     r4,            m2,          0
> >+cglobal blockcopy_sp_2x4, 4, 5, 2
> >+
> >+add        r3, r3
> >+
> >+;Row 0-1
> >+movd       m0, [r2]
> >+movd       m1, [r2 + r3]
> >+packuswb   m0, m1
> >+movd       r4d, m0
> >+mov        [r0], r4w
> >+pextrw     [r0 + r1], m0, 4
> >+
> >+;Row 2-3
> >+movd       m0, [r2 + 2 * r3]
> >+lea        r2, [r2 + 2 * r3]
> >+movd       m1, [r2 + r3]
> >+packuswb   m0, m1
> >+movd       r4d, m0
> > mov        [r0 + 2 * r1], r4w
> >-
> >-lea        r0,            [r0 + 2 * r1]
> >-pextrw     r4,            m2,          4
> >-mov        [r0 + r1],     r4w
> >+lea        r0, [r0 + 2 * r1]
> >+pextrw     [r0 + r1], m0, 4
> >
> > RET
> >
> >@@ -1279,53 +1275,47 @@
> > ; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> >-cglobal blockcopy_sp_2x8, 4, 5, 8, dest, destStride, src, srcStride
> >-
> >-add        r3,      r3
> >-
> >-movd       m0,      [r2]
> >-movd       m1,      [r2 + r3]
> >-movd       m2,      [r2 + 2 * r3]
> >-lea        r2,      [r2 + 2 * r3]
> >-movd       m3,      [r2 + r3]
> >-movd       m4,      [r2 + 2 * r3]
> >-lea        r2,      [r2 + 2 * r3]
> >-movd       m5,      [r2 + r3]
> >-movd       m6,      [r2 + 2 * r3]
> >-lea        r2,      [r2 + 2 * r3]
> >-movd       m7,      [r2 + r3]
> >-
> >-packuswb   m0,            m1
> >-packuswb   m2,            m3
> >-packuswb   m4,            m5
> >-packuswb   m6,            m7
> >-
> >-pextrw     r4,            m0,          0
> >-mov        [r0],          r4w
> >-
> >-pextrw     r4,            m0,          4
> >-mov        [r0 + r1],     r4w
> >-
> >-pextrw     r4,            m2,          0
> >+cglobal blockcopy_sp_2x8, 4, 5, 2
> >+
> >+add        r3, r3
> >+
> >+;Row 0-1
> >+movd       m0, [r2]
> >+movd       m1, [r2 + r3]
> >+packuswb   m0, m1
> >+movd       r4d, m0
> >+mov        [r0], r4w
> >+pextrw     [r0 + r1], m0, 4
> >+
> >+;Row 2-3
> >+movd       m0, [r2 + 2 * r3]
> >+lea        r2, [r2 + 2 * r3]
> >+movd       m1, [r2 + r3]
> >+packuswb   m0, m1
> >+movd       r4d, m0
> > mov        [r0 + 2 * r1], r4w
> >-
> >-lea        r0,            [r0 + 2 * r1]
> >-pextrw     r4,            m2,          4
> >-mov        [r0 + r1],     r4w
> >-
> >-pextrw     r4,            m4,          0
> >+lea        r0, [r0 + 2 * r1]
> >+pextrw     [r0 + r1], m0, 4
> >+
> >+;Row 4-5
> >+movd       m0, [r2 + 2 * r3]
> >+lea        r2, [r2 + 2 * r3]
> >+movd       m1, [r2 + r3]
> >+packuswb   m0, m1
> >+movd       r4d, m0
> > mov        [r0 + 2 * r1], r4w
> >-
> >-lea        r0,            [r0 + 2 * r1]
> >-pextrw     r4,            m4,          4
> >-mov        [r0 + r1],     r4w
> >-
> >-pextrw     r4,            m6,          0
> >+lea        r0, [r0 + 2 * r1]
> >+pextrw     [r0 + r1], m0, 4
> >+
> >+;Row 6-7
> >+movd       m0, [r2 + 2 * r3]
> >+lea        r2, [r2 + 2 * r3]
> >+movd       m1, [r2 + r3]
> >+packuswb   m0, m1
> >+movd       r4d, m0
> > mov        [r0 + 2 * r1], r4w
> >-
> >-lea        r0,            [r0 + 2 * r1]
> >-pextrw     r4,            m6,          4
> >-mov        [r0 + r1],     r4w
> >+lea        r0, [r0 + 2 * r1]
> >+pextrw     [r0 + r1], m0, 4
> >
> > RET
> >
> >@@ -1477,40 +1467,65 @@
> > ;-----------------------------------------------------------------------------
> > ; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> > ;-----------------------------------------------------------------------------
> >-%macro BLOCKCOPY_SP_W6_H4 2
> > INIT_XMM sse4
> >-cglobal blockcopy_sp_6x8, 4, 6, 2, dest, destStride, src, srcStride
> >-
> >-mov            r4d,           %2/2
> >-
> >-add            r3,            r3
> >-
> >-.loop
> >-     movu       m0,           [r2]
> >-     movu       m1,           [r2 + r3]
> >-
> >-
> >-     packuswb   m0,           m1
> >-
> >-     movd      [r0],          m0
> >-     pextrw    r5,            m0,    2
> >-     mov       [r0 + 4],      r5w
> >-
> >-     pextrw    r5,            m0,    6
> >-     pshufd     m0,           m0,    2
> >-     movd      [r0 + r1],     m0
> >-     mov       [r0 + r1 + 4], r5w
> >-
> >-     lea        r0,           [r0 + 2 * r1]
> >-     lea        r2,           [r2 + 2 * r3]
> >-
> >-     dec        r4d
> >-     jnz        .loop
> >-
> >-RET
> >-%endmacro
> >-
> >-BLOCKCOPY_SP_W6_H4 6, 8
> >+cglobal blockcopy_sp_6x8, 4, 4, 2
> >+
> >+    add       r3, r3
> >+
> >+    movu      m0, [r2]
> >+    movu      m1, [r2 + r3]
> >+    packuswb  m0, m1
> >+
> >+    movd      [r0], m0
> >+    pextrw    [r0 + 4], m0, 2
> >+
> >+    movhlps   m0, m0
> >+    movd      [r0 + r1], m0
> >+    pextrw    [r0 + r1 + 4], m0, 2
> >+
> >+    lea       r0, [r0 + 2 * r1]
> >+    lea       r2, [r2 + 2 * r3]
> >+
> >+    movu      m0, [r2]
> >+    movu      m1, [r2 + r3]
> >+    packuswb  m0, m1
> >+
> >+    movd      [r0], m0
> >+    pextrw    [r0 + 4], m0, 2
> >+
> >+    movhlps   m0, m0
> >+    movd      [r0 + r1], m0
> >+    pextrw    [r0 + r1 + 4], m0, 2
> >+
> >+    lea       r0, [r0 + 2 * r1]
> >+    lea       r2, [r2 + 2 * r3]
> >+
> >+    movu      m0, [r2]
> >+    movu      m1, [r2 + r3]
> >+    packuswb  m0, m1
> >+
> >+    movd      [r0], m0
> >+    pextrw    [r0 + 4], m0, 2
> >+
> >+    movhlps   m0, m0
> >+    movd      [r0 + r1], m0
> >+    pextrw    [r0 + r1 + 4], m0, 2
> >+
> >+    lea       r0, [r0 + 2 * r1]
> >+    lea       r2, [r2 + 2 * r3]
> >+
> >+    movu      m0, [r2]
> >+    movu      m1, [r2 + r3]
> >+    packuswb  m0, m1
> >+
> >+    movd      [r0], m0
> >+    pextrw    [r0 + 4], m0, 2
> >+
> >+    movhlps   m0, m0
> >+    movd      [r0 + r1], m0
> >+    pextrw    [r0 + r1 + 4], m0, 2
> >+
> >+    RET
> >
> > ;-----------------------------------------------------------------------------
> > ; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140211/bb0a3e4d/attachment.html>


More information about the x265-devel mailing list