[x265] [PATCH] asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)
Nabajit Deka
nabajit at multicorewareinc.com
Tue Feb 11 11:50:37 CET 2014
Yes, it gives a small improvement.
On Tue, Feb 11, 2014 at 4:06 PM, chen <chenm003 at 163.com> wrote:
> The code is similar; the only change is a reduced number of MOV instructions.
>
> At 2014-02-11 17:51:09,nabajit at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Nabajit Deka
> ># Date 1392112254 -19800
> ># Tue Feb 11 15:20:54 2014 +0530
> ># Node ID e6e9310bc545a84fd30533fc7739912c55179d17
> ># Parent 07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
> >asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)
> >
> >diff -r 07b5d6b82f5f -r e6e9310bc545 source/common/x86/blockcopy8.asm
> >--- a/source/common/x86/blockcopy8.asm Mon Feb 10 15:05:04 2014 -0600
> >+++ b/source/common/x86/blockcopy8.asm Tue Feb 11 15:20:54 2014 +0530
> >@@ -1246,31 +1246,27 @@
> > ; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> >-cglobal blockcopy_sp_2x4, 4, 5, 4, dest, destStride, src, srcStride
> >-
> >-add r3, r3
> >-
> >-movd m0, [r2]
> >-movd m1, [r2 + r3]
> >-movd m2, [r2 + 2 * r3]
> >-lea r2, [r2 + 2 * r3]
> >-movd m3, [r2 + r3]
> >-
> >-packuswb m0, m1
> >-packuswb m2, m3
> >-
> >-pextrw r4, m0, 0
> >-mov [r0], r4w
> >-
> >-pextrw r4, m0, 4
> >-mov [r0 + r1], r4w
> >-
> >-pextrw r4, m2, 0
> >+cglobal blockcopy_sp_2x4, 4, 5, 2
> >+
> >+add r3, r3
> >+
> >+;Row 0-1
> >+movd m0, [r2]
> >+movd m1, [r2 + r3]
> >+packuswb m0, m1
> >+movd r4d, m0
> >+mov [r0], r4w
> >+pextrw [r0 + r1], m0, 4
> >+
> >+;Row 2-3
> >+movd m0, [r2 + 2 * r3]
> >+lea r2, [r2 + 2 * r3]
> >+movd m1, [r2 + r3]
> >+packuswb m0, m1
> >+movd r4d, m0
> > mov [r0 + 2 * r1], r4w
> >-
> >-lea r0, [r0 + 2 * r1]
> >-pextrw r4, m2, 4
> >-mov [r0 + r1], r4w
> >+lea r0, [r0 + 2 * r1]
> >+pextrw [r0 + r1], m0, 4
> >
> > RET
> >
> >@@ -1279,53 +1275,47 @@
> > ; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> >-cglobal blockcopy_sp_2x8, 4, 5, 8, dest, destStride, src, srcStride
> >-
> >-add r3, r3
> >-
> >-movd m0, [r2]
> >-movd m1, [r2 + r3]
> >-movd m2, [r2 + 2 * r3]
> >-lea r2, [r2 + 2 * r3]
> >-movd m3, [r2 + r3]
> >-movd m4, [r2 + 2 * r3]
> >-lea r2, [r2 + 2 * r3]
> >-movd m5, [r2 + r3]
> >-movd m6, [r2 + 2 * r3]
> >-lea r2, [r2 + 2 * r3]
> >-movd m7, [r2 + r3]
> >-
> >-packuswb m0, m1
> >-packuswb m2, m3
> >-packuswb m4, m5
> >-packuswb m6, m7
> >-
> >-pextrw r4, m0, 0
> >-mov [r0], r4w
> >-
> >-pextrw r4, m0, 4
> >-mov [r0 + r1], r4w
> >-
> >-pextrw r4, m2, 0
> >+cglobal blockcopy_sp_2x8, 4, 5, 2
> >+
> >+add r3, r3
> >+
> >+;Row 0-1
> >+movd m0, [r2]
> >+movd m1, [r2 + r3]
> >+packuswb m0, m1
> >+movd r4d, m0
> >+mov [r0], r4w
> >+pextrw [r0 + r1], m0, 4
> >+
> >+;Row 2-3
> >+movd m0, [r2 + 2 * r3]
> >+lea r2, [r2 + 2 * r3]
> >+movd m1, [r2 + r3]
> >+packuswb m0, m1
> >+movd r4d, m0
> > mov [r0 + 2 * r1], r4w
> >-
> >-lea r0, [r0 + 2 * r1]
> >-pextrw r4, m2, 4
> >-mov [r0 + r1], r4w
> >-
> >-pextrw r4, m4, 0
> >+lea r0, [r0 + 2 * r1]
> >+pextrw [r0 + r1], m0, 4
> >+
> >+;Row 4-5
> >+movd m0, [r2 + 2 * r3]
> >+lea r2, [r2 + 2 * r3]
> >+movd m1, [r2 + r3]
> >+packuswb m0, m1
> >+movd r4d, m0
> > mov [r0 + 2 * r1], r4w
> >-
> >-lea r0, [r0 + 2 * r1]
> >-pextrw r4, m4, 4
> >-mov [r0 + r1], r4w
> >-
> >-pextrw r4, m6, 0
> >+lea r0, [r0 + 2 * r1]
> >+pextrw [r0 + r1], m0, 4
> >+
> >+;Row 6-7
> >+movd m0, [r2 + 2 * r3]
> >+lea r2, [r2 + 2 * r3]
> >+movd m1, [r2 + r3]
> >+packuswb m0, m1
> >+movd r4d, m0
> > mov [r0 + 2 * r1], r4w
> >-
> >-lea r0, [r0 + 2 * r1]
> >-pextrw r4, m6, 4
> >-mov [r0 + r1], r4w
> >+lea r0, [r0 + 2 * r1]
> >+pextrw [r0 + r1], m0, 4
> >
> > RET
> >
> >@@ -1477,40 +1467,65 @@
> > ;-----------------------------------------------------------------------------
> > ; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> > ;-----------------------------------------------------------------------------
> >-%macro BLOCKCOPY_SP_W6_H4 2
> > INIT_XMM sse4
> >-cglobal blockcopy_sp_6x8, 4, 6, 2, dest, destStride, src, srcStride
> >-
> >-mov r4d, %2/2
> >-
> >-add r3, r3
> >-
> >-.loop
> >- movu m0, [r2]
> >- movu m1, [r2 + r3]
> >-
> >-
> >- packuswb m0, m1
> >-
> >- movd [r0], m0
> >- pextrw r5, m0, 2
> >- mov [r0 + 4], r5w
> >-
> >- pextrw r5, m0, 6
> >- pshufd m0, m0, 2
> >- movd [r0 + r1], m0
> >- mov [r0 + r1 + 4], r5w
> >-
> >- lea r0, [r0 + 2 * r1]
> >- lea r2, [r2 + 2 * r3]
> >-
> >- dec r4d
> >- jnz .loop
> >-
> >-RET
> >-%endmacro
> >-
> >-BLOCKCOPY_SP_W6_H4 6, 8
> >+cglobal blockcopy_sp_6x8, 4, 4, 2
> >+
> >+ add r3, r3
> >+
> >+ movu m0, [r2]
> >+ movu m1, [r2 + r3]
> >+ packuswb m0, m1
> >+
> >+ movd [r0], m0
> >+ pextrw [r0 + 4], m0, 2
> >+
> >+ movhlps m0, m0
> >+ movd [r0 + r1], m0
> >+ pextrw [r0 + r1 + 4], m0, 2
> >+
> >+ lea r0, [r0 + 2 * r1]
> >+ lea r2, [r2 + 2 * r3]
> >+
> >+ movu m0, [r2]
> >+ movu m1, [r2 + r3]
> >+ packuswb m0, m1
> >+
> >+ movd [r0], m0
> >+ pextrw [r0 + 4], m0, 2
> >+
> >+ movhlps m0, m0
> >+ movd [r0 + r1], m0
> >+ pextrw [r0 + r1 + 4], m0, 2
> >+
> >+ lea r0, [r0 + 2 * r1]
> >+ lea r2, [r2 + 2 * r3]
> >+
> >+ movu m0, [r2]
> >+ movu m1, [r2 + r3]
> >+ packuswb m0, m1
> >+
> >+ movd [r0], m0
> >+ pextrw [r0 + 4], m0, 2
> >+
> >+ movhlps m0, m0
> >+ movd [r0 + r1], m0
> >+ pextrw [r0 + r1 + 4], m0, 2
> >+
> >+ lea r0, [r0 + 2 * r1]
> >+ lea r2, [r2 + 2 * r3]
> >+
> >+ movu m0, [r2]
> >+ movu m1, [r2 + r3]
> >+ packuswb m0, m1
> >+
> >+ movd [r0], m0
> >+ pextrw [r0 + 4], m0, 2
> >+
> >+ movhlps m0, m0
> >+ movd [r0 + r1], m0
> >+ pextrw [r0 + r1 + 4], m0, 2
> >+
> >+ RET
> >
> > ;-----------------------------------------------------------------------------
> > ; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140211/bb0a3e4d/attachment.html>
More information about the x265-devel mailing list