<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>The code is similar; the patch only reduces the number of MOV instructions.</DIV><PRE>At 2014-02-11 17:51:09,nabajit@multicorewareinc.com wrote:
># HG changeset patch
># User Nabajit Deka
># Date 1392112254 -19800
># Tue Feb 11 15:20:54 2014 +0530
># Node ID e6e9310bc545a84fd30533fc7739912c55179d17
># Parent 07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
>asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)
>
>diff -r 07b5d6b82f5f -r e6e9310bc545 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Mon Feb 10 15:05:04 2014 -0600
>+++ b/source/common/x86/blockcopy8.asm Tue Feb 11 15:20:54 2014 +0530
>@@ -1246,31 +1246,27 @@
> ; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
>-cglobal blockcopy_sp_2x4, 4, 5, 4, dest, destStride, src, srcStride
>-
>-add r3, r3
>-
>-movd m0, [r2]
>-movd m1, [r2 + r3]
>-movd m2, [r2 + 2 * r3]
>-lea r2, [r2 + 2 * r3]
>-movd m3, [r2 + r3]
>-
>-packuswb m0, m1
>-packuswb m2, m3
>-
>-pextrw r4, m0, 0
>-mov [r0], r4w
>-
>-pextrw r4, m0, 4
>-mov [r0 + r1], r4w
>-
>-pextrw r4, m2, 0
>+cglobal blockcopy_sp_2x4, 4, 5, 2
>+
>+add r3, r3
>+
>+;Row 0-1
>+movd m0, [r2]
>+movd m1, [r2 + r3]
>+packuswb m0, m1
>+movd r4d, m0
>+mov [r0], r4w
>+pextrw [r0 + r1], m0, 4
>+
>+;Row 2-3
>+movd m0, [r2 + 2 * r3]
>+lea r2, [r2 + 2 * r3]
>+movd m1, [r2 + r3]
>+packuswb m0, m1
>+movd r4d, m0
> mov [r0 + 2 * r1], r4w
>-
>-lea r0, [r0 + 2 * r1]
>-pextrw r4, m2, 4
>-mov [r0 + r1], r4w
>+lea r0, [r0 + 2 * r1]
>+pextrw [r0 + r1], m0, 4
>
> RET
>
>@@ -1279,53 +1275,47 @@
> ; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
>-cglobal blockcopy_sp_2x8, 4, 5, 8, dest, destStride, src, srcStride
>-
>-add r3, r3
>-
>-movd m0, [r2]
>-movd m1, [r2 + r3]
>-movd m2, [r2 + 2 * r3]
>-lea r2, [r2 + 2 * r3]
>-movd m3, [r2 + r3]
>-movd m4, [r2 + 2 * r3]
>-lea r2, [r2 + 2 * r3]
>-movd m5, [r2 + r3]
>-movd m6, [r2 + 2 * r3]
>-lea r2, [r2 + 2 * r3]
>-movd m7, [r2 + r3]
>-
>-packuswb m0, m1
>-packuswb m2, m3
>-packuswb m4, m5
>-packuswb m6, m7
>-
>-pextrw r4, m0, 0
>-mov [r0], r4w
>-
>-pextrw r4, m0, 4
>-mov [r0 + r1], r4w
>-
>-pextrw r4, m2, 0
>+cglobal blockcopy_sp_2x8, 4, 5, 2
>+
>+add r3, r3
>+
>+;Row 0-1
>+movd m0, [r2]
>+movd m1, [r2 + r3]
>+packuswb m0, m1
>+movd r4d, m0
>+mov [r0], r4w
>+pextrw [r0 + r1], m0, 4
>+
>+;Row 2-3
>+movd m0, [r2 + 2 * r3]
>+lea r2, [r2 + 2 * r3]
>+movd m1, [r2 + r3]
>+packuswb m0, m1
>+movd r4d, m0
> mov [r0 + 2 * r1], r4w
>-
>-lea r0, [r0 + 2 * r1]
>-pextrw r4, m2, 4
>-mov [r0 + r1], r4w
>-
>-pextrw r4, m4, 0
>+lea r0, [r0 + 2 * r1]
>+pextrw [r0 + r1], m0, 4
>+
>+;Row 4-5
>+movd m0, [r2 + 2 * r3]
>+lea r2, [r2 + 2 * r3]
>+movd m1, [r2 + r3]
>+packuswb m0, m1
>+movd r4d, m0
> mov [r0 + 2 * r1], r4w
>-
>-lea r0, [r0 + 2 * r1]
>-pextrw r4, m4, 4
>-mov [r0 + r1], r4w
>-
>-pextrw r4, m6, 0
>+lea r0, [r0 + 2 * r1]
>+pextrw [r0 + r1], m0, 4
>+
>+;Row 6-7
>+movd m0, [r2 + 2 * r3]
>+lea r2, [r2 + 2 * r3]
>+movd m1, [r2 + r3]
>+packuswb m0, m1
>+movd r4d, m0
> mov [r0 + 2 * r1], r4w
>-
>-lea r0, [r0 + 2 * r1]
>-pextrw r4, m6, 4
>-mov [r0 + r1], r4w
>+lea r0, [r0 + 2 * r1]
>+pextrw [r0 + r1], m0, 4
>
> RET
>
>@@ -1477,40 +1467,65 @@
> ;-----------------------------------------------------------------------------
> ; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
>-%macro BLOCKCOPY_SP_W6_H4 2
> INIT_XMM sse4
>-cglobal blockcopy_sp_6x8, 4, 6, 2, dest, destStride, src, srcStride
>-
>-mov r4d, %2/2
>-
>-add r3, r3
>-
>-.loop
>- movu m0, [r2]
>- movu m1, [r2 + r3]
>-
>-
>- packuswb m0, m1
>-
>- movd [r0], m0
>- pextrw r5, m0, 2
>- mov [r0 + 4], r5w
>-
>- pextrw r5, m0, 6
>- pshufd m0, m0, 2
>- movd [r0 + r1], m0
>- mov [r0 + r1 + 4], r5w
>-
>- lea r0, [r0 + 2 * r1]
>- lea r2, [r2 + 2 * r3]
>-
>- dec r4d
>- jnz .loop
>-
>-RET
>-%endmacro
>-
>-BLOCKCOPY_SP_W6_H4 6, 8
>+cglobal blockcopy_sp_6x8, 4, 4, 2
>+
>+ add r3, r3
>+
>+ movu m0, [r2]
>+ movu m1, [r2 + r3]
>+ packuswb m0, m1
>+
>+ movd [r0], m0
>+ pextrw [r0 + 4], m0, 2
>+
>+ movhlps m0, m0
>+ movd [r0 + r1], m0
>+ pextrw [r0 + r1 + 4], m0, 2
>+
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+
>+ movu m0, [r2]
>+ movu m1, [r2 + r3]
>+ packuswb m0, m1
>+
>+ movd [r0], m0
>+ pextrw [r0 + 4], m0, 2
>+
>+ movhlps m0, m0
>+ movd [r0 + r1], m0
>+ pextrw [r0 + r1 + 4], m0, 2
>+
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+
>+ movu m0, [r2]
>+ movu m1, [r2 + r3]
>+ packuswb m0, m1
>+
>+ movd [r0], m0
>+ pextrw [r0 + 4], m0, 2
>+
>+ movhlps m0, m0
>+ movd [r0 + r1], m0
>+ pextrw [r0 + r1 + 4], m0, 2
>+
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+
>+ movu m0, [r2]
>+ movu m1, [r2 + r3]
>+ packuswb m0, m1
>+
>+ movd [r0], m0
>+ pextrw [r0 + 4], m0, 2
>+
>+ movhlps m0, m0
>+ movd [r0 + r1], m0
>+ pextrw [r0 + r1 + 4], m0, 2
>+
>+ RET
>
> ;-----------------------------------------------------------------------------
> ; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</PRE></div>