[x265] [PATCH] blockcopy_sp_16xN, optimized asm code

chen chenm003 at 163.com
Fri Nov 8 14:40:22 CET 2013


The code is right, but it needs to be run through uncrustify first, e.g. the line: add r3, r3

At 2013-11-08 21:32:05, praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1383917516 -19800
># Node ID 662664f0863b38b838a15867745c5564f574fb09
># Parent  227a5666e08869d36e07a75f3db95dd94c774715
>blockcopy_sp_16xN, optimized asm code
>
>diff -r 227a5666e088 -r 662664f0863b source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Fri Nov 08 17:38:24 2013 +0530
>+++ b/source/common/x86/blockcopy8.asm Fri Nov 08 19:01:56 2013 +0530
>@@ -1325,51 +1325,38 @@
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W16_H4 2
> INIT_XMM sse2
>-cglobal blockcopy_sp_%1x%2, 4, 7, 7, dest, destStride, src, srcStride
>+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> 
>-mov         r6d,    %2
>+mov             r4d,     %2/4
> 
>-add        r3,      r3
>-
>-mova       m0,      [tab_Vm]
>+add             r3,      r3
> 
> .loop
>-     movu       m1,      [r2]
>-     movu       m2,      [r2 + 16]
>-     movu       m3,      [r2 + r3]
>-     movu       m4,      [r2 + r3 + 16]
>-     movu       m5,      [r2 + 2 * r3]
>-     movu       m6,      [r2 + 2 * r3 + 16]
>+     movu       m0,      [r2]
>+     movu       m1,      [r2 + 16]
>+     movu       m2,      [r2 + r3]
>+     movu       m3,      [r2 + r3 + 16]
>+     movu       m4,      [r2 + 2 * r3]
>+     movu       m5,      [r2 + 2 * r3 + 16]
>+     lea        r2,      [r2 + 2 * r3]
>+     movu       m6,      [r2 + r3]
>+     movu       m7,      [r2 + r3 + 16]
> 
>-     pshufb     m1,      m0
>-     pshufb     m2,      m0
>-     pshufb     m3,      m0
>-     pshufb     m4,      m0
>-     pshufb     m5,      m0
>-     pshufb     m6,      m0
>+     packuswb   m0,      m1
>+     packuswb   m2,      m3
>+     packuswb   m4,      m5
>+     packuswb   m6,      m7
> 
>-     movh       [r0],              m1
>-     movh       [r0 + 8],          m2
>-     movh       [r0 + r1],         m3
>-     movh       [r0 + r1 + 8],     m4
>-     movh       [r0 + 2 * r1],     m5
>-     movh       [r0 + 2 * r1 + 8], m6
>+     movu       [r0],              m0
>+     movu       [r0 + r1],         m2
>+     movu       [r0 + 2 * r1],     m4
>+     lea        r0,                [r0 + 2 * r1]
>+     movu       [r0 + r1],         m6
> 
>-     lea        r4,      [r2 + 2 * r3]
>-     movu       m1,      [r4 + r3]
>-     movu       m2,      [r4 + r3 + 16]
>+     lea        r0,                [r0 + 2 * r1]
>+     lea        r2,                [r2 + 2 * r3]
> 
>-     pshufb     m1,      m0
>-     pshufb     m2,      m0
>-
>-     lea        r5,            [r0 + 2 * r1]
>-     movh       [r5 + r1],     m1
>-     movh       [r5 + r1 + 8], m2
>-
>-     lea        r0,              [r5 + 2 * r1]
>-     lea        r2,              [r4 + 2 * r3]
>-
>-     sub        r6d,             4
>+     dec        r4d
>      jnz        .loop
> 
> RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131108/8a891e4d/attachment-0001.html>


More information about the x265-devel mailing list