[x265] [PATCH] copy_cnt 4x4, eliminated move instructions, +1x improvement

chen chenm003 at 163.com
Fri Sep 5 19:26:13 CEST 2014


 

At 2014-09-05 14:04:06,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1409897037 -19800
># Node ID 1847e02a9514b02690cc4b97bae022091bf33424
># Parent  0b3f68d5f1699540c71ed7b75e2b0ca965fad82f
>copy_cnt 4x4, eliminated move instructions, +1x improvement
>
>diff -r 0b3f68d5f169 -r 1847e02a9514 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm	Fri Sep 05 11:20:29 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm	Fri Sep 05 11:33:57 2014 +0530
>@@ -3953,42 +3953,37 @@
> ; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
>-cglobal copy_cnt_4, 3,3,5
>+cglobal copy_cnt_4, 3,3,3
>     add         r2d, r2d
>-    pxor        m4, m4
>+    pxor        m2, m2
> 
>      ; row 0 & 1
>      movh        m0, [r1]
>-     movh        m1, [r1 + r2]
>-     movh        [r0], m0
>-     movh        [r0 + 8], m1
>-
>-     mova        m2, [r0]
>+     movhps      m0, [r1 + r2]
>+     mova        [r0], m0

Are you sure [r0] is a 16-byte-aligned address? (mova requires an aligned memory operand.)
> 
>      ; row 2 & 3
>-     movh        m0, [r1 + r2 * 2]
>+     movh        m1, [r1 + r2 * 2]
>      lea         r2, [r2 * 3]
>-     movh        m1, [r1 + r2]
>-     movh        [r0 + 16], m0
>-     movh        [r0 + 24], m1
>- 
>-     mova        m0, [r0 + 16]
>-     packsswb    m2, m0
>-     pcmpeqb     m2, m4
>+     movhps      m1, [r1 + r2]
>+     mova        [r0 + 16], m1
>+
>+     packsswb    m0, m1
>+     pcmpeqb     m0, m2
> 
>      ; get count
>      ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
> %if 0
>-     pmovmskb    eax, m2
>+     pmovmskb    eax, m0
>      not         ax
>      popcnt      ax, ax
> %else
>-     mova        m0, [pb_1]
>-     paddb       m2, m0
>-     psadbw      m2, m4
>-     pshufd      m0, m2, 2
>-     paddw       m2, m0
>-     movd        eax, m2
>+     mova        m1, [pb_1]

>+     paddb       m0, m1

This is my old code path and it is not optimized; 'paddb m0, [pb_1]' (memory operand, no separate load) gives smaller code size.
>+     psadbw      m0, m2
>+     pshufd      m1, m0, 2
>+     paddw       m0, m1
>+     movd        eax, m0
>  %endif
>      RET
> 
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140906/9c9cd129/attachment.html>


More information about the x265-devel mailing list