[x265] [PATCH] copy_cnt_16: avx2 asm code as per new interface, improved 514.32 cycles -> 313.66 cycles

chen chenm003 at 163.com
Thu Sep 11 19:49:03 CEST 2014


 

At 2014-09-11 21:38:32,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1410442704 -19800
># Node ID df74723eb9a3861f6bba7f33d09a37efe53932a4
># Parent  9241634204a12babf8a2a90dc4f776646a9ddfb3
>copy_cnt_16: avx2 asm code as per new interface, improved 514.32 cycles -> 313.66 cycles
>
>diff -r 9241634204a1 -r df74723eb9a3 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm	Thu Sep 11 17:33:44 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm	Thu Sep 11 19:08:24 2014 +0530
>@@ -4161,67 +4161,46 @@
> INIT_YMM avx2
> cglobal copy_cnt_16, 3,5,5
>     add         r2d, r2d
>-    lea         r4, [r2 * 3]
>-    mov         r3d, 16/4
>-    ; NOTE: xorpd is faster than pxor
>+    lea         r3,  [r2 * 3]
>+    mov         r4d, 16/4
>+
>+    movu        m3, [pb_1]

Why movu here? If pb_1 is an aligned constant, could this be an aligned load (mova) instead?
 
>+    movu        [r0 + 32], m1
>+    packsswb   m0, m1

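Indentation: the operands of this packsswb line are one column short of the surrounding instructions; please align them.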
 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140912/11adb774/attachment.html>


More information about the x265-devel mailing list