[x265] [PATCH] copy_cnt_4: faster AVX2 code

praveen at multicorewareinc.com
Tue Sep 9 10:37:23 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1410251834 -19800
# Node ID d011073f35258cb2f0ad95db6038c2d9fb840b27
# Parent  ebb84e9dbb0fa0e8c4c9304b2efd57f8ac3d0c05
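copy_cnt_4: faster AVX2 code

Drop the redundant vextractf128: xm1 is not modified after vinserti128
copies it into the upper lane of m0, so it can feed packsswb directly.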

diff -r ebb84e9dbb0f -r d011073f3525 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Sep 09 11:36:58 2014 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Sep 09 14:07:14 2014 +0530
@@ -3990,7 +3990,7 @@
 INIT_YMM avx2
 cglobal copy_cnt_4, 3,3,3
     add         r2d, r2d
-    xorpd       xm2, xm2
+    xorpd       m2,  m2
 
     ; row 0 & 1
     movq        xm0, [r1]
@@ -4004,11 +4004,9 @@
     vinserti128 m0, m0, xm1, 1
     movu    [r0], m0
 
-    vextractf128 xm1, m0, 1
-    packsswb     xm0, xm1
-    pcmpeqb      xm0, xm2
-
     ; get count
+    packsswb    xm0, xm1
+    pcmpeqb     xm0, xm2
     pmovmskb    eax, xm0
     not         ax
     popcnt      ax, ax


More information about the x265-devel mailing list