[x265-commits] [x265] backout 0dc2cbc36ee5 to 331ef5121676

Steve Borho steve at borho.org
Tue Sep 9 18:58:02 CEST 2014


details:   http://hg.videolan.org/x265/rev/491e74c58e51
branches:  
changeset: 8031:491e74c58e51
user:      Steve Borho <steve at borho.org>
date:      Tue Sep 09 18:55:48 2014 +0200
description:
backout 0dc2cbc36ee5 to 331ef5121676

diffstat:

 source/common/x86/blockcopy8.asm |  127 +++++++++++++++++++++++++-------------
 1 files changed, 83 insertions(+), 44 deletions(-)

diffs (167 lines):

diff -r 2d9eb8cebb71 -r 491e74c58e51 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Sep 09 20:02:39 2014 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Sep 09 18:55:48 2014 +0200
@@ -3973,12 +3973,13 @@ cglobal copy_cnt_4, 3,3,3
 
     ; get count
     ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
-%if 1
+%if 0
     pmovmskb    eax, m0
     not         ax
     popcnt      ax, ax
 %else
-    paddb       m0, [pb_1]
+    mova        m1, [pb_1]
+    paddb       m0, m1
     psadbw      m0, m2
     pshufd      m1, m0, 2
     paddw       m0, m1
@@ -3990,7 +3991,7 @@ cglobal copy_cnt_4, 3,3,3
 INIT_YMM avx2
 cglobal copy_cnt_4, 3,3,3
     add         r2d, r2d
-    xorpd       m2,  m2
+    xorpd       xm2, xm2
 
     ; row 0 & 1
     movq        xm0, [r1]
@@ -4004,9 +4005,11 @@ cglobal copy_cnt_4, 3,3,3
     vinserti128 m0, m0, xm1, 1
     movu    [r0], m0
 
+    vextractf128 xm1, m0, 1
+    packsswb     xm0, xm1
+    pcmpeqb      xm0, xm2
+
     ; get count
-    packsswb    xm0, xm1
-    pcmpeqb     xm0, xm2
     pmovmskb    eax, xm0
     not         ax
     popcnt      ax, ax
@@ -4076,49 +4079,85 @@ cglobal copy_cnt_8, 3,3,6
 
 
 INIT_YMM avx2
-cglobal copy_cnt_8, 3,3,6
+%if ARCH_X86_64 == 1
+cglobal copy_cnt_8, 3,4,6
+  %define tmpd eax
+%else
+cglobal copy_cnt_8, 3,5,6
+  %define tmpd r4d
+%endif
     add         r2d, r2d
-    xorpd       m5, m5
-
-    ; row 0 - 1
+    pxor        m4, m4
+    lea         r3, [r2 * 3]
+
+    ; row 0
     movu        xm0, [r1]
-    movu        xm1, [r1 + r2]
-    vinserti128 m0, m0, xm1, 1
-    movu        [r0], m0
-
-    ; row 2 - 3
-    movu        xm1, [r1 + r2 * 2]
-    lea         r1,  [r1 + r2 * 2]
-    movu        xm2, [r1 + r2]
-    vinserti128 m1, m1, xm2, 1
-    movu        [r0 + 32], m1
-
-    ; row 4 - 5
-    movu        xm2, [r1 + r2 * 2]
-    lea         r1,  [r1 + r2 * 2]
-    movu        xm3, [r1 + r2]
-    vinserti128 m2, m2, xm3, 1
-    movu        [r0 + 64], m2
-
-    ; row 6 - 7
-    movu        xm3, [r1 + r2 * 2]
-    lea         r1,  [r1 + r2 * 2]
-    movu        xm4, [r1 + r2]
-    vinserti128 m3, m3, xm4, 1
-    movu        [r0 + 96], m3
+    mova        xm2, xm0
+    pmovsxwd    m1, xm0
+    movu        [r0 + 0 * mmsize], m1
+
+    ; row 1
+    movu        xm0, [r1 + r2]
+    vinserti128 m2, m2, xm0, 1
+    pmovsxwd    m1, xm0
+    movu        [r0 + 1 * mmsize], m1
+
+    ; row 2
+    movu        xm0, [r1 + r2 * 2]
+    mova        xm5, xm0
+    pmovsxwd    m1, xm0
+    movu        [r0 + 2 * mmsize], m1
+
+    ; row 3
+    movu        xm0, [r1 + r3]
+    vinserti128 m5, m5, xm0, 1
+    packsswb    m2, m5
+    pcmpeqb     m2, m4
+    pmovmskb    tmpd, m2
+    not         tmpd
+    popcnt      tmpd, tmpd
+    pmovsxwd    m1, xm0
+    movu        [r0 + 3 * mmsize], m1
+
+    add         r0, 4 * mmsize
+    lea         r1, [r1 + r2 * 4]
+
+    ; row 4
+    movu        xm0, [r1]
+    mova        xm2, xm0
+    pmovsxwd    m1, xm0
+    movu        [r0 + 0 * mmsize], m1
+
+    ; row 5
+    movu        xm0, [r1 + r2]
+    vinserti128 m2, m2, xm0, 1
+    pmovsxwd    m1, xm0
+    movu        [r0 + 1 * mmsize], m1
+
+    ; row 6
+    movu        xm0, [r1 + r2 * 2]
+    mova        xm5, xm0
+    pmovsxwd    m1, xm0
+    movu        [r0 + 2 * mmsize], m1
+
+    ; row 7
+    movu        xm0, [r1 + r3]
+    pmovsxwd    m1, xm0
+    movu        [r0 + 3 * mmsize], m1
+    vinserti128 m5, m5, xm0, 1
 
     ; get count
-    vpacksswb    m0, m1
-    vpcmpeqb     m0, m5
-    vpmovmskb    eax, m0
-    not          eax
-    popcnt       eax, eax
-    vpacksswb    m2, m3
-    vpcmpeqb     m2, m5
-    vpmovmskb    r1d, m2
-    not          r1d
-    popcnt       r1d, r1d
-    add          eax, r1d
+    packsswb    m2, m5
+    pcmpeqb     m2, m4
+    pmovmskb    r0d, m2
+    not         r0d
+    popcnt      r0d, r0d
+
+%if ARCH_X86_64 == 1
+    add         tmpd, r0d
+%else
+    add         r0d, tmpd
+%endif
     RET
 
 


More information about the x265-commits mailing list