[x265] [PATCH] copy_cnt_8, AVX2 asm code as per new interface, performance improved from 5.13x to 7.59x on HASWELL-I5

Praveen Tiwari praveen at multicorewareinc.com
Wed Sep 10 07:53:52 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1410328371 -19800
# Node ID d29cb300975a491287abdfb6abd2a9d3141e99f0
# Parent  408e2e6f0f709525cedb784a65386a116f2d3d00
copy_cnt_8, AVX2 asm code as per new interface, performance improved from 5.13x to 7.59x on HASWELL-I5

diff -r 408e2e6f0f70 -r d29cb300975a source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Sep 09 22:23:26 2014 +0200
+++ b/source/common/x86/blockcopy8.asm	Wed Sep 10 11:22:51 2014 +0530
@@ -4079,85 +4079,44 @@
 
 
 INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal copy_cnt_8, 3,4,6
-  %define tmpd eax
-%else
-cglobal copy_cnt_8, 3,5,6
-  %define tmpd r4d
-%endif
+cglobal copy_cnt_8, 3,4,5                      ; r0 = dst (stored to), r1 = src (loaded from), r2 = row stride; uses r0-r3 and m0-m4
     add         r2d, r2d
-    pxor        m4, m4
     lea         r3, [r2 * 3]
 
-    ; row 0
+    ; row 0 - 1: row 0 in the low 128 bits of m0, row 1 in the high 128 bits
     movu        xm0, [r1]
-    mova        xm2, xm0
-    pmovsxwd    m1, xm0
-    movu        [r0 + 0 * mmsize], m1
-
-    ; row 1
-    movu        xm0, [r1 + r2]
-    vinserti128 m2, m2, xm0, 1
-    pmovsxwd    m1, xm0
-    movu        [r0 + 1 * mmsize], m1
-
-    ; row 2
-    movu        xm0, [r1 + r2 * 2]
-    mova        xm5, xm0
-    pmovsxwd    m1, xm0
-    movu        [r0 + 2 * mmsize], m1
-
-    ; row 3
-    movu        xm0, [r1 + r3]
-    vinserti128 m5, m5, xm0, 1
-    packsswb    m2, m5
-    pcmpeqb     m2, m4
-    pmovmskb    tmpd, m2
-    not         tmpd
-    popcnt      tmpd, tmpd
-    pmovsxwd    m1, xm0
-    movu        [r0 + 3 * mmsize], m1
-
-    add         r0, 4 * mmsize
-    lea         r1, [r1 + r2 * 4]
-
-    ; row 4
-    movu        xm0, [r1]
-    mova        xm2, xm0
-    pmovsxwd    m1, xm0
-    movu        [r0 + 0 * mmsize], m1
-
-    ; row 5
-    movu        xm0, [r1 + r2]
-    vinserti128 m2, m2, xm0, 1
-    pmovsxwd    m1, xm0
-    movu        [r0 + 1 * mmsize], m1
-
-    ; row 6
-    movu        xm0, [r1 + r2 * 2]
-    mova        xm5, xm0
-    pmovsxwd    m1, xm0
-    movu        [r0 + 2 * mmsize], m1
-
-    ; row 7
-    movu        xm0, [r1 + r3]
-    pmovsxwd    m1, xm0
-    movu        [r0 + 3 * mmsize], m1
-    vinserti128 m5, m5, xm0, 1
+    vinserti128 m0, m0, [r1 + r2], 1           ; high 128 bits <- row at r1 + stride
+    movu        [r0], m0                       ; store both rows (32 bytes) unchanged to dst
+
+    ; row 2 - 3: same pairing, kept in m1 for the count below
+    movu        xm1, [r1 + r2 * 2]
+    vinserti128 m1, m1, [r1 + r3], 1           ; r3 = 3 * r2 (set by the lea above)
+    movu        [r0 + 32], m1
+    lea         r1,  [r1 + r2 * 4]             ; advance src by four row strides
+
+    ; row 4 - 5: same pairing, kept in m2
+    movu        xm2, [r1]
+    vinserti128 m2, m2, [r1 + r2], 1
+    movu        [r0 + 64], m2
+
+    ; row 6 - 7: same pairing, kept in m3
+    movu        xm3, [r1 + r2 * 2]
+    vinserti128 m3, m3, [r1 + r3], 1
+    movu        [r0 + 96], m3
 
     ; get count
-    packsswb    m2, m5
-    pcmpeqb     m2, m4
-    pmovmskb    r0d, m2
-    not         r0d
-    popcnt      r0d, r0d
-
-%if ARCH_X86_64 == 1
-    add         tmpd, r0d
-%else
-    add         r0d, tmpd
-%endif
+    xorpd        m4, m4                        ; m4 = 0, used as the zero operand of psadbw
+    vpacksswb    m0, m1                        ; words -> bytes with signed saturation: a nonzero word stays a nonzero byte (rows 0-3)
+    vpacksswb    m2, m3                        ; same for rows 4-7; per-lane byte order is irrelevant because only a total is taken
+    pminub       m0, [pb_1]                    ; unsigned min with pb_1 -> 1 per nonzero byte, 0 otherwise (pb_1 is defined outside this hunk; presumably all bytes = 1 - confirm)
+    pminub       m2, [pb_1]
+    paddb        m0, m2                        ; per-byte partial counts, at most 2 each
+    vextracti128 xm1, m0, 1                    ; bring the high 128-bit lane down ...
+    paddb        xm0, xm1                      ; ... and fold it into the low lane (at most 4 per byte)
+    psadbw       xm0, xm4                      ; sum of absolute differences against zero = sum of the 8 bytes in each 64-bit half
+    movhlps      xm1, xm0                      ; upper-half sum -> low qword of xm1
+    paddd        xm0, xm1                      ; low dword = total number of nonzero input words
+    movd         eax, xm0                      ; total is left in eax at RET
     RET
 
 


More information about the x265-devel mailing list