<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div> </div><pre><br>At 2014-09-09 20:23:25,praveen@multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1410265389 -19800
># Node ID cbb7e2beff033e441a13cd82bbfd85b362cd6d24
># Parent d011073f35258cb2f0ad95db6038c2d9fb840b27
>copy_cnt_8 AVX2 asm code, as per new interface
>
>diff -r d011073f3525 -r cbb7e2beff03 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Tue Sep 09 14:07:14 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Tue Sep 09 17:53:09 2014 +0530
>@@ -4076,85 +4076,49 @@
>
>
> INIT_YMM avx2
>-%if ARCH_X86_64 == 1
>-cglobal copy_cnt_8, 3,4,6
>- %define tmpd eax
>-%else
>-cglobal copy_cnt_8, 3,5,6
>- %define tmpd r4d
>-%endif
>+cglobal copy_cnt_8, 3,3,6
> add r2d, r2d
>- pxor m4, m4
>- lea r3, [r2 * 3]
>-
>- ; row 0
>+ xorpd m5, m5
>+
>+ ; row 0 - 1
> movu xm0, [r1]
>- mova xm2, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 0 * mmsize], m1
>-
>- ; row 1
>- movu xm0, [r1 + r2]
>- vinserti128 m2, m2, xm0, 1
>- pmovsxwd m1, xm0
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 2
>- movu xm0, [r1 + r2 * 2]
>- mova xm5, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 2 * mmsize], m1
>-
>- ; row 3
>- movu xm0, [r1 + r3]
>- vinserti128 m5, m5, xm0, 1
>- packsswb m2, m5
>- pcmpeqb m2, m4
>- pmovmskb tmpd, m2
>- not tmpd
>- popcnt tmpd, tmpd
>- pmovsxwd m1, xm0
>- movu [r0 + 3 * mmsize], m1
>-
>- add r0, 4 * mmsize
>- lea r1, [r1 + r2 * 4]
>-
>- ; row 4
>- movu xm0, [r1]
>- mova xm2, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 0 * mmsize], m1
>-
>- ; row 5
>- movu xm0, [r1 + r2]
>- vinserti128 m2, m2, xm0, 1
>- pmovsxwd m1, xm0
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 6
>- movu xm0, [r1 + r2 * 2]
>- mova xm5, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 2 * mmsize], m1
>-
>- ; row 7
>- movu xm0, [r1 + r3]
>- pmovsxwd m1, xm0
>- movu [r0 + 3 * mmsize], m1
>- vinserti128 m5, m5, xm0, 1
>+ movu xm1, [r1 + r2]
>+ vinserti128 m0, m0, xm1, 1
>+ movu [r0], m0
>+
>+ ; row 2 - 3
>+ movu xm1, [r1 + r2 * 2]
>+ lea r1, [r1 + r2 * 2]
>+ movu xm2, [r1 + r2]
>+ vinserti128 m1, m1, xm2, 1
>+ movu [r0 + 32], m1
>+
>+ ; row 4 - 5
>+ movu xm2, [r1 + r2 * 2]
>+ lea r1, [r1 + r2 * 2]
>+ movu xm3, [r1 + r2]
>+ vinserti128 m2, m2, xm3, 1
>+ movu [r0 + 64], m2
>+
>+ ; row 6 - 7
>+ movu xm3, [r1 + r2 * 2]
>+ lea r1, [r1 + r2 * 2]
>+ movu xm4, [r1 + r2]
>+ vinserti128 m3, m3, xm4, 1
>+ movu [r0 + 96], m3
>
> ; get count
>- packsswb m2, m5
>- pcmpeqb m2, m4
>- pmovmskb r0d, m2
>- not r0d
>- popcnt r0d, r0d
>-
>-%if ARCH_X86_64 == 1
>- add tmpd, r0d
>-%else
>- add r0d, tmpd
>-%endif
>+ vpacksswb m0, m1
>+ vpcmpeqb m0, m5
>+ vpmovmskb eax, m0
>+ not eax
>+ popcnt eax, eax
>+ vpacksswb m2, m3
>+ vpcmpeqb m2, m5
>+ vpmovmskb r1d, m2
>+ not r1d
>+ popcnt r1d, r1d
</pre><pre>>+ add eax, r1d
> RET
>
> </pre><pre> </pre><pre> </pre><pre><pre>popcnt is an expensive instruction; in this case you don't need it,</pre><pre>and the algorithm needs several modifications; see the reference code below:
</pre><pre>INIT_YMM avx2<br>cglobal copy_cnt_8, 3,4,5<br> add r2d, r2d<br> lea r3, [r2 * 3]</pre></pre><pre> ; row 0 - 1<br> movu xm0, [r1]<br> vinserti128 m0, m0, [r1 + r2], 1<br> movu [r0], m0</pre><pre> ; row 2 - 3<br> movu xm1, [r1 + r2 * 2]<br> vinserti128 m1, m1, [r1 + r3], 1<br> movu [r0 + 32], m1<br> lea r1, [r1 + r2 * 4]</pre><pre> ; row 4 - 5<br> movu xm2, [r1]<br> vinserti128 m2, m2, [r1 + r2], 1<br> movu [r0 + 64], m2</pre><pre> ; row 6 - 7<br> movu xm3, [r1 + r2 * 2]<br> vinserti128 m3, m3, [r1 + r3], 1<br> movu [r0 + 96], m3</pre><pre> ; get count<br> xorpd m4, m4<br> vpacksswb m0, m1<br> vpacksswb m2, m3<br> pminub m0, [pb_1]<br> pminub m2, [pb_1]<br> paddb m0, m2<br> vextracti128 xm1, m0, 1<br> paddb xm0, xm1<br> psadbw xm0, xm4<br> movhlps xm1, xm0<br> paddd xm0, xm1<br> movd eax, xm0<br> RET<br></pre></div>