<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>Right.<br></div><pre><br>At 2014-09-18 19:32:16, praveen@multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1411039927 -19800
># Node ID 2fae54f872d000b308c2122295bc8df7320ef135
># Parent 532f798f98d7c7f5c493a819046a45e29b2da16a
>copy_cnt_32: avx2 asm code, improved 1521.17 cycles -> 934.46 cycles
>
>diff -r 532f798f98d7 -r 2fae54f872d0 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Thu Sep 18 16:37:55 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp Thu Sep 18 17:02:07 2014 +0530
>@@ -1731,7 +1731,7 @@
>
> p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
> p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
>- // p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
>+ p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
>
>
> p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
>diff -r 532f798f98d7 -r 2fae54f872d0 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Thu Sep 18 16:37:55 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Thu Sep 18 17:02:07 2014 +0530
>@@ -4268,42 +4268,47 @@
>
>
> INIT_YMM avx2
>-cglobal copy_cnt_32, 3,4,5
>+cglobal copy_cnt_32, 3, 5, 5
> add r2d, r2d
>- mov r3d, 32/1
>- xorpd m3, m3
>+ mov r3d, 32/2
>+
>+ mova m3, [pb_1]
> xorpd m4, m4
>
>-.loop
>+.loop:
> ; row 0
>- movu m0, [r1 + 0 * mmsize]
>- movu m1, [r1 + 1 * mmsize]
>- packsswb m2, m0, m1
>- pcmpeqb m2, m4
>- paddb m3, m2
>-
>- pmovsxwd m2, xm0
>- pmovsxwd m0, [r1 + 0 * mmsize + mmsize/2]
>- movu [r0 + 0 * mmsize], m2
>- movu [r0 + 1 * mmsize], m0
>- pmovsxwd m0, xm1
>- pmovsxwd m1, [r1 + 1 * mmsize + mmsize/2]
>- movu [r0 + 2 * mmsize], m0
>- movu [r0 + 3 * mmsize], m1
>-
>- add r0, 4 * mmsize
>- add r1, r2
>+ movu m0, [r1]
>+ movu [r0], m0
>+ movu m1, [r1 + 32]
>+ movu [r0 + 32], m1
>+
>+ packsswb m0, m1
>+ pminub m0, m3
>+
>+ ; row 1
>+ movu m1, [r1 + r2]
>+ movu [r0 + 64], m1
>+ movu m2, [r1 + r2 + 32]
>+ movu [r0 + 96], m2
>+
>+ packsswb m1, m2
>+ pminub m1, m3
>+ paddb m0, m1
>+ paddb m4, m0
>+
>+ add r0, 128
>+ lea r1, [r1 + 2 * r2]
> dec r3d
>- jnz .loop
>+ jnz .loop
>
> ; get count
>- vextracti128 xm0, m3, 1
>- paddb xm0, xm3
>- movhlps xm3, xm0
>- paddb xm0, xm3
>- paddb xm0, [pb_128]
>- psadbw xm0, xm4
>- movd eax, xm0
>+ xorpd m0, m0
>+ vextracti128 xm1, m4, 1
>+ paddb xm4, xm1
>+ psadbw xm4, xm0
>+ movhlps xm1, xm4
>+ paddd xm4, xm1
>+ movd eax, xm4
> RET
>
> ;-----------------------------------------------------------------------------
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>