<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>right</div><div> </div><pre><br>At 2014-09-18 19:08:23,praveen@multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1411038475 -19800
># Node ID 532f798f98d7c7f5c493a819046a45e29b2da16a
># Parent e723ecc1e5c99c451cbc8034514b9dc590a2d4ef
>copy_cnt_16: avx2 asm code, improved 514.32 cycles -> 313.66 cycles
>
>diff -r e723ecc1e5c9 -r 532f798f98d7 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Thu Sep 18 15:30:18 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp Thu Sep 18 16:37:55 2014 +0530
>@@ -1730,7 +1730,7 @@
> * code is updated, avx2 version will be enabled */
>
> p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
>- // p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
>+ p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
> // p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
>
>
>diff -r e723ecc1e5c9 -r 532f798f98d7 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Thu Sep 18 15:30:18 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Thu Sep 18 16:37:55 2014 +0530
>@@ -4159,69 +4159,48 @@
>
>
> INIT_YMM avx2
>-cglobal copy_cnt_16, 3,5,5
>+cglobal copy_cnt_16, 3, 5, 5
> add r2d, r2d
>- lea r4, [r2 * 3]
>- mov r3d, 16/4
>- ; NOTE: xorpd is faster than pxor
>+ lea r3, [r2 * 3]
>+ mov r4d, 16/4
>+
>+ mova m3, [pb_1]
> xorpd m4, m4
>- xorpd m3, m3
>-
>-.loop
>- ; row 0
>+
>+.loop:
>+ ; row 0 - 1
> movu m0, [r1]
>- movu xm1, [r1 + mmsize/2]
>- pmovsxwd m2, xm0
>- pmovsxwd m1, xm1
>- movu [r0 + 0 * mmsize], m2
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 1
>+ movu [r0], m0
> movu m1, [r1 + r2]
>- movu xm2, [r1 + r2 + mmsize/2]
>+ movu [r0 + 32], m1
>+
> packsswb m0, m1
>- pcmpeqb m0, m3
>+ pminub m0, m3
>+
>+ ; row 2 - 3
>+ movu m1, [r1 + r2 * 2]
>+ movu [r0 + 64], m1
>+ movu m2, [r1 + r3]
>+ movu [r0 + 96], m2
>+
>+ packsswb m1, m2
>+ pminub m1, m3
>+ paddb m0, m1
> paddb m4, m0
>- pmovsxwd m1, xm1
>- pmovsxwd m2, xm2
>- movu [r0 + 2 * mmsize], m1
>- movu [r0 + 3 * mmsize], m2
>-
>- ; move output pointer here to avoid 128 bytes offset limit
>- add r0, 4 * mmsize
>-
>- ; row 2
>- movu m0, [r1 + r2 * 2]
>- movu xm1, [r1 + r2 * 2 + mmsize/2]
>- pmovsxwd m2, xm0
>- pmovsxwd m1, xm1
>- movu [r0 + 0 * mmsize], m2
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 3
>- movu m1, [r1 + r4]
>- movu xm2, [r1 + r4 + mmsize/2]
>- packsswb m0, m1
>- pcmpeqb m0, m3
>- paddb m4, m0
>- pmovsxwd m1, xm1
>- pmovsxwd m2, xm2
>- movu [r0 + 2 * mmsize], m1
>- movu [r0 + 3 * mmsize], m2
>-
>- add r0, 4 * mmsize
>- lea r1, [r1 + r2 * 4]
>- dec r3d
>- jnz .loop
>+
>+ add r0, 128
>+ lea r1, [r1 + 4 * r2]
>+ dec r4d
>+ jnz .loop
>
> ; get count
>- vextracti128 xm0, m4, 1
>- paddb xm0, xm4
>- movhlps xm1, xm0
>- paddb xm0, xm1
>- paddb xm0, [pb_32]
>- psadbw xm0, xm3
>- movd eax, xm0
>+ xorpd m0, m0
>+ vextracti128 xm1, m4, 1
>+ paddb xm4, xm1
>+ psadbw xm4, xm0
>+ movhlps xm1, xm4
>+ paddd xm4, xm1
>+ movd eax, xm4
> RET
>
> ;--------------------------------------------------------------------------------------
>diff -r e723ecc1e5c9 -r 532f798f98d7 source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Thu Sep 18 15:30:18 2014 +0530
>+++ b/source/common/x86/const-a.asm Thu Sep 18 16:37:55 2014 +0530
>@@ -29,6 +29,8 @@
>
> SECTION_RODATA 32
>
>+const pb_1, times 32 db 1
>+
> const hsub_mul, times 16 db 1, -1
> const pw_1, times 16 dw 1
> const pw_16, times 16 dw 16
>@@ -53,7 +55,6 @@
> const pb_64, times 16 db 64
> const pb_01, times 8 db 0,1
> const pb_0, times 16 db 0
>-const pb_1, times 32 db 1
> const pb_a1, times 16 db 0xa1
> const pb_3, times 16 db 3
> const pb_8, times 16 db 8
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>