<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>This is a long code sequence; have you compared the performance of a loop against this fully unrolled version?</div><pre><br>At 2014-09-30 13:41:03,praveen@multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1412055614 -19800
># Node ID fb46cfd8ee611339da6af16f3c0de426eca3628f
># Parent 975078c41433328fa35913ad46a95cd9c78a8bb2
>blockfill_s_32x32 avx2 asm code: performance improved from 1354.05 cycles to 705.81 cycles, over sse version of asm code
>
>diff -r 975078c41433 -r fb46cfd8ee61 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Sep 30 10:57:12 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Sep 30 11:10:14 2014 +0530
>@@ -1761,6 +1761,7 @@
> p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
>
> p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
>+ p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
>
> p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
> p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
>diff -r 975078c41433 -r fb46cfd8ee61 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Tue Sep 30 10:57:12 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Tue Sep 30 11:10:14 2014 +0530
>@@ -1902,7 +1902,85 @@
>
> BLOCKFILL_S_W32_H4 32, 32
>
>-
>+INIT_YMM avx2
>+cglobal blockfill_s_32x32, 3, 4, 1
>+add r1, r1
>+lea r3, [3 * r1]
>+movd xm0, r2d
>+vpbroadcastw m0, xm0
>+
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+lea r0, [r0 + 4 * r1]
>+movu [r0], m0
>+movu [r0 + 32], m0
>+movu [r0 + r1], m0
>+movu [r0 + r1 + 32], m0
>+movu [r0 + 2 * r1], m0
>+movu [r0 + 2 * r1 + 32], m0
>+movu [r0 + r3], m0
>+movu [r0 + r3 + 32], m0
>+RET
>
> ;-----------------------------------------------------------------------------
> ; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
>diff -r 975078c41433 -r fb46cfd8ee61 source/common/x86/blockcopy8.h
>--- a/source/common/x86/blockcopy8.h Tue Sep 30 10:57:12 2014 +0530
>+++ b/source/common/x86/blockcopy8.h Tue Sep 30 11:10:14 2014 +0530
>@@ -202,6 +202,7 @@
> void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
>
> void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
>+void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val);
>
> #undef BLOCKCOPY_COMMON
> #undef BLOCKCOPY_SS_PP
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>