<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>
<div></div>
<div id="divNeteaseMailCard"></div>It is right, but we can improve it (see the inline comment below).<br></div><pre>At 2015-03-02 16:47:23,sumalatha@multicorewareinc.com wrote:
># HG changeset patch
># User Sumalatha Polureddy<sumalatha@multicorewareinc.com>
># Date 1425286035 -19800
># Node ID 1be088c8bc675752ebfebc4fda3bad41659269a4
># Parent a9ad4d8202796dfb78e9d180f5fdb7cc0996ea66
>asm: avx2 code for add_ps[8x8] for 10bpp -- 24.9x
>
>add_ps[ 8x8] 24.97x 275.68 6882.88
>
>diff -r a9ad4d820279 -r 1be088c8bc67 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Mon Mar 02 14:10:07 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 14:17:15 2015 +0530
>@@ -1069,6 +1069,8 @@
> }
> if (cpuMask & X265_CPU_AVX2)
> {
>+ p.cu[BLOCK_8x8].add_ps = x265_pixel_add_ps_8x8_avx2;
>+
> p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
> p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
>
>diff -r a9ad4d820279 -r 1be088c8bc67 source/common/x86/pixeladd8.asm
>--- a/source/common/x86/pixeladd8.asm Mon Mar 02 14:10:07 2015 +0530
>+++ b/source/common/x86/pixeladd8.asm Mon Mar 02 14:17:15 2015 +0530
>@@ -229,6 +229,53 @@
>
> jnz .loop
> RET
>+
>+INIT_YMM avx2
>+cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
>+ mova m5, [pw_pixel_max]
>+ pxor m4, m4
>+ mov r6d, %2/4
>+ add r4, r4
>+ add r5, r5
>+ add r1, r1
>+.loop:
>+ movu xm0, [r2] ; row 0 of src0
>+ movu xm1, [r2 + r4] ; row 1 of src0
>+ vinserti128 m0, m0, xm1, 1
>+
>+ movu xm1, [r3] ; row 0 of src1
>+ movu xm2, [r3 + r5] ; row 1 of src1
>+ vinserti128 m1, m1, xm2, 1
>+ lea r2, [r2 + r4 * 2]
>+ lea r3, [r3 + r5 * 2]
>+
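>+ paddw m0, m1<br>Here, we can replace vinsert+vinsert+padd with padd+padd+vinsert (add each pair of rows in xmm registers first, then merge the two sums with a single vinsert). The vinsert uses Port 5, which is the bottleneck on Haswell.<br>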
>+ CLIPW m0, m4, m5
>+ movu [r0], xm0 ; row 0 of dst
>+ vextracti128 xm3, m0, 1
>+ movu [r0 + r1], xm3 ; row 1 of dst
>+ lea r0, [r0 + r1 * 2]
>+
>+ movu xm0, [r2] ; row 2 of src0
>+ movu xm1, [r2 + r4] ; row 3 of src0
>+ vinserti128 m0, m0, xm1, 1
>+
>+ movu xm1, [r3] ; row 2 of src1
>+ movu xm2, [r3 + r5] ; row 3 of src1
>+ vinserti128 m1, m1, xm2, 1
>+ lea r2, [r2 + r4 * 2]
>+ lea r3, [r3 + r5 * 2]
>+
>+ paddw m0, m1
>+ CLIPW m0, m4, m5
>+ movu [r0], xm0 ; row 2 of dst
>+ vextracti128 xm3, m0, 1
>+ movu [r0 + r1], xm3 ; row 3 of dst
>+ lea r0, [r0 + r1 * 2]
>+
>+ dec r6d
>+ jnz .loop
>+ RET
> %else
> INIT_XMM sse4
> cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>