<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div> </div><pre><br>At 2015-03-10 22:05:25,rajesh@multicorewareinc.com wrote:
># HG changeset patch
># User Rajesh Paulraj<rajesh@multicorewareinc.com>
># Date 1425996251 -19800
># Tue Mar 10 19:34:11 2015 +0530
># Node ID fdfd37fe64245837628ae0445749811a281e3aae
># Parent 2dc6b50681ccc8b3a5123ea02728786de9aca7a4
>asm: avx2 8bpp code for filter_p2s[4x4](2.26x), filter_p2s[4x8](3.01x),
> filter_p2s[4x16](3.00x)
>
>diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Mar 10 18:41:56 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Mar 10 19:34:11 2015 +0530
>@@ -1479,6 +1479,10 @@
> p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
> p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
>
>+ p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_avx2;
>+ p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_avx2;
>+ p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_avx2;
>+
> p.scale1D_128to64 = x265_scale1D_128to64_avx2;
> p.weight_pp = x265_weight_pp_avx2;
>
>diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Tue Mar 10 18:41:56 2015 +0530
>+++ b/source/common/x86/ipfilter8.asm Tue Mar 10 19:34:11 2015 +0530
>@@ -6027,6 +6027,51 @@
> PIXEL_WH_4xN 4, 4
> PIXEL_WH_4xN 4, 8
> PIXEL_WH_4xN 4, 16
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+%macro P2S_H_4xN_avx2 1
>+INIT_YMM avx2
>+cglobal pixelToShort_4x%1, 3, 5, 6
>+
>+ ; load height
>+ mov r3d, %1
>+
>+ ; load constant
>+ vbroadcasti128 m4, [pb_128]
>+ vbroadcasti128 m5, [tab_c_64_n64]
>+
>+%rep %1 / 4
>+ movd xm0, [r0]
>+ movd xm1, [r0 + r1]
>+ movd xm2, [r0 + r1 * 2]
>+ lea r4, [r1 * 3]
>+ movd xm3, [r0 + r4]
>+
>+ punpckldq m0, m0, m1
>+ punpckldq m2, m2 , m3
>+ punpcklbw m0, m4
>+ punpcklbw m2, m4
>+ vinserti128 m2, m0, xm2, 1
>+ pmaddubsw m2,m5
>+
>+ movq [r2 + FENC_STRIDE * 0], xm2
>+ movhps [r2 + FENC_STRIDE * 2], xm2
>+ vextracti128 xm2, m2, 1
</pre><pre>The vinserti128 + pmaddubsw + vextracti128 sequence is slower than simply issuing two pmaddubsw instructions, so I can't see any improvement from this code.</pre><pre>>+ movq [r2 + FENC_STRIDE * 4], xm2
>+ movhps [r2 + FENC_STRIDE * 6], xm2
>+
>+ lea r0, [r0 + r1 * 4]
>+ add r2, FENC_STRIDE * 8
>+
>+%endrep
>+ RET
>+%endmacro
>+P2S_H_4xN_avx2 4
>+P2S_H_4xN_avx2 8
>+P2S_H_4xN_avx2 16
>+
> ;-----------------------------------------------------------------------------
> ; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
>@@ -6528,7 +6573,7 @@
> movu [r2 + FENC_STRIDE * 6 + 64], m7
> movu [r2 + FENC_STRIDE * 6 + 80], m6
>
>- lea r0, [r0 + r1 * 4]
>+ lea r0, [r0 + r1 * 4]
> add r2, FENC_STRIDE * 8
> %endrep
> RET
>diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h Tue Mar 10 18:41:56 2015 +0530
>+++ b/source/common/x86/ipfilter8.h Tue Mar 10 19:34:11 2015 +0530
>@@ -646,6 +646,10 @@
> void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
>
>+void x265_pixelToShort_4x4_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_4x8_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_4x16_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
>+
> #undef LUMA_FILTERS
> #undef LUMA_SP_FILTERS
> #undef LUMA_SS_FILTERS
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>