[x265] [PATCH] Luma_hpp[16x16] avx2 asm code : improved 2307c->1695c
Aasaipriya Chandran
aasaipriya at multicorewareinc.com
Thu Nov 13 06:13:26 CET 2014
Hello Chen,
Thanks for the comments. I will make the necessary changes and resend the
patch.
Regards,
Aasaipriya C
On Wed, Nov 12, 2014 at 10:16 PM, chen <chenm003 at 163.com> wrote:
>
>
>
>
> At 2014-11-12 18:56:45,aasaipriya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Aasaipriya
> ># Date 1415788047 -19800
> ># Wed Nov 12 15:57:27 2014 +0530
> ># Node ID f0a17f2b4c22ff8aa05afb40e3360e5ac03590a6
> ># Parent 98fb658f3229ab10e808204c265a12e18d71638e
> >Luma_hpp[16x16] avx2 asm code : improved 2307c->1695c
> >
> >diff -r 98fb658f3229 -r f0a17f2b4c22 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp Tue Nov 11 19:30:19 2014 +0900
> >+++ b/source/common/x86/asm-primitives.cpp Wed Nov 12 15:57:27 2014 +0530
> >@@ -1799,6 +1799,7 @@
> > p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
> > #endif
> > p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
> >+ p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
> > p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
> > }
> > #endif // if HIGH_BIT_DEPTH
> >diff -r 98fb658f3229 -r f0a17f2b4c22 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm Tue Nov 11 19:30:19 2014 +0900
> >+++ b/source/common/x86/ipfilter8.asm Wed Nov 12 15:57:27 2014 +0530
> >@@ -854,6 +854,58 @@
> > RET
> >
> >
> >+INIT_YMM avx2
> >+cglobal interp_8tap_horiz_pp_16x16, 4,6,8
> >+ sub r0, 3
> >+ mov r4d, r4m
> >+%ifdef PIC
> >+ lea r5, [tab_LumaCoeff]
> >+ vpbroadcastd m0, [r5 + r4 * 8]
> >+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
> >+%else
> >+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
> >+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
> >+%endif
> >+ mova m2, [tab_Tm]
> >+ movu m3, [tab_Tm + 16]
> >+ vpbroadcastd m7, [pw_1]
> >+
> >+ ; register map
> >+ ; m0 , m1 interpolate coeff
> > >+ ; m2 , m3 shuffle order table
> >+ ; m7 - pw_1
> >+
> >+ mov r4d, 16
> >+.loop:
> >+ ; Row 0
> >+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
> >+ pshufb m5, m4, m3
> >+ pshufb m4, m2
> >+ pmaddubsw m4, m0
> >+ pmaddubsw m5, m1
> >+ paddw m4, m5
> >+ pmaddwd m4, m7
> >+ vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0
> >+ pshufb m6, m5, m3
> >+ pshufb m5, m2
> >+ pmaddubsw m5, m0
> >+ pmaddubsw m6, m1
> >+ paddw m5, m6
> >+ pmaddwd m5, m7
> >+ packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
> >+ pmulhrsw m4, [pw_512]
>
> >+ packuswb m4, m4
> >+ vpermq m4, m4, 11011000b
> >+ pshufd xm4, xm4, 11011000b
> >+ movq [r2], xm4
> >+ movhps [r2 + 8], xm4
>
> Why split the store into two write instructions? movq [r2] plus movhps [r2 + 8] writes the same 16 bytes as a single movu [r2], xm4.
>
>
>
> >+ lea r0, [r0 + r1]
> >+ lea r2, [r2 + r3]
>
> Use add r0, r1 here instead of lea.
>
> >+ dec r4d
> >+ jnz .loop
> >+ RET
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141113/bba1c1ac/attachment.html>
More information about the x265-devel
mailing list