[x265] [PATCH] asm: improve avx2 code sub_ps[32x32] 1402 -> 1360
Sumalatha Polureddy
sumalatha at multicorewareinc.com
Mon Apr 13 07:47:37 CEST 2015
This patch has not been pushed yet.
Regards,
Sumalatha
On Wed, Apr 8, 2015 at 3:44 PM, chen <chenm003 at 163.com> wrote:
> right
>
>
> At 2015-04-08 17:58:35,sumalatha at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Sumalatha Polureddy
> ># Date 1428486008 -19800
> ># Wed Apr 08 15:10:08 2015 +0530
> ># Node ID 4819d554dbbc63e6881bd8eee9d61a93320197f2
> ># Parent 3e416dec8024b8339b18568cf65e48eb3448bed1
> >asm: improve avx2 code sub_ps[32x32] 1402 -> 1360
> >
> >diff -r 3e416dec8024 -r 4819d554dbbc source/common/x86/pixel-util8.asm
> >--- a/source/common/x86/pixel-util8.asm Tue Apr 07 16:00:39 2015 -0500
> >+++ b/source/common/x86/pixel-util8.asm Wed Apr 08 15:10:08 2015 +0530
> >@@ -4686,10 +4686,14 @@
> > ;-----------------------------------------------------------------------------
> > ; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
> > ;-----------------------------------------------------------------------------
> >+%if ARCH_X86_64
> > INIT_YMM avx2
> >-cglobal pixel_sub_ps_32x32, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1
> >- mov r6d, 4
> >- add r1, r1
> >+cglobal pixel_sub_ps_32x32, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
> >+ mov r6d, 4
> >+ add r1, r1
> >+ lea r7, [r4 * 3]
> >+ lea r8, [r5 * 3]
> >+ lea r9, [r1 * 3]
> >
> > .loop:
> > pmovzxbw m0, [r2]
> >@@ -4714,8 +4718,43 @@
> > movu [r0 + r1], m0
> > movu [r0 + r1 + 32], m1
> >
> >- add r2, r4
> >- add r3, r5
> >+ pmovzxbw m0, [r2 + 2 * r4]
> >+ pmovzxbw m1, [r2 + 2 * r4 + 16]
> >+ pmovzxbw m2, [r3 + 2 * r5]
> >+ pmovzxbw m3, [r3 + 2 * r5 + 16]
> >+
> >+ psubw m0, m2
> >+ psubw m1, m3
> >+
> >+ movu [r0 + r1 * 2 ], m0
> >+ movu [r0 + r1 * 2 + 32], m1
> >+
> >+ pmovzxbw m0, [r2 + r7]
> >+ pmovzxbw m1, [r2 + r7 + 16]
> >+ pmovzxbw m2, [r3 + r8]
> >+ pmovzxbw m3, [r3 + r8 + 16]
> >+
> >+
> >+ psubw m0, m2
> >+ psubw m1, m3
> >+
> >+ movu [r0 + r9], m0
> >+ movu [r0 + r9 +32], m1
> >+
> >+ lea r2, [r2 + r4 * 4]
> >+ lea r3, [r3 + r5 * 4]
> >+ lea r0, [r0 + r1 * 4]
> >+
> >+ pmovzxbw m0, [r2]
> >+ pmovzxbw m1, [r2 + 16]
> >+ pmovzxbw m2, [r3]
> >+ pmovzxbw m3, [r3 + 16]
> >+
> >+ psubw m0, m2
> >+ psubw m1, m3
> >+
> >+ movu [r0 ], m0
> >+ movu [r0 + 32], m1
> >
> > pmovzxbw m0, [r2 + r4]
> > pmovzxbw m1, [r2 + r4 + 16]
> >@@ -4724,94 +4763,40 @@
> >
> > psubw m0, m2
> > psubw m1, m3
> >- lea r0, [r0 + r1 * 2]
> >-
> >- movu [r0 ], m0
> >- movu [r0 + 32], m1
> >-
> >- add r2, r4
> >- add r3, r5
> >-
> >- pmovzxbw m0, [r2 + r4]
> >- pmovzxbw m1, [r2 + r4 + 16]
> >- pmovzxbw m2, [r3 + r5]
> >- pmovzxbw m3, [r3 + r5 + 16]
> >-
> >+
> >+ movu [r0 + r1], m0
> >+ movu [r0 + r1 + 32], m1
> >+
> >+ pmovzxbw m0, [r2 + 2 * r4]
> >+ pmovzxbw m1, [r2 + 2 * r4 + 16]
> >+ pmovzxbw m2, [r3 + 2 * r5]
> >+ pmovzxbw m3, [r3 + 2 * r5 + 16]
> >
> > psubw m0, m2
> > psubw m1, m3
> >- add r0, r1
> >-
> >- movu [r0 ], m0
> >- movu [r0 + 32], m1
> >-
> >- add r2, r4
> >- add r3, r5
> >-
> >- pmovzxbw m0, [r2 + r4]
> >- pmovzxbw m1, [r2 + r4 + 16]
> >- pmovzxbw m2, [r3 + r5]
> >- pmovzxbw m3, [r3 + r5 + 16]
> >+
> >+ movu [r0 + r1 * 2], m0
> >+ movu [r0 + r1 * 2 + 32], m1
> >+
> >+ pmovzxbw m0, [r2 + r7]
> >+ pmovzxbw m1, [r2 + r7 + 16]
> >+ pmovzxbw m2, [r3 + r8]
> >+ pmovzxbw m3, [r3 + r8 + 16]
> >
> > psubw m0, m2
> > psubw m1, m3
> >- add r0, r1
> >-
> >- movu [r0 ], m0
> >- movu [r0 + 32], m1
> >-
> >- add r2, r4
> >- add r3, r5
> >-
> >- pmovzxbw m0, [r2 + r4]
> >- pmovzxbw m1, [r2 + r4 + 16]
> >- pmovzxbw m2, [r3 + r5]
> >- pmovzxbw m3, [r3 + r5 + 16]
> >-
> >- psubw m0, m2
> >- psubw m1, m3
> >- add r0, r1
> >-
> >- movu [r0 ], m0
> >- movu [r0 + 32], m1
> >-
> >- add r2, r4
> >- add r3, r5
> >-
> >- pmovzxbw m0, [r2 + r4]
> >- pmovzxbw m1, [r2 + r4 + 16]
> >- pmovzxbw m2, [r3 + r5]
> >- pmovzxbw m3, [r3 + r5 + 16]
> >-
> >- psubw m0, m2
> >- psubw m1, m3
> >- add r0, r1
> >-
> >- movu [r0 ], m0
> >- movu [r0 + 32], m1
> >-
> >- add r2, r4
> >- add r3, r5
> >-
> >- pmovzxbw m0, [r2 + r4]
> >- pmovzxbw m1, [r2 + r4 + 16]
> >- pmovzxbw m2, [r3 + r5]
> >- pmovzxbw m3, [r3 + r5 + 16]
> >-
> >- psubw m0, m2
> >- psubw m1, m3
> >- add r0, r1
> >-
> >- movu [r0 ], m0
> >- movu [r0 + 32], m1
> >-
> >- lea r0, [r0 + r1]
> >- lea r2, [r2 + r4 * 2]
> >- lea r3, [r3 + r5 * 2]
> >+
> >+ movu [r0 + r9], m0
> >+ movu [r0 + r9 + 32], m1
> >+
> >+ lea r0, [r0 + r1 * 4]
> >+ lea r2, [r2 + r4 * 4]
> >+ lea r3, [r3 + r5 * 4]
> >
> > dec r6d
> > jnz .loop
> > RET
> >+%endif
> >
> > ;-----------------------------------------------------------------------------
> > ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150413/bdbaabeb/attachment.html>
More information about the x265-devel
mailing list