[x265] [PATCH] blockcopy_pp_6x8 sse2 asm code optimization
Praveen Tiwari
praveen at multicorewareinc.com
Tue Feb 3 12:17:29 CET 2015
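I tried maskmovdqu; the result is:
[i420] copy_pp[ 6x8] 0.79x 670.04 527.63
That is even slower than the C code, and it is also high-maintenance code:
maskmovdqu stores through the implicit rdi pointer, and since we have mapped
rdi differently on each platform we would need to maintain separate code for
each platform.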
Regards,
Praveen
On Tue, Feb 3, 2015 at 2:57 PM, chen <chenm003 at 163.com> wrote:
> How about maskmovdqu?
>
> At 2015-02-03 17:00:54,praveen at multicorewareinc.com wrote:
>
> ># HG changeset patch
> ># User Praveen Tiwari
> ># Date 1422954042 -19800
> ># Node ID d212ce9fa3705b9e7d4d23f14412bd28fe3bbfde
> ># Parent 059892f65db3e4c70017241ea847717e11be0124
> >blockcopy_pp_6x8 sse2 asm code optimization
> >
> >improved, 248.67c -> 212.56c
> >
> >diff -r 059892f65db3 -r d212ce9fa370 source/common/x86/blockcopy8.asm
> >--- a/source/common/x86/blockcopy8.asm Tue Feb 03 11:58:18 2015 +0530
> >+++ b/source/common/x86/blockcopy8.asm Tue Feb 03 14:30:42 2015 +0530
> >@@ -224,65 +224,51 @@
>
> > ; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse2
> >-cglobal blockcopy_pp_6x8, 4, 7, 8
> >-
> >- movd m0, [r2]
> >- movd m1, [r2 + r3]
> >- movd m2, [r2 + 2 * r3]
> >- lea r5, [r2 + 2 * r3]
> >- movd m3, [r5 + r3]
> >-
> >- movd m4, [r5 + 2 * r3]
> >- lea r5, [r5 + 2 * r3]
> >- movd m5, [r5 + r3]
> >- movd m6, [r5 + 2 * r3]
> >- lea r5, [r5 + 2 * r3]
> >- movd m7, [r5 + r3]
> >-
> >- movd [r0], m0
> >- movd [r0 + r1], m1
> >- movd [r0 + 2 * r1], m2
> >- lea r6, [r0 + 2 * r1]
> >- movd [r6 + r1], m3
> >-
> >- movd [r6 + 2 * r1], m4
> >- lea r6, [r6 + 2 * r1]
> >- movd [r6 + r1], m5
> >- movd [r6 + 2 * r1], m6
> >- lea r6, [r6 + 2 * r1]
> >- movd [r6 + r1], m7
> >-
> >- mov r4w, [r2 + 4]
> >- mov r5w, [r2 + r3 + 4]
> >- mov r6w, [r2 + 2 * r3 + 4]
> >-
> >- mov [r0 + 4], r4w
> >- mov [r0 + r1 + 4], r5w
> >- mov [r0 + 2 * r1 + 4], r6w
> >-
> >- lea r0, [r0 + 2 * r1]
> >- lea r2, [r2 + 2 * r3]
> >-
> >- mov r4w, [r2 + r3 + 4]
> >- mov r5w, [r2 + 2 * r3 + 4]
> >-
> >- mov [r0 + r1 + 4], r4w
> >- mov [r0 + 2 * r1 + 4], r5w
> >-
> >- lea r0, [r0 + 2 * r1]
> >- lea r2, [r2 + 2 * r3]
> >-
> >- mov r4w, [r2 + r3 + 4]
> >- mov r5w, [r2 + 2 * r3 + 4]
> >-
> >- mov [r0 + r1 + 4], r4w
> >- mov [r0 + 2 * r1 + 4], r5w
> >-
> >- lea r0, [r0 + 2 * r1]
> >- lea r2, [r2 + 2 * r3]
> >-
> >- mov r4w, [r2 + r3 + 4]
> >- mov [r0 + r1 + 4], r4w
> >+cglobal blockcopy_pp_6x8, 4, 7, 3
> >+
> >+ movd m0, [r2]
> >+ mov r4w, [r2 + 4]
> >+ movd m1, [r2 + r3]
> >+ mov r5w, [r2 + r3 + 4]
> >+ movd m2, [r2 + 2 * r3]
> >+ mov r6w, [r2 + 2 * r3 + 4]
> >+
> >+ movd [r0], m0
> >+ mov [r0 + 4], r4w
> >+ movd [r0 + r1], m1
> >+ mov [r0 + r1 + 4], r5w
> >+ movd [r0 + 2 * r1], m2
> >+ mov [r0 + 2 * r1 + 4], r6w
> >+
> >+ lea r2, [r2 + 2 * r3]
> >+ movd m0, [r2 + r3]
> >+ mov r4w, [r2 + r3 + 4]
> >+ movd m1, [r2 + 2 * r3]
> >+ mov r5w, [r2 + 2 * r3 + 4]
> >+ lea r2, [r2 + 2 * r3]
> >+ movd m2, [r2 + r3]
> >+ mov r6w, [r2 + r3 + 4]
> >+
> >+ lea r0, [r0 + 2 * r1]
> >+ movd [r0 + r1], m0
> >+ mov [r0 + r1 + 4], r4w
> >+ movd [r0 + 2 * r1], m1
> >+ mov [r0 + 2 * r1 + 4], r5w
> >+ lea r0, [r0 + 2 * r1]
> >+ movd [r0 + r1], m2
> >+ mov [r0 + r1 + 4], r6w
> >+
> >+ lea r2, [r2 + 2 * r3]
> >+ movd m0, [r2]
> >+ mov r4w, [r2 + 4]
> >+ movd m1, [r2 + r3]
> >+ mov r5w, [r2 + r3 + 4]
> >+
> >+ lea r0, [r0 + 2 * r1]
> >+ movd [r0], m0
> >+ mov [r0 + 4], r4w
> >+ movd [r0 + r1], m1
> >+ mov [r0 + r1 + 4], r5w
> > RET
> >
>
> > ;-----------------------------------------------------------------------------
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150203/0532d94f/attachment-0001.html>
More information about the x265-devel mailing list