[x265] [PATCH] blockcopy_pp_12x32: SSE2 asm code optimization
chen
chenm003 at 163.com
Thu Feb 5 13:25:34 CET 2015
This code is correct,
but could you try using general-purpose register moves (rN, rNd) in x64 mode?
At 2015-02-05 19:59:24,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1423137543 -19800
># Node ID b10384b8c8a9a60fe37f4e5f3506673dcf00c004
># Parent 499eddf5c1e4dfcb8447d65cb0b48d633b3660a5
>blockcopy_pp_12x32: SSE2 asm code optimization
>
>diff -r 499eddf5c1e4 -r b10384b8c8a9 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Thu Feb 05 16:48:36 2015 +0530
>+++ b/source/common/x86/blockcopy8.asm Thu Feb 05 17:29:03 2015 +0530
>@@ -584,7 +584,55 @@
>
> BLOCKCOPY_PP_W12_H4 12, 16
>
>-BLOCKCOPY_PP_W12_H4 12, 32
>+;-----------------------------------------------------------------------------
>+; void blockcopy_pp_12x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal blockcopy_pp_12x32, 4, 7, 8
>+
>+ lea r5, [3 * r1]
>+ lea r6, [3 * r3]
>+
>+ movh m0, [r2]
>+ movd m1, [r2 + 8]
>+ movh m2, [r2 + r3]
>+ movd m3, [r2 + r3 + 8]
>+ movh m4, [r2 + 2 * r3]
>+ movd m5, [r2 + 2 * r3 + 8]
>+ movh m6, [r2 + r6]
>+ movd m7, [r2 + r6 + 8]
>+
>+ movh [r0], m0
>+ movd [r0 + 8], m1
>+ movh [r0 + r1], m2
>+ movd [r0 + r1 + 8], m3
>+ movh [r0 + 2 * r1], m4
>+ movd [r0 + 2 * r1 + 8], m5
>+ movh [r0 + r5], m6
>+ movd [r0 + r5 + 8], m7
>+
>+ %rep 7
>+ lea r2, [r2 + 4 * r3]
>+ movh m0, [r2]
>+ movd m1, [r2 + 8]
>+ movh m2, [r2 + r3]
>+ movd m3, [r2 + r3 + 8]
>+ movh m4, [r2 + 2 * r3]
>+ movd m5, [r2 + 2 * r3 + 8]
>+ movh m6, [r2 + r6]
>+ movd m7, [r2 + r6 + 8]
>+
>+ lea r0, [r0 + 4 * r1]
>+ movh [r0], m0
>+ movd [r0 + 8], m1
>+ movh [r0 + r1], m2
>+ movd [r0 + r1 + 8], m3
>+ movh [r0 + 2 * r1], m4
>+ movd [r0 + 2 * r1 + 8], m5
>+ movh [r0 + r5], m6
>+ movd [r0 + r5 + 8], m7
>+%endrep
>+ RET
>
> ;-----------------------------------------------------------------------------
> ; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150205/5444d7ab/attachment.html>
More information about the x265-devel
mailing list