[x265] [PATCH] asm: 10bpp code for blockcopy_pp_12x16
Murugan Vairavel
murugan at multicorewareinc.com
Tue Dec 10 07:38:45 CET 2013
Please ignore this patch; it needs modification.
On Tue, Dec 10, 2013 at 12:03 PM, <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1386657211 -19800
> # Tue Dec 10 12:03:31 2013 +0530
> # Node ID cbeac6eb3e8a26e6fcf35da5a65ff3e2157e46a9
> # Parent d6bb400f0733701f02f65e0f060284df3d77d9b0
> asm: 10bpp code for blockcopy_pp_12x16
>
> diff -r d6bb400f0733 -r cbeac6eb3e8a source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Dec 10 11:50:44 2013
> +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 12:03:31 2013
> +0530
> @@ -677,6 +677,7 @@
> p.chroma[X265_CSP_I420].copy_pp[LUMA_16x16] =
> x265_blockcopy_pp_8x8_sse2;
> p.chroma[X265_CSP_I420].copy_pp[LUMA_16x32] =
> x265_blockcopy_pp_8x16_sse2;
> p.chroma[X265_CSP_I420].copy_pp[LUMA_16x64] =
> x265_blockcopy_pp_8x32_sse2;
> + p.chroma[X265_CSP_I420].copy_pp[LUMA_24x32] =
> x265_blockcopy_pp_12x16_sse2;
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
> diff -r d6bb400f0733 -r cbeac6eb3e8a source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm Tue Dec 10 11:50:44 2013 +0530
> +++ b/source/common/x86/blockcopy8.asm Tue Dec 10 12:03:31 2013 +0530
> @@ -578,46 +578,69 @@
>
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W12_H4 2
> INIT_XMM sse2
> -cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
> -
> -mov r4d, %2
> -
> +cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride ; r0=dest, r1=deststride, r2=src, r3=srcstride
> +
> + mov r4d, %2/4 ; r4d = iteration count; every iteration copies 4 rows
> +%if HIGH_BIT_DEPTH
> + add r1, r1 ; double dest stride (presumably pixel units -> bytes for 2-byte pixels; confirm with caller)
> + add r3, r3 ; double src stride, same reason
> .loop
> - movh m0, [r2]
> - movd m1, [r2 + 8]
> -
> - movh m2, [r2 + r3]
> - movd m3, [r2 + r3 + 8]
> -
> - movh m4, [r2 + 2 * r3]
> - movd m5, [r2 + 2 * r3 + 8]
> -
> - lea r5, [r2 + 2 * r3]
> -
> - movh m6, [r5 + r3]
> - movd m7, [r5 + r3 + 8]
> -
> - movh [r0], m0
> - movd [r0 + 8], m1
> -
> - movh [r0 + r1], m2
> - movd [r0 + r1 + 8], m3
> -
> - movh [r0 + 2 * r1], m4
> - movd [r0 + 2 * r1 + 8], m5
> -
> - lea r6, [r0 + 2 * r1]
> -
> - movh [r6 + r1], m6
> - movd [r6 + r1 + 8], m7
> -
> - lea r0, [r0 + 4 * r1]
> - lea r2, [r2 + 4 * r3]
> -
> - sub r4d, 4
> - jnz .loop
> -
> -RET
> + movu m0, [r2] ; row 0: first 16 bytes
> + movh m1, [r2 + 16] ; row 0: next 8 bytes (24 bytes per row in total)
> + movu m2, [r2 + r3] ; row 1
> + movh m3, [r2 + r3 + 16]
> + lea r2, [r2 + 2 * r3] ; src -> row 2
> +
> + movu [r0], m0
> + movh [r0 + 16], m1
> + movu [r0 + r1], m2
> + movh [r0 + r1 + 16], m3
> +
> + lea r0, [r0 + 2 * r1] ; dest -> row 2
> + movu m0, [r2] ; row 2
> + movh m1, [r2 + 16]
> + movu m2, [r2 + r3] ; row 3
> + movh m3, [r2 + r3 + 16]
> +
> + movu [r0], m0
> + movh [r0 + 16], m1
> + movu [r0 + r1], m2
> + movh [r0 + r1 + 16], m3
> +
> + dec r4d ; sets ZF for the jnz below; the lea instructions in between leave flags untouched
> + lea r0, [r0 + 2 * r1] ; dest -> first row of the next 4-row group
> + lea r2, [r2 + 2 * r3] ; src -> first row of the next 4-row group
> + jnz .loop
> +%else
> +.loop
> + movh m0, [r2] ; row 0: first 8 bytes
> + movd m1, [r2 + 8] ; row 0: next 4 bytes (12 bytes per row in total)
> + movh m2, [r2 + r3] ; row 1
> + movd m3, [r2 + r3 + 8]
> + lea r2, [r2 + 2 * r3] ; src -> row 2
> +
> + movh [r0], m0
> + movd [r0 + 8], m1
> + movh [r0 + r1], m2
> + movd [r0 + r1 + 8], m3
> + lea r0, [r0 + 2 * r1] ; dest -> row 2
> +
> + movh m0, [r2] ; row 2
> + movd m1, [r2 + 8]
> + movh m2, [r2 + r3] ; row 3
> + movd m3, [r2 + r3 + 8]
> +
> + movh [r0], m0
> + movd [r0 + 8], m1
> + movh [r0 + r1], m2
> + movd [r0 + r1 + 8], m3
> +
> + dec r4d ; sets ZF for the jnz below; the lea instructions in between leave flags untouched
> + lea r0, [r0 + 2 * r1] ; only 2 more rows: dest already moved 2 rows mid-loop (4 * r1 would skip rows)
> + lea r2, [r2 + 2 * r3] ; only 2 more rows: src already moved 2 rows mid-loop (4 * r3 would skip rows)
> + jnz .loop
> +%endif
> + RET
> %endmacro
>
> BLOCKCOPY_PP_W12_H4 12, 16
>
--
With Regards,
Murugan. V
+919659287478
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131210/dff066b8/attachment-0001.html>
More information about the x265-devel
mailing list