[x265] [PATCH] asm: 10bpp code for blockcopy_ps_12x16

Murugan Vairavel murugan at multicorewareinc.com
Tue Dec 10 07:38:45 CET 2013


Please ignore this patch; it needs modification.



On Tue, Dec 10, 2013 at 12:03 PM, <murugan at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1386657211 -19800
> #      Tue Dec 10 12:03:31 2013 +0530
> # Node ID cbeac6eb3e8a26e6fcf35da5a65ff3e2157e46a9
> # Parent  d6bb400f0733701f02f65e0f060284df3d77d9b0
> asm: 10bpp code for blockcopy_ps_12x16
>
> diff -r d6bb400f0733 -r cbeac6eb3e8a source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Tue Dec 10 11:50:44 2013
> +0530
> +++ b/source/common/x86/asm-primitives.cpp      Tue Dec 10 12:03:31 2013
> +0530
> @@ -677,6 +677,7 @@
>          p.chroma[X265_CSP_I420].copy_pp[LUMA_16x16] =
> x265_blockcopy_pp_8x8_sse2;
>          p.chroma[X265_CSP_I420].copy_pp[LUMA_16x32] =
> x265_blockcopy_pp_8x16_sse2;
>          p.chroma[X265_CSP_I420].copy_pp[LUMA_16x64] =
> x265_blockcopy_pp_8x32_sse2;
> +        p.chroma[X265_CSP_I420].copy_pp[LUMA_24x32] =
> x265_blockcopy_pp_12x16_sse2;
>      }
>      if (cpuMask & X265_CPU_SSSE3)
>      {
> diff -r d6bb400f0733 -r cbeac6eb3e8a source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm  Tue Dec 10 11:50:44 2013 +0530
> +++ b/source/common/x86/blockcopy8.asm  Tue Dec 10 12:03:31 2013 +0530
> @@ -578,46 +578,69 @@
>
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W12_H4 2
>  INIT_XMM sse2
> -cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
> -
> -mov         r4d,       %2
> -
> +cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride
> +
> +    mov         r4d,       %2/4
> +%if HIGH_BIT_DEPTH
> +    add     r1,    r1
> +    add     r3,    r3
>  .loop
> -      movh     m0,     [r2]
> -      movd     m1,     [r2 + 8]
> -
> -      movh     m2,     [r2 + r3]
> -      movd     m3,     [r2 + r3 + 8]
> -
> -      movh     m4,     [r2 + 2 * r3]
> -      movd     m5,     [r2 + 2 * r3 + 8]
> -
> -      lea      r5,     [r2 + 2 * r3]
> -
> -      movh     m6,     [r5 + r3]
> -      movd     m7,     [r5 + r3 + 8]
> -
> -      movh     [r0],                 m0
> -      movd     [r0 + 8],             m1
> -
> -      movh     [r0 + r1],            m2
> -      movd     [r0 + r1 + 8],        m3
> -
> -      movh     [r0 + 2 * r1],        m4
> -      movd     [r0 + 2 * r1 + 8],    m5
> -
> -      lea      r6,                   [r0 + 2 * r1]
> -
> -      movh     [r6 + r1],            m6
> -      movd     [r6 + r1 + 8],        m7
> -
> -      lea      r0,                   [r0 + 4 * r1]
> -      lea      r2,                   [r2 + 4 * r3]
> -
> -      sub      r4d,                   4
> -      jnz      .loop
> -
> -RET
> +    movu    m0,    [r2]
> +    movh    m1,    [r2 + 16]
> +    movu    m2,    [r2 + r3]
> +    movh    m3,    [r2 + r3 + 16]
> +    lea     r2,    [r2 + 2 * r3]
> +
> +    movu    [r0],              m0
> +    movh    [r0 + 16],         m1
> +    movu    [r0 + r1],         m2
> +    movh    [r0 + r1 + 16],    m3
> +
> +    lea     r0,    [r0 + 2 * r1]
> +    movu    m0,    [r2]
> +    movh    m1,    [r2 + 16]
> +    movu    m2,    [r2 + r3]
> +    movh    m3,    [r2 + r3 + 16]
> +
> +    movu    [r0],              m0
> +    movh    [r0 + 16],         m1
> +    movu    [r0 + r1],         m2
> +    movh    [r0 + r1 + 16],    m3
> +
> +    dec     r4d
> +    lea     r0,    [r0 + 2 * r1]
> +    lea     r2,    [r2 + 2 * r3]
> +    jnz     .loop
> +%else
> +.loop
> +    movh    m0,     [r2]
> +    movd    m1,     [r2 + 8]
> +    movh    m2,     [r2 + r3]
> +    movd    m3,     [r2 + r3 + 8]
> +    lea     r2,     [r2 + 2 * r3]
> +
> +    movh    [r0],             m0
> +    movd    [r0 + 8],         m1
> +    movh    [r0 + r1],        m2
> +    movd    [r0 + r1 + 8],    m3
> +    lea     r0,               [r0 + 2 * r1]
> +
> +    movh    m0,     [r2]
> +    movd    m1,     [r2 + 8]
> +    movh    m2,     [r2 + r3]
> +    movd    m3,     [r2 + r3 + 8]
> +
> +    movh    [r0],             m0
> +    movd    [r0 + 8],         m1
> +    movh    [r0 + r1],        m2
> +    movd    [r0 + r1 + 8],    m3
> +
> +    dec     r4d
> +    lea     r0,               [r0 + 4 * r1]
> +    lea     r2,               [r2 + 4 * r3]
> +    jnz     .loop
> +%endif
> +    RET
>  %endmacro
>
>  BLOCKCOPY_PP_W12_H4 12, 16
>



-- 
With Regards,

Murugan. V
+919659287478
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131210/dff066b8/attachment-0001.html>


More information about the x265-devel mailing list