[x265] [PATCH] asm code for blockcopy_ps_64xN

Steve Borho steve at borho.org
Tue Nov 12 02:46:53 CET 2013


On Mon, Nov 11, 2013 at 9:52 AM, <praveen at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari
> # Date 1384185158 -19800
> # Node ID 972a9a01d0b440c919becc8ec17e7187522a2e68
> # Parent  b83d45863ceb3f88da420646a3789fb787043f6e
> asm code for blockcopy_ps_64xN
>

These are all nice, but the encoder is currently ignoring them all.

The next task should be to change all the TComYuv copy methods that take
width and height to instead take a LUMA or CHROMA partition enum (int),
change those functions to use our new block-based primitives, and then
change all callers to pass in the enum — being careful not to call
partitionFromSizes() more often than minimally necessary.

--
Steve

>
> diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Mon Nov 11 21:06:11 2013
> +0530
> +++ b/source/common/x86/asm-primitives.cpp      Mon Nov 11 21:22:38 2013
> +0530
> @@ -459,6 +459,10 @@
>          p.luma_copy_ps[LUMA_16x64] = x265_blockcopy_ps_16x64_sse4;
>          p.luma_copy_ps[LUMA_32x64] = x265_blockcopy_ps_32x64_sse4;
>          p.luma_copy_ps[LUMA_48x64] = x265_blockcopy_ps_48x64_sse4;
> +        p.luma_copy_ps[LUMA_64x16] = x265_blockcopy_ps_64x16_sse4;
> +        p.luma_copy_ps[LUMA_64x32] = x265_blockcopy_ps_64x32_sse4;
> +        p.luma_copy_ps[LUMA_64x48] = x265_blockcopy_ps_64x48_sse4;
> +        p.luma_copy_ps[LUMA_64x64] = x265_blockcopy_ps_64x64_sse4;
>      }
>      if (cpuMask & X265_CPU_AVX)
>      {
> diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm  Mon Nov 11 21:06:11 2013 +0530
> +++ b/source/common/x86/blockcopy8.asm  Mon Nov 11 21:22:38 2013 +0530
> @@ -2286,3 +2286,77 @@
>  %endmacro
>
>  BLOCKCOPY_PS_W48_H2 48, 64
> +
>
> +;-----------------------------------------------------------------------------
> +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src,
> intptr_t srcStride);
>
> +;-----------------------------------------------------------------------------
> +%macro BLOCKCOPY_PS_W64_H2 2
> +INIT_XMM sse4
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +
> +add        r1,      r1
> +mov        r4d,     %2/2
> +pxor       m0,      m0
> +
> +.loop
> +      movu       m1,             [r2]
> +      pmovzxbw   m2,             m1
> +      movu       [r0],           m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + 16],      m1
> +
> +      movu       m1,             [r2 + 16]
> +      pmovzxbw   m2,             m1
> +      movu       [r0 + 32],      m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + 48],      m1
> +
> +      movu       m1,             [r2 + 32]
> +      pmovzxbw   m2,             m1
> +      movu       [r0 + 64],      m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + 80],      m1
> +
> +      movu       m1,             [r2 + 48]
> +      pmovzxbw   m2,             m1
> +      movu       [r0 + 96],      m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + 112],     m1
> +
> +      movu       m1,             [r2 + r3]
> +      pmovzxbw   m2,             m1
> +      movu       [r0 + r1],      m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + r1 + 16], m1
> +
> +      movu       m1,             [r2 + r3 + 16]
> +      pmovzxbw   m2,             m1
> +      movu       [r0 + r1 + 32], m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + r1 + 48], m1
> +
> +      movu       m1,             [r2 + r3 + 32]
> +      pmovzxbw   m2,             m1
> +      movu       [r0 + r1 + 64], m2
> +      punpckhbw  m1,             m0
> +      movu       [r0 + r1 + 80], m1
> +
> +      movu       m1,              [r2 + r3 + 48]
> +      pmovzxbw   m2,              m1
> +      movu       [r0 + r1 + 96],  m2
> +      punpckhbw  m1,              m0
> +      movu       [r0 + r1 + 112], m1
> +
> +      lea        r0,              [r0 + 2 * r1]
> +      lea        r2,              [r2 + 2 * r3]
> +
> +      dec        r4d
> +      jnz        .loop
> +
> +RET
> +%endmacro
> +
> +BLOCKCOPY_PS_W64_H2 64, 16
> +BLOCKCOPY_PS_W64_H2 64, 32
> +BLOCKCOPY_PS_W64_H2 64, 48
> +BLOCKCOPY_PS_W64_H2 64, 64
> diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h    Mon Nov 11 21:06:11 2013 +0530
> +++ b/source/common/x86/blockcopy8.h    Mon Nov 11 21:22:38 2013 +0530
> @@ -125,7 +125,11 @@
>  #define LUMA_BLOCKCOPY_DEF_SSE4(cpu) \
>      SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(16, 64, cpu); \
>      SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(32, 64, cpu); \
> -    SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu);
> +    SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu); \
> +    SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 16, cpu); \
> +    SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 32, cpu); \
> +    SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 48, cpu); \
> +    SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 64, cpu);
>
>  CHROMA_BLOCKCOPY_DEF_SSE4(_sse4);
>  LUMA_BLOCKCOPY_DEF_SSE4(_sse4);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131111/793181cf/attachment.html>


More information about the x265-devel mailing list