[x265] [PATCH] asm code for blockcopy_ps_64xN
Steve Borho
steve at borho.org
Tue Nov 12 02:46:53 CET 2013
On Mon, Nov 11, 2013 at 9:52 AM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1384185158 -19800
> # Node ID 972a9a01d0b440c919becc8ec17e7187522a2e68
> # Parent b83d45863ceb3f88da420646a3789fb787043f6e
> asm code for blockcopy_ps_64xN
>
These are all nice, but the encoder is currently ignoring all of them.
The next task should be to change every TComYuv copy method that takes a
width and height so that it instead takes a LUMA or CHROMA partition enum
(int) and uses our new block-based primitives. Then change all callers to
pass in the enum, being careful not to call partitionFromSizes() more
often than strictly necessary.
--
Steve
>
> diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Nov 11 21:06:11 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Mon Nov 11 21:22:38 2013 +0530
> @@ -459,6 +459,10 @@
> p.luma_copy_ps[LUMA_16x64] = x265_blockcopy_ps_16x64_sse4;
> p.luma_copy_ps[LUMA_32x64] = x265_blockcopy_ps_32x64_sse4;
> p.luma_copy_ps[LUMA_48x64] = x265_blockcopy_ps_48x64_sse4;
> + p.luma_copy_ps[LUMA_64x16] = x265_blockcopy_ps_64x16_sse4;
> + p.luma_copy_ps[LUMA_64x32] = x265_blockcopy_ps_64x32_sse4;
> + p.luma_copy_ps[LUMA_64x48] = x265_blockcopy_ps_64x48_sse4;
> + p.luma_copy_ps[LUMA_64x64] = x265_blockcopy_ps_64x64_sse4;
> }
> if (cpuMask & X265_CPU_AVX)
> {
> diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm Mon Nov 11 21:06:11 2013 +0530
> +++ b/source/common/x86/blockcopy8.asm Mon Nov 11 21:22:38 2013 +0530
> @@ -2286,3 +2286,77 @@
> %endmacro
>
> BLOCKCOPY_PS_W48_H2 48, 64
> +
>
> +;-----------------------------------------------------------------------------
> +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +;-----------------------------------------------------------------------------
> +%macro BLOCKCOPY_PS_W64_H2 2 ; %1 = width (used only in the symbol name; body always copies 64 pixels per row), %2 = height (must be even: two rows per loop pass)
> +INIT_XMM sse4
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +
> +add r1, r1 ; destStride *= 2; dest holds int16_t, so this is presumably elements -> bytes
> +mov r4d, %2/2 ; r4d = number of row pairs left to copy
> +pxor m0, m0 ; m0 = 0, the zero source interleaved by punpckhbw below
> +
> +.loop:
> + movu m1, [r2] ; row 0, src bytes 0..15
> + pmovzxbw m2, m1 ; low 8 bytes -> 8 zero-extended words
> + movu [r0], m2
> + punpckhbw m1, m0 ; high 8 bytes interleaved with zero -> 8 words
> + movu [r0 + 16], m1
> +
> + movu m1, [r2 + 16] ; row 0, src bytes 16..31
> + pmovzxbw m2, m1
> + movu [r0 + 32], m2
> + punpckhbw m1, m0
> + movu [r0 + 48], m1
> +
> + movu m1, [r2 + 32] ; row 0, src bytes 32..47
> + pmovzxbw m2, m1
> + movu [r0 + 64], m2
> + punpckhbw m1, m0
> + movu [r0 + 80], m1
> +
> + movu m1, [r2 + 48] ; row 0, src bytes 48..63
> + pmovzxbw m2, m1
> + movu [r0 + 96], m2
> + punpckhbw m1, m0
> + movu [r0 + 112], m1
> +
> + movu m1, [r2 + r3] ; row 1 (src + srcStride), bytes 0..15
> + pmovzxbw m2, m1
> + movu [r0 + r1], m2
> + punpckhbw m1, m0
> + movu [r0 + r1 + 16], m1
> +
> + movu m1, [r2 + r3 + 16] ; row 1, src bytes 16..31
> + pmovzxbw m2, m1
> + movu [r0 + r1 + 32], m2
> + punpckhbw m1, m0
> + movu [r0 + r1 + 48], m1
> +
> + movu m1, [r2 + r3 + 32] ; row 1, src bytes 32..47
> + pmovzxbw m2, m1
> + movu [r0 + r1 + 64], m2
> + punpckhbw m1, m0
> + movu [r0 + r1 + 80], m1
> +
> + movu m1, [r2 + r3 + 48] ; row 1, src bytes 48..63
> + pmovzxbw m2, m1
> + movu [r0 + r1 + 96], m2
> + punpckhbw m1, m0
> + movu [r0 + r1 + 112], m1
> +
> + lea r0, [r0 + 2 * r1] ; dest advances two rows
> + lea r2, [r2 + 2 * r3] ; src advances two rows
> +
> + dec r4d
> + jnz .loop ; repeat until all %2/2 row pairs are copied
> +
> +RET
> +%endmacro
> +
> +BLOCKCOPY_PS_W64_H2 64, 16
> +BLOCKCOPY_PS_W64_H2 64, 32
> +BLOCKCOPY_PS_W64_H2 64, 48
> +BLOCKCOPY_PS_W64_H2 64, 64
> diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h Mon Nov 11 21:06:11 2013 +0530
> +++ b/source/common/x86/blockcopy8.h Mon Nov 11 21:22:38 2013 +0530
> @@ -125,7 +125,11 @@
> #define LUMA_BLOCKCOPY_DEF_SSE4(cpu) \
> SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(16, 64, cpu); \
> SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(32, 64, cpu); \
> - SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu);
> + SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu); \
> + SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 16, cpu); \
> + SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 32, cpu); \
> + SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 48, cpu); \
> + SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 64, cpu);
>
> CHROMA_BLOCKCOPY_DEF_SSE4(_sse4);
> LUMA_BLOCKCOPY_DEF_SSE4(_sse4);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131111/793181cf/attachment.html>
More information about the x265-devel
mailing list