[x265] [PATCH] asm : asm routine for chroma_p2s for 4:4:4 color space format
Steve Borho
steve at borho.org
Mon Feb 17 20:09:28 CET 2014
On Mon, Feb 17, 2014 at 6:44 AM, <nabajit at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Nabajit Deka
> # Date 1392641037 -19800
> # Mon Feb 17 18:13:57 2014 +0530
> # Node ID f5275ca8f2985bb0daf563738e6071b81967c2cd
> # Parent ce96cdb390fe26aee6effa731e51303c1d9056b0
> asm : asm routine for chroma_p2s for 4:4:4 color space format
>
Queued. There needs to be a comment somewhere about how the chroma_p2s 444
primitive is different from the others.
>
> diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Sun Feb 16 22:47:32 2014 -0600
> +++ b/source/common/x86/asm-primitives.cpp Mon Feb 17 18:13:57 2014 +0530
> @@ -1119,8 +1119,8 @@
>
> p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
> p.luma_p2s = x265_luma_p2s_ssse3;
> - p.chroma_p2s[X265_CSP_I444] = x265_chroma_p2s_ssse3;
> p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
> + p.chroma_p2s[X265_CSP_I444] = x265_chroma_p2s_i444_ssse3;
>
> CHROMA_SP_FILTERS_420(_ssse3);
> CHROMA_SP_FILTERS_444(_ssse3);
> diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Sun Feb 16 22:47:32 2014 -0600
> +++ b/source/common/x86/ipfilter8.asm Mon Feb 17 18:13:57 2014 +0530
> @@ -3680,6 +3680,64 @@
>
> RET
>
> +INIT_XMM ssse3
> +cglobal chroma_p2s_i444, 3, 7, 4 ; void chroma_p2s_i444(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); r0 = src, r1 = srcStride, r2 = dst
> +
> + ; load width (r3) and height (r4), the 4th and 5th arguments
> + mov r3d, r3m
> + mov r4d, r4m
> +
> + ; load constants (tables are defined elsewhere; presumably bytes of 128 and 64/-64 byte pairs, giving pixel * 64 - 8192 -- TODO confirm)
> + mova m2, [tab_c_128]
> + mova m3, [tab_c_64_n64]
> +
> +.loopH: ; one iteration converts two source rows
> +
> + xor r5d, r5d ; r5 = column offset, restart at 0 for each row pair
> +.loopW: ; one iteration converts up to 8 columns of both rows
> + lea r6, [r0 + r5] ; r6 = address of the current column in row 0
> +
> + movh m0, [r6] ; row 0: load 8 pixels
> + punpcklbw m0, m2 ; interleave each pixel byte with a byte from tab_c_128
> + pmaddubsw m0, m3 ; multiply-add each byte pair by tab_c_64_n64 -> 8 x int16
> +
> + movh m1, [r6 + r1] ; row 1 (one srcStride further): same conversion
> + punpcklbw m1, m2
> + pmaddubsw m1, m3
> +
> + add r5d, 8 ; advance the column offset past this group
> + cmp r5d, r3d ; flags are consumed by jg and je below
> + lea r6, [r2 + r5 * 2] ; r6 = dst + new column offset in bytes (2 per int16); lea leaves flags intact
> + jg .width4 ; overshot the width: fewer than 8 columns remain, store a partial tail
> + movu [r6 + FENC_STRIDE * 0 - 16], m0 ; -16 compensates for r5 having already advanced by 8 int16
> + movu [r6 + FENC_STRIDE * 2 - 16], m1 ; second output row lies FENC_STRIDE * 2 bytes after the first
> + je .nextH ; still the cmp result: column offset == width, row pair finished
> + jmp .loopW
> +
> +.width4: ; tail handling: remaining columns come from bits 2 and 1 of width
> + test r3d, 4
> + jz .width2 ; no group of 4 pending, go store the last 2 columns
> + test r3d, 2 ; result is consumed by jz .nextH below; movh/lea/pshufd do not touch flags
> + movh [r6 + FENC_STRIDE * 0 - 16], m0 ; store 4 int16 of row 0
> + movh [r6 + FENC_STRIDE * 2 - 16], m1 ; store 4 int16 of row 1
> + lea r6, [r6 + 8] ; step dst past the 4 values just written
> + pshufd m0, m0, 2 ; bring dword 2 (values 4 and 5) down to the low dword
> + pshufd m1, m1, 2
> + jz .nextH ; width bit 1 clear: nothing left in this row pair
> +
> +.width2: ; store the final 2 columns of both rows
> + movd [r6 + FENC_STRIDE * 0 - 16], m0
> + movd [r6 + FENC_STRIDE * 2 - 16], m1
> +
> +.nextH:
> + lea r0, [r0 + r1 * 2] ; src += 2 rows
> + add r2, FENC_STRIDE * 4 ; dst += 2 rows (2 * FENC_STRIDE * 2 bytes)
> +
> + sub r4d, 2 ; two rows consumed; the loop ends only when the count hits exactly 0, so height must be even and non-zero
> + jnz .loopH
> +
> + RET
> +
> %macro PROCESS_CHROMA_SP_W4_4R 0
> movq m0, [r0]
> movq m1, [r0 + r1]
> diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h Sun Feb 16 22:47:32 2014 -0600
> +++ b/source/common/x86/ipfilter8.h Mon Feb 17 18:13:57 2014 +0530
> @@ -214,6 +214,7 @@
> void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
> void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
> void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
> +void x265_chroma_p2s_i444_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
> void x265_interp_4tap_vert_sp_2x4_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_sp_2x8_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_sp_6x8_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140217/60dc43cb/attachment.html>
More information about the x265-devel
mailing list