[x265] [PATCH] asm : asm routine for chroma_p2s for 4:4:4 color space format
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Mon Feb 17 13:44:29 CET 2014
# HG changeset patch
# User Nabajit Deka
# Date 1392641037 -19800
# Mon Feb 17 18:13:57 2014 +0530
# Node ID f5275ca8f2985bb0daf563738e6071b81967c2cd
# Parent ce96cdb390fe26aee6effa731e51303c1d9056b0
asm : asm routine for chroma_p2s for 4:4:4 color space format
diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sun Feb 16 22:47:32 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Mon Feb 17 18:13:57 2014 +0530
@@ -1119,8 +1119,8 @@
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
p.luma_p2s = x265_luma_p2s_ssse3;
- p.chroma_p2s[X265_CSP_I444] = x265_chroma_p2s_ssse3;
p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
+ p.chroma_p2s[X265_CSP_I444] = x265_chroma_p2s_i444_ssse3;
CHROMA_SP_FILTERS_420(_ssse3);
CHROMA_SP_FILTERS_444(_ssse3);
diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Sun Feb 16 22:47:32 2014 -0600
+++ b/source/common/x86/ipfilter8.asm Mon Feb 17 18:13:57 2014 +0530
@@ -3680,6 +3680,64 @@
RET
+INIT_XMM ssse3
+cglobal chroma_p2s_i444, 3, 7, 4
+ ; Converts source bytes to 16-bit values, two rows per .loopH pass and 8 columns per .loopW pass; C prototype: (pixel *src, intptr_t srcStride, int16_t *dst, int width, int height).
+ ; load width and height (arguments 4 and 5, read from their r3m/r4m argument slots)
+ mov r3d, r3m ; r3d = width
+ mov r4d, r4m ; r4d = height
+ ; register roles: r0 = src, r1 = srcStride, r2 = dst, r5 = column offset within the row, r6 = scratch address
+ ; load constant
+ mova m2, [tab_c_128] ; bytes interleaved with the source bytes below
+ mova m3, [tab_c_64_n64] ; signed byte multipliers for pmaddubsw
+ ; NOTE(review): both tables are defined outside this hunk; presumably m2 = bytes of 128 and m3 = (64, -64) byte pairs, giving src * 64 - 8192 per lane - confirm.
+.loopH: ; outer loop: one pass per pair of rows
+
+ xor r5d, r5d ; restart at column 0 (also clears the upper half of r5)
+.loopW: ; inner loop: 8 columns of both rows per pass
+ lea r6, [r0 + r5] ; r6 = address of the current 8 source bytes in row 0
+
+ movh m0, [r6] ; row 0: always loads 8 bytes, even in a narrower tail
+ punpcklbw m0, m2 ; interleave to (src byte, m2 byte) pairs
+ pmaddubsw m0, m3 ; each pair -> one int16: src * m3[even] + m2 * m3[odd]
+
+ movh m1, [r6 + r1] ; row 1 (one srcStride further), same conversion
+ punpcklbw m1, m2
+ pmaddubsw m1, m3
+
+ add r5d, 8 ; advance the column offset before storing
+ cmp r5d, r3d ; flags consumed by jg and, after the stores, by je
+ lea r6, [r2 + r5 * 2] ; r6 = dst address just past this group; lea leaves flags intact
+ jg .width4 ; fewer than 8 columns remained: partial store
+ movu [r6 + FENC_STRIDE * 0 - 16], m0 ; -16 undoes the early advance; stores 8 int16 of row 0
+ movu [r6 + FENC_STRIDE * 2 - 16], m1 ; row 1 lies FENC_STRIDE * 2 bytes after row 0 in dst
+ je .nextH ; still the cmp flags (stores do not alter them): row width reached exactly
+ jmp .loopW
+
+.width4: ; tail: the remaining width is split into a 4-column and a 2-column part
+ test r3d, 4 ; width bit 2 set -> a group of 4 columns remains
+ jz .width2
+ test r3d, 2 ; flags kept until the jz below; nothing in between modifies them
+ movh [r6 + FENC_STRIDE * 0 - 16], m0 ; store the low 4 int16 of row 0
+ movh [r6 + FENC_STRIDE * 2 - 16], m1 ; store the low 4 int16 of row 1
+ lea r6, [r6 + 8] ; step dst past the 4 values just written
+ pshufd m0, m0, 2 ; bring dword 2 (values 4 and 5) down to dword 0 for movd
+ pshufd m1, m1, 2
+ jz .nextH ; width bit 1 clear: no 2-column remainder
+
+.width2: ; NOTE(review): width bit 0 is never tested - presumably widths are always even; confirm.
+ movd [r6 + FENC_STRIDE * 0 - 16], m0 ; store 2 int16 of row 0
+ movd [r6 + FENC_STRIDE * 2 - 16], m1 ; store 2 int16 of row 1
+
+.nextH:
+ lea r0, [r0 + r1 * 2] ; src advances by two rows
+ add r2, FENC_STRIDE * 4 ; dst advances by two rows of FENC_STRIDE * 2 bytes each
+
+ sub r4d, 2 ; two rows consumed
+ jnz .loopH ; NOTE(review): exits only when the count hits exactly 0, so height is presumably even and non-zero; confirm.
+
+ RET
+
%macro PROCESS_CHROMA_SP_W4_4R 0
movq m0, [r0]
movq m1, [r0 + r1]
diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Sun Feb 16 22:47:32 2014 -0600
+++ b/source/common/x86/ipfilter8.h Mon Feb 17 18:13:57 2014 +0530
@@ -214,6 +214,7 @@
void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_i444_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
void x265_interp_4tap_vert_sp_2x4_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_sp_2x8_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_sp_6x8_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
More information about the x265-devel
mailing list