[x265] [PATCH] asm: sse4 chroma_p2s[4x2](2.29x), ssse3 chroma_p2s[8x2](3.60x) for i420
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Fri Apr 3 15:50:54 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428067926 -19800
# Fri Apr 03 19:02:06 2015 +0530
# Node ID 1df11c5deb21832b221b673599337518e12f8808
# Parent e94e60f0cc0b85f98ae834d5e8f5db9a94d7f264
asm: sse4 chroma_p2s[4x2](2.29x), ssse3 chroma_p2s[8x2](3.60x) for i420
diff -r e94e60f0cc0b -r 1df11c5deb21 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Apr 03 18:50:00 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 03 19:02:06 2015 +0530
@@ -1300,6 +1300,7 @@
p.pu[LUMA_64x48].convert_p2s = x265_filterPixelToShort_64x48_ssse3;
p.pu[LUMA_64x64].convert_p2s = x265_filterPixelToShort_64x64_ssse3;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = x265_filterPixelToShort_8x2_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = x265_filterPixelToShort_8x4_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = x265_filterPixelToShort_8x8_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = x265_filterPixelToShort_8x16_ssse3;
@@ -1406,6 +1407,7 @@
p.pu[LUMA_4x8].convert_p2s = x265_filterPixelToShort_4x8_sse4;
p.pu[LUMA_4x16].convert_p2s = x265_filterPixelToShort_4x16_sse4;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = x265_filterPixelToShort_4x2_sse4;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = x265_filterPixelToShort_4x4_sse4;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = x265_filterPixelToShort_4x8_sse4;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = x265_filterPixelToShort_4x16_sse4;
diff -r e94e60f0cc0b -r 1df11c5deb21 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Apr 03 18:50:00 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Fri Apr 03 19:02:06 2015 +0530
@@ -12303,64 +12303,50 @@
FILTER_VER_LUMA_SP 64, 16
FILTER_VER_LUMA_SP 16, 64
-; TODO: combin of U and V is more performance, but need more register
-; TODO: use two path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it
-INIT_XMM ssse3
-cglobal chroma_p2s, 3, 7, 4
-
- ; load width and height
- mov r3d, r3m
- mov r4d, r4m
-
- ; load constant
- mova m2, [pb_128]
- mova m3, [tab_c_64_n64]
-
-.loopH:
-
- xor r5d, r5d
-.loopW:
- lea r6, [r0 + r5]
-
- movh m0, [r6]
- punpcklbw m0, m2
- pmaddubsw m0, m3
-
- movh m1, [r6 + r1]
- punpcklbw m1, m2
- pmaddubsw m1, m3
-
- add r5d, 8
- cmp r5d, r3d
- lea r6, [r2 + r5 * 2]
- jg .width4
- movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
- movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
- je .nextH
- jmp .loopW
-
-.width4:
- test r3d, 4
- jz .width2
- test r3d, 2
- movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
- movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
- lea r6, [r6 + 8]
- pshufd m0, m0, 2
- pshufd m1, m1, 2
- jz .nextH
-
-.width2:
- movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
- movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
-
-.nextH:
- lea r0, [r0 + r1 * 2]
- add r2, FENC_STRIDE / 2 * 4
-
- sub r4d, 2
- jnz .loopH
-
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4 ; pinsrd below is an SSE4.1 instruction
+cglobal filterPixelToShort_4x2, 3, 4, 3 ; 4 pixels x 2 rows; per x86inc cglobal: 3 args auto-loaded (r0=src, r1=srcStride, r2=dst), 4 GPRs, 3 XMM regs
+ mov r3d, r3m ; 4th argument (dstStride) is fetched by hand; only its low 32 bits are read
+ add r3d, r3d ; r3 = 2 * dstStride: dst rows are addressed in bytes and dst is int16_t*
+
+ ; load constant
+ mova m1, [pb_128] ; defined outside this hunk; presumably every byte = 128 (confirm)
+ mova m2, [tab_c_64_n64] ; defined outside this hunk; presumably byte pairs (64, -64) (confirm)
+
+ movd m0, [r0] ; row 0: 4 source bytes into dword 0
+ pinsrd m0, [r0 + r1], 1 ; row 1: 4 source bytes into dword 1
+ punpcklbw m0, m1 ; interleave the 8 source bytes with bytes of m1 -> (pixel, m1 byte) pairs
+ pmaddubsw m0, m2 ; each pair -> one signed word; with the presumed constants: pixel * 64 - 128 * 64
+
+ movq [r2 + r3 * 0], m0 ; low 4 words -> dst row 0
+ movhps [r2 + r3 * 1], m0 ; high 4 words -> dst row 1
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3 ; pmaddubsw below is an SSSE3 instruction
+cglobal filterPixelToShort_8x2, 3, 4, 3 ; 8 pixels x 2 rows; per x86inc cglobal: 3 args auto-loaded (r0=src, r1=srcStride, r2=dst), 4 GPRs, 3 XMM regs
+ mov r3d, r3m ; 4th argument (dstStride) is fetched by hand; only its low 32 bits are read
+ add r3d, r3d ; r3 = 2 * dstStride: dst rows are addressed in bytes and dst is int16_t*
+
+ ; load constant
+ mova m1, [pb_128] ; defined outside this hunk; presumably every byte = 128 (confirm)
+ mova m2, [tab_c_64_n64] ; defined outside this hunk; presumably byte pairs (64, -64) (confirm)
+
+ movh m0, [r0] ; row 0: 8 source bytes into the low half of m0
+ punpcklbw m0, m1 ; interleave with bytes of m1 -> 8 (pixel, m1 byte) pairs
+ pmaddubsw m0, m2 ; each pair -> one signed word; with the presumed constants: pixel * 64 - 128 * 64
+ movu [r2 + r3 * 0], m0 ; 8 words -> dst row 0 (unaligned store)
+
+ movh m0, [r0 + r1] ; row 1: 8 source bytes, one srcStride further on
+ punpcklbw m0, m1 ; same interleave as row 0
+ pmaddubsw m0, m2 ; same conversion as row 0
+ movu [r2 + r3 * 1], m0 ; 8 words -> dst row 1 (unaligned store)
+
RET
%macro PROCESS_CHROMA_SP_W4_4R 0
diff -r e94e60f0cc0b -r 1df11c5deb21 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Fri Apr 03 18:50:00 2015 +0530
+++ b/source/common/x86/ipfilter8.h Fri Apr 03 19:02:06 2015 +0530
@@ -570,6 +570,15 @@
SETUP_CHROMA_SS_FUNC_DEF(64, 16, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu);
+#define SETUP_CHROMA_420_P2S_FUNC_DEF(W, H, cpu) \
+ void x265_filterPixelToShort_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); // declares x265_filterPixelToShort_<W>x<H><cpu>
+
+#define CHROMA_420_P2S_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_420_P2S_FUNC_DEF(4, 2, cpu); // the 4x2 kernel, instantiated below with cpu = _sse4
+
+#define CHROMA_420_P2S_FILTERS_SSSE3(cpu) \
+ SETUP_CHROMA_420_P2S_FUNC_DEF(8, 2, cpu); // the 8x2 kernel, instantiated below with cpu = _ssse3
+
CHROMA_420_FILTERS(_sse4);
CHROMA_420_FILTERS(_avx2);
CHROMA_420_SP_FILTERS(_sse2);
@@ -580,6 +589,8 @@
CHROMA_420_SS_FILTERS_SSE4(_sse4);
CHROMA_420_SS_FILTERS(_avx2);
CHROMA_420_SS_FILTERS_SSE4(_avx2);
+CHROMA_420_P2S_FILTERS_SSE4(_sse4);
+CHROMA_420_P2S_FILTERS_SSSE3(_ssse3);
CHROMA_422_FILTERS(_sse4);
CHROMA_422_FILTERS(_avx2);
More information about the x265-devel
mailing list