[x265] [PATCH] asm: ssse3 8bpp code for chroma_p2s[8x6](4.74x) for i420
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Mon Apr 6 16:26:03 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428313137 -19800
# Mon Apr 06 15:08:57 2015 +0530
# Node ID e0e94b642a1f169658267ef17bea754a5af4a22d
# Parent e7890da101691cc902ef6df66ee71351c613250e
asm: ssse3 8bpp code for chroma_p2s[8x6](4.74x) for i420
diff -r e7890da10169 -r e0e94b642a1f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Apr 06 15:06:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Apr 06 15:08:57 2015 +0530
@@ -1338,6 +1338,7 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = x265_filterPixelToShort_8x2_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = x265_filterPixelToShort_8x4_ssse3;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = x265_filterPixelToShort_8x6_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = x265_filterPixelToShort_8x8_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = x265_filterPixelToShort_8x16_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = x265_filterPixelToShort_8x32_ssse3;
diff -r e7890da10169 -r e0e94b642a1f source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Apr 06 15:06:06 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Mon Apr 06 15:08:57 2015 +0530
@@ -7893,6 +7893,56 @@
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
+INIT_XMM ssse3                    ; 8x6 pixel-to-short: six rows, 8 source bytes in -> 8 int16 values out per row
+cglobal filterPixelToShort_8x6, 3, 7, 5    ; x86inc: 3 args preloaded (r0 = src, r1 = srcStride, r2 = dst), 7 GPRs, 5 XMM regs
+ mov r3d, r3m                     ; r3 = dstStride (4th argument, fetched by hand)
+ add r3d, r3d                     ; r3 *= 2 -- presumably converts int16_t elements to bytes; TODO confirm against callers
+ lea r4, [r1 * 3]                 ; r4 = 3 * srcStride (offset of source row 3)
+ lea r5, [r1 * 5]                 ; r5 = 5 * srcStride (offset of source row 5)
+ lea r6, [r3 * 3]                 ; r6 = 3 * r3 (offset of destination row 3)
+
+ ; load constants (both tables are defined outside this hunk; contents assumed from their names)
+ mova m3, [pb_128]                ; m3: presumably the byte 128 repeated -- TODO confirm
+ mova m4, [tab_c_64_n64]          ; m4: presumably signed byte pairs (64, -64) -- TODO confirm
+
+ movh m0, [r0]                    ; source row 0: low 8 bytes
+ punpcklbw m0, m3                 ; interleave: bytes become (pixel, m3 byte) pairs
+ pmaddubsw m0, m4                 ; word = pixel * m4[even] + m3 byte * m4[odd]; pixel * 64 - 8192 if the constants are as assumed
+
+ movh m1, [r0 + r1]               ; source row 1, same three-step conversion
+ punpcklbw m1, m3
+ pmaddubsw m1, m4
+
+ movh m2, [r0 + r1 * 2]           ; source row 2
+ punpcklbw m2, m3
+ pmaddubsw m2, m4
+
+ movu [r2 + r3 * 0], m0           ; destination row 0 (unaligned 16-byte store)
+ movu [r2 + r3 * 1], m1           ; destination row 1
+ movu [r2 + r3 * 2], m2           ; destination row 2
+
+ movh m0, [r0 + r4]               ; source row 3 (m0..m2 are reused for the second batch of rows)
+ punpcklbw m0, m3
+ pmaddubsw m0, m4
+
+ movh m1, [r0 + r1 * 4]           ; source row 4
+ punpcklbw m1, m3
+ pmaddubsw m1, m4
+
+ movh m2, [r0 + r5]               ; source row 5
+ punpcklbw m2, m3
+ pmaddubsw m2, m4
+
+ movu [r2 + r6 ], m0              ; destination row 3
+ movu [r2 + r3 * 4], m1           ; destination row 4
+ lea r2, [r2 + r3 * 4]            ; advance dst to row 4: x86 addressing has no *5 scale factor
+ movu [r2 + r3], m2               ; destination row 5 (= original dst + 5 * r3)
+
+ RET                              ; x86inc epilogue and return
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
%macro P2S_H_16xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_16x%1, 3, 7, 6
diff -r e7890da10169 -r e0e94b642a1f source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Mon Apr 06 15:06:06 2015 +0530
+++ b/source/common/x86/ipfilter8.h Mon Apr 06 15:08:57 2015 +0530
@@ -578,7 +578,8 @@
SETUP_CHROMA_P2S_FUNC_DEF(6, 8, cpu);
#define CHROMA_420_P2S_FILTERS_SSSE3(cpu) \
- SETUP_CHROMA_P2S_FUNC_DEF(8, 2, cpu);
+ SETUP_CHROMA_P2S_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_P2S_FUNC_DEF(8, 6, cpu);
#define CHROMA_422_P2S_FILTERS_SSE4(cpu) \
SETUP_CHROMA_P2S_FUNC_DEF(6, 16, cpu);
More information about the x265-devel mailing list