[x265] [PATCH] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]
Min Chen
chenm003 at 163.com
Mon Oct 28 15:40:39 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1382971217 -28800
# Node ID 536ff3d483523b9c657e97801ebf8e5ac8ab21f5
# Parent edc6c8e9a865c9c6a9dad40feff27014f68bc19d
asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]
diff -r edc6c8e9a865 -r 536ff3d48352 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Oct 28 22:39:39 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp Mon Oct 28 22:40:17 2013 +0800
@@ -280,6 +280,7 @@
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
+ p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r edc6c8e9a865 -r 536ff3d48352 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Oct 28 22:39:39 2013 +0800
+++ b/source/common/x86/ipfilter8.asm Mon Oct 28 22:40:17 2013 +0800
@@ -774,3 +774,143 @@
jnz .loopV
RET
+
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+;
+; Vertical 8-tap filter: reads 16-bit samples from src, writes 8-bit pixels to dst.
+; For each output pixel, eight vertically adjacent inputs (3 rows above the
+; current row through 4 rows below) are multiplied by the coefficients at
+; tab_LumaCoeffV + coeffIdx * 64 and summed; tab_c_526336 is added, the sum is
+; shifted right by 12 and saturated to an unsigned byte.
+;
+; Columns are handled 8 at a time with a 4-pixel tail, so width is expected to
+; be a positive multiple of 4; height is expected to be positive.
+; NOTE(review): every load is a full 16 bytes, so the 4-pixel tail reads
+; 8 bytes beyond the samples it needs - confirm callers tolerate that.
+;
+; r0 = src cursor, r1 = srcStride in bytes, r2 = dst cursor, r3 = dstStride,
+; r4 = columns left (scratch during setup), r5 = rows left, r6 = coeff table
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_8tap_v_sp, 4, 7, 8, 0-(2*4 + 3*gprsize)
+; Stack locals. Every slot that is accessed with a full-width register is
+; gprsize bytes wide so 64-bit accesses cannot overlap a neighbouring slot.
+%define old_r4d (rsp + 0 * 4)
+%define old_r0 (rsp + 2 * 4 + 0 * gprsize)
+%define old_r2 (rsp + 2 * 4 + 1 * gprsize)
+%define old_6rows (rsp + 2 * 4 + 2 * gprsize)
+
+ mov r4d, r4m
+ mov r5d, r5m
+
+ ; save width and dst; r4 becomes scratch until .loopH reloads it
+ mov [old_r4d], r4d
+ mov [old_r2], r2
+
+ ; load coeff table
+ mov r6d, r6m
+ shl r6, 6
+%ifdef PIC
+ ; RIP-relative addressing cannot take an index register: load the base first
+ lea r4, [tab_LumaCoeffV]
+ lea r6, [r4 + r6]
+%else
+ lea r6, [tab_LumaCoeffV + r6]
+%endif
+
+ ; move to -3
+ lea r1, [r1 * 2] ; srcStride in bytes (int16_t samples)
+ lea r4, [r1 + r1 * 2]
+ sub r0, r4
+ lea r4, [r4 * 2] ; 6 rows: how far r0 advances inside .loopW
+ mov [old_6rows], r4
+
+.loopH:
+
+ ; load width
+ mov r4d, [old_r4d]
+
+ ; save old src
+ mov [old_r0], r0
+
+.loopW:
+
+ ; rows 0-1: m2 = sums for columns 0-3, m0 = sums for columns 4-7
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ lea r0, [r0 + r1 * 2]
+ punpcklwd m2, m0, m1
+ pmaddwd m2, [r6 + 0 * 16]
+ punpckhwd m0, m1
+ pmaddwd m0, [r6 + 0 * 16]
+
+ ; rows 2-3
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ lea r0, [r0 + r1 * 2]
+ punpcklwd m1, m3, m4
+ pmaddwd m1, [r6 + 1 * 16]
+ paddd m2, m1
+ punpckhwd m3, m4
+ pmaddwd m3, [r6 + 1 * 16]
+ paddd m0, m3
+
+ ; rows 4-5
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ lea r0, [r0 + r1 * 2]
+ punpcklwd m1, m3, m4
+ pmaddwd m1, [r6 + 2 * 16]
+ paddd m2, m1
+ punpckhwd m3, m4
+ pmaddwd m3, [r6 + 2 * 16]
+ paddd m0, m3
+
+ ; rows 6-7 (r0 is not advanced; it stays 6 rows below the start)
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ punpcklwd m1, m3, m4
+ pmaddwd m1, [r6 + 3 * 16]
+ paddd m2, m1
+ punpckhwd m3, m4
+ pmaddwd m3, [r6 + 3 * 16]
+ paddd m0, m3
+
+ ; round, shift and saturate: 8 bytes of output in the low half of m2
+ paddd m2, [tab_c_526336]
+ paddd m0, [tab_c_526336]
+ psrad m2, 12
+ psrad m0, 12
+ packssdw m2, m0
+ packuswb m2, m2
+
+ ; back to the first row of this column group
+ sub r0, [old_6rows]
+
+ ; movq leaves the flags from sub intact, so je still tests r4 == 0
+ sub r4, 8
+ jl .width4
+ movq [r2], m2
+ je .nextH
+ ; move to next 8 col
+ lea r0, [r0 + 16]
+ lea r2, [r2 + 8]
+ jmp .loopW
+
+.width4:
+ ; 4-pixel tail; r0 needs no update because .nextH reloads it
+ movd [r2], m2
+
+.nextH:
+ ; move to next row
+ mov r0, [old_r0]
+ lea r0, [r0 + r1]
+ add [old_r2], r3 ; full-width add: dstStride is intptr_t
+ mov r2, [old_r2]
+
+ dec r5d
+ jnz .loopH
+
+ RET
diff -r edc6c8e9a865 -r 536ff3d48352 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Mon Oct 28 22:39:39 2013 +0800
+++ b/source/common/x86/ipfilter8.h Mon Oct 28 22:40:17 2013 +0800
@@ -89,6 +89,7 @@
LUMA_FILTERS(_sse4);
void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
+void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
#undef SETUP_CHROMA_FUNC_DEF
#undef SETUP_LUMA_FUNC_DEF
diff -r edc6c8e9a865 -r 536ff3d48352 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Mon Oct 28 22:39:39 2013 +0800
+++ b/source/test/ipfilterharness.cpp Mon Oct 28 22:40:17 2013 +0800
@@ -164,6 +164,8 @@
int rand_width = rand() % 100; // Randomly generated Width
int16_t rand_val, rand_srcStride, rand_dstStride;
+ rand_width &= ~3;
+
for (int i = 0; i <= 100; i++)
{
memset(IPF_vec_output_p, 0, ipf_t_size); // Initialize output buffer to zero
@@ -173,16 +175,16 @@
rand_srcStride = rand() % 100; // Randomly generated srcStride
rand_dstStride = rand() % 100; // Randomly generated dstStride
- opt(short_buff + 3 * rand_srcStride,
+ ref(short_buff + 3 * rand_srcStride,
rand_srcStride,
- IPF_vec_output_p,
+ IPF_C_output_p,
rand_dstStride,
rand_width,
rand_height, rand_val
);
- ref(short_buff + 3 * rand_srcStride,
+ opt(short_buff + 3 * rand_srcStride,
rand_srcStride,
- IPF_C_output_p,
+ IPF_vec_output_p,
rand_dstStride,
rand_width,
rand_height, rand_val
More information about the x265-devel
mailing list