[x265] [PATCH 2 of 3] asm: ipfilter_ss[FILTER_V_S_S_8]
Min Chen
chenm003 at 163.com
Mon Nov 4 12:05:32 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1383563104 -28800
# Node ID 539ad4851359f96591f612f8b7b6fb0483e5a48c
# Parent 2a7a5766fbd84436cdbbd5018a34db92e62553a1
asm: ipfilter_ss[FILTER_V_S_S_8]
diff -r 2a7a5766fbd8 -r 539ad4851359 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp Mon Nov 04 19:04:43 2013 +0800
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Mon Nov 04 19:05:04 2013 +0800
@@ -537,7 +537,7 @@
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
primitives.ipfilter_ps[FILTER_H_P_S_8](ref - (halfFilterSize - 1) * refStride, refStride, m_immedVals, tmpStride, width, height + filterSize - 1, g_lumaFilter[xFrac]);
- primitives.ipfilter_ss[FILTER_V_S_S_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, g_lumaFilter[yFrac]);
+ primitives.ipfilter_ss[FILTER_V_S_S_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, yFrac);
}
}
@@ -643,9 +643,9 @@
int filterSize = NTAPS_CHROMA;
int halfFilterSize = (filterSize >> 1);
primitives.ipfilter_ps[FILTER_H_P_S_4](refCb - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
- primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+ primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, yFrac);
primitives.ipfilter_ps[FILTER_H_P_S_4](refCr - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
- primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+ primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, yFrac);
}
}
diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Mon Nov 04 19:04:43 2013 +0800
+++ b/source/common/ipfilter.cpp Mon Nov 04 19:05:04 2013 +0800
@@ -120,8 +120,9 @@
}
template<int N>
-void filterVertical_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, int16_t const *c)
+void filterVertical_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coefIdx)
{
+ const int16_t *const c = (N == 8 ? g_lumaFilter[coefIdx] : g_chromaFilter[coefIdx]);
int shift = IF_FILTER_PREC;
int row, col;
src -= (N / 2 - 1) * srcStride;
diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/primitives.h
--- a/source/common/primitives.h Mon Nov 04 19:04:43 2013 +0800
+++ b/source/common/primitives.h Mon Nov 04 19:05:04 2013 +0800
@@ -166,7 +166,7 @@
typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
-typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
+typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
typedef void (*ipfilter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height);
typedef void (*ipfilter_s2p_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height);
typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 04 19:04:43 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 04 19:05:04 2013 +0800
@@ -279,6 +279,7 @@
SA8D_INTER_FROM_BLOCK(sse2);
p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
+ p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Nov 04 19:04:43 2013 +0800
+++ b/source/common/x86/ipfilter8.asm Mon Nov 04 19:05:04 2013 +0800
@@ -2593,3 +2593,144 @@
jnz .loopH
RET
+
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_v_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coefIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_XMM sse2
+; Vertical 8-tap filter, int16_t in -> int16_t out. Works on 4x4 tiles: the loops below run width/4 and height/4 times.
+%if ARCH_X86_64
+cglobal interp_8tap_v_ss, 4, 7+2, 8         ; tmp_r4d/tmp_r5d live in r7d/r8d, so 9 GPRs (r0-r8) must be declared
+%define tmp_r4d r7d
+%define tmp_r5d r8d
+%else
+cglobal interp_8tap_v_ss, 4, 7, 8, 0-2*4    ; two dword stack slots hold the loop counters
+%define tmp_r4d dword [rsp + 0*4]
+%define tmp_r5d dword [rsp + 1*4]
+%endif
+
+    ; load width, height and coefIdx
+    mov r4d, r4m
+    mov r5d, r5m
+    mov r6d, r6m
+
+    ; convert both strides from int16_t units to bytes
+    add r1, r1
+    add r3, r3
+
+    ; store the tile counts (width / 4, height / 4) to temporary memory or register
+    shr r4d, 2
+    mov tmp_r4d, r4d
+    shr r5d, 2
+    mov tmp_r5d, r5d
+
+    shl r6d, 6                              ; coefIdx * 64 = byte offset of the selected filter in tab_LumaCoeffV
+%ifdef PIC
+    lea r5, [tab_LumaCoeffV]                ; r5 is free here: height / 4 was already saved in tmp_r5d
+    lea r6, [r5 + r6]
+%else
+    lea r6, [tab_LumaCoeffV + r6]
+%endif
+
+    lea r4, [r1 * 3]
+    sub r0, r4                              ; src -= 3 * srcStride, i.e. start at input row [0] of the taps
+
+.loopH:
+    ; reload the per-row tile counter (width / 4)
+    mov r4d, tmp_r4d
+
+.loopW:
+    ; one 4x4 tile: input rows [0]..[10] (4 samples each) are accumulated into output rows R0..R3 (m0..m3)
+    movh m0, [r0]                           ; m0 = [0]
+    movh m1, [r0 + r1]                      ; m1 = [1]
+    lea r0, [r0 + r1 * 2]
+    punpcklwd m0, m1                        ; m0 = [0 1]
+    pmaddwd m0, [r6 + 0 * 16]               ; m0 = [0+1] = R0
+
+    movh m2, [r0]                           ; m2 = [2]
+    movh m3, [r0 + r1]                      ; m3 = [3]
+    lea r0, [r0 + r1 * 2]
+    punpcklwd m1, m2                        ; m1 = [1 2]
+    pmaddwd m1, [r6 + 0 * 16]               ; m1 = [1+2] = R1
+    punpcklwd m2, m3                        ; m2 = [2 3]
+    pmaddwd m7, m2, [r6 + 1 * 16]           ; m7 is scratch until row [7] is loaded into it
+    paddd m0, m7                            ; m0 = [0+1+2+3] = R0
+    pmaddwd m2, [r6 + 0 * 16]               ; m2 = [2+3] = R2
+
+    movh m4, [r0]                           ; m4 = [4]
+    movh m5, [r0 + r1]                      ; m5 = [5]
+    lea r0, [r0 + r1 * 2]
+    punpcklwd m3, m4                        ; m3 = [3 4]
+    pmaddwd m7, m3, [r6 + 1 * 16]
+    paddd m1, m7                            ; m1 = [1+2+3+4] = R1
+    pmaddwd m3, [r6 + 0 * 16]               ; m3 = [3+4] = R3
+    punpcklwd m4, m5                        ; m4 = [4 5]
+    pmaddwd m7, m4, [r6 + 2 * 16]
+    paddd m0, m7                            ; m0 = [0+1+2+3+4+5] = R0
+    pmaddwd m4, [r6 + 1 * 16]
+    paddd m2, m4                            ; m2 = [2+3+4+5] = R2
+
+    movh m6, [r0]                           ; m6 = [6]
+    movh m7, [r0 + r1]                      ; m7 = [7]
+    lea r0, [r0 + r1 * 2]
+    punpcklwd m5, m6                        ; m5 = [5 6]
+    pmaddwd m4, m5, [r6 + 2 * 16]
+    paddd m1, m4                            ; m1 = [1+2+3+4+5+6] = R1
+    pmaddwd m5, [r6 + 1 * 16]
+    paddd m3, m5                            ; m3 = [3+4+5+6] = R3
+    punpcklwd m6, m7                        ; m6 = [6 7]
+    pmaddwd m4, m6, [r6 + 3 * 16]
+    paddd m0, m4                            ; m0 = [0+1+2+3+4+5+6+7] = R0
+    pmaddwd m6, [r6 + 2 * 16]
+    paddd m2, m6                            ; m2 = [2+3+4+5+6+7] = R2
+    psrad m0, 6
+    packssdw m0, m0
+    movh [r2], m0                           ; store [0]
+
+    movh m4, [r0]                           ; m4 = [8]
+    movh m5, [r0 + r1]                      ; m5 = [9]
+    punpcklwd m7, m4                        ; m7 = [7 8]
+    pmaddwd m6, m7, [r6 + 3 * 16]
+    paddd m1, m6                            ; m1 = [1+2+3+4+5+6+7+8] = R1
+    pmaddwd m7, [r6 + 2 * 16]
+    paddd m3, m7                            ; m3 = [3+4+5+6+7+8] = R3
+    psrad m1, 6
+    packssdw m1, m1
+    movh [r2 + r3], m1                      ; store [1]
+    punpcklwd m4, m5                        ; m4 = [8 9]
+    pmaddwd m4, [r6 + 3 * 16]
+    paddd m2, m4                            ; m2 = [2+3+4+5+6+7+8+9] = R2
+    psrad m2, 6
+    packssdw m2, m2
+    movh [r2 + r3 * 2], m2                  ; store [2]
+    lea r2, [r2 + r3 * 2]
+
+    movh m4, [r0 + r1 * 2]                  ; m4 = [10]
+    punpcklwd m5, m4                        ; m5 = [9 10]
+    pmaddwd m5, [r6 + 3 * 16]
+    paddd m3, m5                            ; m3 = [3+4+5+6+7+8+9+10] = R3
+    psrad m3, 6
+    packssdw m3, m3
+    movh [r2 + r3], m3                      ; store [3]
+
+    lea r5, [r1 * 8 - 8]                    ; r0 moved 8 rows down above: go back up and 8 bytes (4 samples) right
+    sub r0, r5
+    lea r5, [r3 * 2 - 8]                    ; r2 moved 2 rows down above: go back up and 8 bytes (4 samples) right
+    sub r2, r5
+
+    dec r4d
+    jnz .loopW
+
+    ; move to the next row of tiles: 4 rows down, minus the (width / 4) * 8 bytes walked by .loopW
+    mov r4d, tmp_r4d
+    shl r4d, 3
+    lea r0, [r0 + r1 * 4]
+    sub r0, r4
+    lea r2, [r2 + r3 * 4]
+    sub r2, r4
+
+    dec tmp_r5d
+    jnz .loopH
+
+    RET
diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Nov 04 19:04:43 2013 +0800
+++ b/source/common/x86/pixel.h Mon Nov 04 19:05:04 2013 +0800
@@ -214,6 +214,7 @@
uint64_t x265_pixel_sa8d_satd_16x16_avx2(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2);
void x265_cvt32to16_shr_sse2(int16_t *dst, int *src, intptr_t, int, int);
+void x265_interp_8tap_v_ss_sse2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coefIdx);
#define DECL_HEVC_SSD(suffix) \
int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \
diff -r 2a7a5766fbd8 -r 539ad4851359 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Mon Nov 04 19:04:43 2013 +0800
+++ b/source/encoder/motion.cpp Mon Nov 04 19:05:04 2013 +0800
@@ -1213,7 +1213,7 @@
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
- primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+ primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, realHeight, yFrac);
primitives.weightpUni(immedVal2, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
}
}
diff -r 2a7a5766fbd8 -r 539ad4851359 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Mon Nov 04 19:04:43 2013 +0800
+++ b/source/test/ipfilterharness.cpp Mon Nov 04 19:05:04 2013 +0800
@@ -318,6 +318,68 @@
return true;
}
+// Check an optimized int16_t -> int16_t interpolation filter against the reference: both run on the same random
+// block size, strides and coefficient index, and their output buffers must match. isChroma selects a size step of 2.
+bool IPFilterHarness::check_IPFilter_primitive(ipfilter_ss_t ref, ipfilter_ss_t opt, int isChroma)
+{
+    const int min_size = isChroma ? 2 : 4;
+
+    // NOTE: refill the input with values in [-max_filter_val, max_filter_val) to avoid overflow
+    const int max_filter_val = 64 * (1 << 8);
+    for (int i = 0; i < ipf_t_size; i++)
+        short_buff[i] = rand() % (2 * max_filter_val) - max_filter_val;
+
+    for (int i = 0; i <= 1000; i++)
+    {
+        int rand_height = rand() % 100;                 // Randomly generated Height
+        int rand_width = rand() % 100;                  // Randomly generated Width
+
+        // memcmp() below compares ipf_t_size * sizeof(int16_t) bytes, so pre-fill that whole range with 0xCD
+        memset(IPF_vec_output_s, 0xCD, ipf_t_size * sizeof(int16_t));
+        memset(IPF_C_output_s, 0xCD, ipf_t_size * sizeof(int16_t));
+
+        const int rand_val = rand() % 4;                // Random coefficient index (0..3) passed to both filters
+        int rand_srcStride = rand() % 100;              // Randomly generated srcStride
+        int rand_dstStride = rand() % 100;              // Randomly generated dstStride
+
+        rand_width &= ~(min_size - 1);                  // Round down to a multiple of min_size ...
+        if (rand_width < min_size)
+            rand_width = min_size;                      // ... but never below min_size
+
+        rand_height &= ~(min_size - 1);
+        if (rand_height < min_size)
+            rand_height = min_size;
+
+        if (rand_srcStride < rand_width)                // Strides must be at least one row wide
+            rand_srcStride = rand_width;
+
+        if (rand_dstStride < rand_width)
+            rand_dstStride = rand_width;
+
+        ref(short_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            IPF_C_output_s,
+            rand_dstStride,
+            rand_width,
+            rand_height, rand_val
+            );
+        opt(short_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            IPF_vec_output_s,
+            rand_dstStride,
+            rand_width,
+            rand_height, rand_val
+            );
+
+        if (memcmp(IPF_C_output_s, IPF_vec_output_s, ipf_t_size * sizeof(int16_t)))
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
bool IPFilterHarness::check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt)
{
int rand_srcStride, rand_dstStride, rand_coeffIdx;
@@ -452,6 +514,18 @@
}
}
+ for (int value = 0; value < NUM_IPFILTER_S_S; value++)
+ {
+ if (opt.ipfilter_ss[value])
+ {
+ if (!check_IPFilter_primitive(ref.ipfilter_ss[value], opt.ipfilter_ss[value], (value == FILTER_V_S_S_4)))
+ {
+ printf("ipfilter_ss %d failed\n", 8 / (value + 1));
+ return false;
+ }
+ }
+ }
+
if (opt.ipfilter_p2s)
{
if (!check_IPFilter_primitive(ref.ipfilter_p2s, opt.ipfilter_p2s))
@@ -583,6 +657,17 @@
}
}
+ for (int value = 0; value < NUM_IPFILTER_S_S; value++)
+ {
+ if (opt.ipfilter_ss[value])
+ {
+ printf("ipfilter_ss %d\t", 8 / (value + 1));
+ REPORT_SPEEDUP(opt.ipfilter_ss[value], ref.ipfilter_ss[value],
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_s, dstStride, width, height, val);
+ }
+ }
+
if (opt.ipfilter_p2s)
{
printf("ipfilter_p2s\t");
diff -r 2a7a5766fbd8 -r 539ad4851359 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h Mon Nov 04 19:04:43 2013 +0800
+++ b/source/test/ipfilterharness.h Mon Nov 04 19:05:04 2013 +0800
@@ -47,6 +47,7 @@
bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma);
bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
+ bool check_IPFilter_primitive(ipfilter_ss_t ref, ipfilter_ss_t opt, int isChroma);
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
More information about the x265-devel
mailing list