[x265] [PATCH] asm: filterConvertPelToShort
Min Chen
chenm003 at 163.com
Wed Oct 30 15:47:57 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1383144464 -28800
# Node ID 6bfafdf72eaef415aba43f4579f222cccbac60d9
# Parent 77db80a67f4e55f22bc02ed02930a269bfac6b50
asm: filterConvertPelToShort
diff -r 77db80a67f4e -r 6bfafdf72eae source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp Wed Oct 30 15:16:59 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Wed Oct 30 22:47:44 2013 +0800
@@ -516,6 +516,9 @@
int xFrac = mv->x & 0x3;
int yFrac = mv->y & 0x3;
+ assert((width % 4) + (height % 4) == 0);
+ assert(dstStride == MAX_CU_SIZE);
+
if ((yFrac | xFrac) == 0)
{
primitives.ipfilter_p2s(ref, refStride, dst, dstStride, width, height);
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/ipfilter.cpp Wed Oct 30 22:47:44 2013 +0800
@@ -264,6 +264,24 @@
}
}
+// Convert a width x height block of pixels to int16_t samples.
+// Each pixel is shifted left by (IF_INTERNAL_PREC - X265_DEPTH), narrowed to
+// 16 bits, and then has IF_INTERNAL_OFFS subtracted.
+//   src / srcStride : source pixels and the distance (in pixels) between rows
+//   dst             : destination samples; rows are always MAX_CU_SIZE
+//                     elements apart (this variant takes no dstStride)
+//   width / height  : block dimensions in samples
+void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+{
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+
+    // Walk the block row by row; both row pointers advance in the loop header.
+    for (int y = 0; y < height; y++, src += srcStride, dst += MAX_CU_SIZE)
+    {
+        for (int x = 0; x < width; x++)
+        {
+            // Narrow to 16 bits first, exactly as the stored result will be.
+            int16_t widened = src[x] << shift;
+            dst[x] = widened - (int16_t)IF_INTERNAL_OFFS;
+        }
+    }
+}
+
template<int N>
void filterVertical_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *c)
{
@@ -471,6 +489,7 @@
p.ipfilter_p2s = filterConvertPelToShort_c;
p.ipfilter_s2p = filterConvertShortToPel_c;
+ p.luma_p2s = filterConvertPelToShort_c;
p.extendRowBorder = extendCURowColBorder;
}
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/primitives.h
--- a/source/common/primitives.h Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/primitives.h Wed Oct 30 22:47:44 2013 +0800
@@ -210,6 +210,7 @@
typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -247,6 +248,7 @@
filter_pp_t chroma_vpp[NUM_CHROMA_PARTITIONS];
filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];
filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
+ filter_p2s_t luma_p2s;
intra_dc_t intra_pred_dc;
intra_planar_t intra_pred_planar;
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Oct 30 22:47:44 2013 +0800
@@ -296,6 +296,7 @@
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
+ p.luma_p2s = x265_luma_p2s_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Oct 30 22:47:44 2013 +0800
@@ -78,6 +78,10 @@
times 4 dw 58, -10
times 4 dw 4, -1
+tab_c_128: times 16 db 0x80
+tab_c_64_n64: times 8 db 64, -64
+
+
SECTION .text
%macro FILTER_H4_w2_2 3
@@ -906,6 +910,7 @@
jnz .loopH
RET
+
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -2056,3 +2061,69 @@
FILTER_V4_W32 32, 16
FILTER_V4_W32 32, 24
FILTER_V4_W32 32, 32
+
+
+;-----------------------------------------------------------------------------
+; void luma_p2s(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;
+; Converts 8-bit pixels to 16-bit samples: dst = src * 64 - 128 * 64.
+; Each pixel byte is interleaved with 0x80 (tab_c_128) and multiplied-added
+; against the signed byte pairs {64, -64} (tab_c_64_n64) with pmaddubsw.
+;
+; Four source rows are processed per outer pass and eight columns per inner
+; pass, with a 4-column tail when fewer than eight columns remain.
+; Destination rows are FENC_STRIDE int16_t elements (FENC_STRIDE * 2 bytes)
+; apart; there is no dstStride argument.
+; height is counted down in steps of 4 until it reaches zero, so it must be a
+; non-zero multiple of 4; width is handled in units of 4 columns.
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal luma_p2s, 3, 7, 8
+
+    ; load width and height
+    mov         r3d, r3m
+    mov         r4d, r4m
+
+    ; load constants
+    mova        m6, [tab_c_128]         ; 16 x 0x80, paired with every pixel byte
+    mova        m7, [tab_c_64_n64]      ; byte pairs {64, -64} for pmaddubsw
+
+    ; r5 is advanced by 8 columns *before* the stores below, so bias dst back
+    ; by 8 int16_t (16 bytes) once, up front.
+    lea         r2, [r2 - 16]
+.loopH:
+
+    xor         r5d, r5d                ; r5 = column offset within the row group
+.loopW:
+    lea         r6, [r0 + r5]
+
+    ; row 0
+    movh        m0, [r6]
+    punpcklbw   m0, m6
+    pmaddubsw   m0, m7
+
+    ; row 1
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m6
+    pmaddubsw   m1, m7
+
+    ; row 2
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m6
+    pmaddubsw   m2, m7
+
+    ; row 3 (src + 3 * srcStride)
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m6
+    pmaddubsw   m3, m7
+
+    add         r5, 8
+    cmp         r5, r3
+    jg          .width4                 ; fewer than 8 columns left: store 4 only
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+    ; r5 must NOT be advanced again here: 'add r5, 8' above is the only
+    ; per-iteration step.  A second increment skips every other 8-column
+    ; group for widths greater than 8.
+    je          .nextH                  ; flags still from cmp: row group done
+    jmp         .loopW
+
+.width4:
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+
+.nextH:
+    ; advance source and destination by four rows
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 2 * 4
+
+    sub         r4, 4
+    jnz         .loopH
+
+    RET
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/x86/ipfilter8.h Wed Oct 30 22:47:44 2013 +0800
@@ -90,6 +90,7 @@
void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
#undef SETUP_CHROMA_FUNC_DEF
#undef SETUP_LUMA_FUNC_DEF
diff -r 77db80a67f4e -r 6bfafdf72eae source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Wed Oct 30 15:16:59 2013 +0530
+++ b/source/test/ipfilterharness.cpp Wed Oct 30 22:47:44 2013 +0800
@@ -240,6 +240,48 @@
return true;
}
+// Compare an optimized pixel-to-short primitive (opt) against the C reference
+// (ref) on randomly sized blocks read from pixel_buff.
+// Returns false on the first output mismatch, true if every iteration matches.
+bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+    int16_t rand_srcStride;
+
+    for (int i = 0; i <= 1000; i++)
+    {
+        int16_t rand_height = (int16_t)rand() % 100;               // Randomly generated Height
+        int16_t rand_width = (int16_t)rand() % 100;                // Randomly generated Width
+
+        memset(IPF_vec_output_s, 0, ipf_t_size);                   // Initialize output buffer to zero
+        memset(IPF_C_output_s, 0, ipf_t_size);                     // Initialize output buffer to zero
+
+        // Round the width down to a multiple of 4 and keep it at least 4.
+        // (The previous "%= 4" always produced a value below 4, so every
+        // iteration ended up testing a width of exactly 4.)
+        rand_width &= ~3;
+        if (rand_width < 4)
+            rand_width = 4;
+
+        // The primitive writes destination rows a fixed MAX_CU_SIZE elements
+        // apart, so a wider block would make rows overlap and the comparison
+        // meaningless.
+        // NOTE(review): assumes MAX_CU_SIZE is visible in this file - confirm.
+        if (rand_width > MAX_CU_SIZE)
+            rand_width = (int16_t)MAX_CU_SIZE;
+
+        // Same rounding for the height.
+        rand_height &= ~3;
+        if (rand_height < 4)
+            rand_height = 4;
+
+        // Derive the source stride from the final width so it can never be
+        // negative or smaller than the block being read.
+        rand_srcStride = rand_width + rand() % 100;                // Randomly generated srcStride
+
+        ref(pixel_buff,
+            rand_srcStride,
+            IPF_C_output_s,
+            rand_width,
+            rand_height);
+        opt(pixel_buff,
+            rand_srcStride,
+            IPF_vec_output_s,
+            rand_width,
+            rand_height);
+
+        if (memcmp(IPF_vec_output_s, IPF_C_output_s, ipf_t_size))
+            return false;
+    }
+
+    return true;
+}
+
bool IPFilterHarness::check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt)
{
int16_t rand_height = (int16_t)rand() % 100; // Randomly generated Height
@@ -417,6 +459,15 @@
}
}
+ if (opt.luma_p2s)
+ {
+ if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s))
+ {
+ printf("ipfilter_p2s failed\n");
+ return false;
+ }
+ }
+
if (opt.ipfilter_s2p)
{
if (!check_IPFilter_primitive(ref.ipfilter_s2p, opt.ipfilter_s2p))
diff -r 77db80a67f4e -r 6bfafdf72eae source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h Wed Oct 30 15:16:59 2013 +0530
+++ b/source/test/ipfilterharness.h Wed Oct 30 22:47:44 2013 +0800
@@ -45,6 +45,7 @@
bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt);
bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
+ bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt);
bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
More information about the x265-devel
mailing list