[x265] [PATCH 1 of 4] asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c over C code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Jun 26 15:22:49 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435323067 -19800
# Fri Jun 26 18:21:07 2015 +0530
# Node ID 44b574b61b29a3cfba99e8f0d06622e44a86df17
# Parent d64227e54233d1646c55bcb4b0b831e5340009ed
asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c over C code
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 18:21:07 2015 +0530
@@ -2453,6 +2453,8 @@
p.weight_pp = PFX(weight_pp_sse4);
p.weight_sp = PFX(weight_sp_sse4);
+ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/intrapred.h Fri Jun 26 18:21:07 2015 +0530
@@ -66,6 +66,7 @@
#define DECL_ALL(cpu) \
FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
+ FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
DECL_ANGS(4, cpu); \
DECL_ANGS(8, cpu); \
DECL_ANGS(16, cpu); \
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/intrapred8.asm Fri Jun 26 18:21:07 2015 +0530
@@ -30,6 +30,9 @@
intra_pred_shuff_0_8: times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 ; pshufb control, word lanes [1 0 1 2 3 4 5 6]: shift up one lane, lane 0 takes word 1
+intra_filter4_shuf1: db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 ; pshufb control, word lanes [7 0 1 2 3 4 5 6]: rotate up one lane, lane 0 takes word 7
+
pb_0_8 times 8 db 0, 8
pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0
@@ -18276,3 +18279,44 @@
INTRA_PRED_STORE_4x4
RET
+
+;-----------------------------------------------------------------------------------
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5 ; x86inc: 2 args (r0 = references, r1 = filtered), 4 GPRs, 5 XMM regs; 8-bit pixels
+ mov r2b, byte [r0 + 8] ; topLast
+ mov r3b, byte [r0 + 16] ; LeftLast
+
+ ; filtering top
+ pmovzxbw m0, [r0 + 0] ; samples[0..7] zero-extended to words
+ pmovzxbw m1, [r0 + 8] ; samples[8..15] zero-extended to words
+ pmovzxbw m2, [r0 + 16] ; reads samples[16..23]; only samples[16] reaches the final output
+
+ pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
+ palignr m3, m1, m0, 4 ; [9 8 7 6 5 4 3 2]
+ pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1]
+
+ psllw m0, 1 ; 2 * samples[i]
+ paddw m4, m3 ; samples[i - 1] + samples[i + 1]
+ paddw m0, m4
+ paddw m0, [pw_2] ; rounding offset before the shift
+ psrlw m0, 2 ; (a + 2 * b + c + 2) >> 2 for outputs 0..7
+
+ ; filtering left
+ palignr m4, m1, m1, 14 ; [14 13 12 11 10 9 8 15] samples[i - 1]
+ pinsrb m4, [r0], 2 ; [14 13 12 11 10 9 0 15] samples[i - 1], samples[0] (top-left) replaces samples[8] in the lane for i = 9
+ palignr m3, m2, m1, 4 ; [17 16 15 14 13 12 11 10]
+ pshufb m3, [intra_filter4_shuf1] ; [16 15 14 13 12 11 10 17] samples[i + 1]
+
+ psllw m1, 1 ; 2 * samples[i]
+ paddw m4, m3 ; samples[i - 1] + samples[i + 1]
+ paddw m1, m4
+ paddw m1, [pw_2] ; rounding offset before the shift
+ psrlw m1, 2 ; outputs 8..15; lane for i = 8 is a don't-care, overwritten below
+ packuswb m0, m1 ; words -> bytes: low half = outputs 0..7, high half = outputs 8..15
+
+ movu [r1], m0 ; unaligned 16-byte store of filtered[0..15]
+ mov [r1 + 8], r2b ; topLast
+ mov [r1 + 16], r3b ; LeftLast
+ RET
diff -r d64227e54233 -r 44b574b61b29 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Thu Jun 25 16:25:51 2015 +0530
+++ b/source/test/intrapredharness.cpp Fri Jun 26 18:21:07 2015 +0530
@@ -31,6 +31,16 @@
{
for (int i = 0; i < INPUT_SIZE; i++)
pixel_buff[i] = rand() % PIXEL_MAX;
+
+ /* [0] --- Random values
+ * [1] --- Minimum
+ * [2] --- Maximum */
+ for (int i = 0; i < BUFFSIZE; i++)
+ {
+ pixel_test_buff[0][i] = rand() % PIXEL_MAX;
+ pixel_test_buff[1][i] = PIXEL_MIN;
+ pixel_test_buff[2][i] = PIXEL_MAX;
+ }
}
bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
@@ -177,6 +187,27 @@
return true;
}
+bool IntraPredHarness::check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt) // runs ref and opt on identical inputs; returns true only if all 100 trials produce identical output buffers
+{
+ memset(pixel_out_c, 0, 64 * 64 * sizeof(pixel)); // zero both outputs so bytes neither function writes still compare equal
+ memset(pixel_out_vec, 0, 64 * 64 * sizeof(pixel));
+ int j = 0; // offset into the chosen test buffer, advanced by FENC_STRIDE per trial
+
+ for (int i = 0; i < 100; i++)
+ {
+ int index = rand() % TEST_CASES; // pick one of the pixel_test_buff rows (filled by the constructor as random / PIXEL_MIN / PIXEL_MAX)
+
+ ref(pixel_test_buff[index] + j, pixel_out_c);
+ checked(opt, pixel_test_buff[index] + j, pixel_out_vec); // NOTE(review): checked() is a harness helper not shown here; presumably it validates the call into opt -- confirm
+
+ if (memcmp(pixel_out_c, pixel_out_vec, 64 * 64 * sizeof(pixel))) // whole-buffer compare, so a stray write by either side is also a mismatch
+ return false; // returns before reportfail() is reached for this trial
+
+ reportfail(); // NOTE(review): harness helper not shown here; presumably reports a failure recorded by checked() -- confirm
+ j += FENC_STRIDE;
+ }
+ return true;
+}
bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++)
@@ -213,6 +244,14 @@
return false;
}
}
+ if (opt.cu[i].intra_filter)
+ {
+ if (!check_intra_filter_primitive(ref.cu[i].intra_filter, opt.cu[i].intra_filter))
+ {
+ printf("intra_filter_%dx%d failed\n", size, size);
+ return false;
+ }
+ }
}
return true;
@@ -268,5 +307,10 @@
pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, mode, bFilter);
}
}
+ if (opt.cu[i].intra_filter)
+ {
+ printf("intra_filter_%dx%d", size, size);
+ REPORT_SPEEDUP(opt.cu[i].intra_filter, ref.cu[i].intra_filter, pixel_buff, pixel_out_c);
+ }
}
}
diff -r d64227e54233 -r 44b574b61b29 source/test/intrapredharness.h
--- a/source/test/intrapredharness.h Thu Jun 25 16:25:51 2015 +0530
+++ b/source/test/intrapredharness.h Fri Jun 26 18:21:07 2015 +0530
@@ -34,7 +34,15 @@
enum { INPUT_SIZE = 4 * 65 * 65 * 100 };
enum { OUTPUT_SIZE = 64 * FENC_STRIDE };
enum { OUTPUT_SIZE_33 = 33 * OUTPUT_SIZE };
+ enum { TEST_CASES = 3 };
+ enum { INCR = 32 };
+ enum { STRIDE = 64 };
+ enum { ITERS = 100 };
+ enum { MAX_HEIGHT = 64 };
+ enum { PAD_ROWS = 64 };
+ enum { BUFFSIZE = STRIDE * (MAX_HEIGHT + PAD_ROWS) + INCR * ITERS };
+ pixel pixel_test_buff[TEST_CASES][BUFFSIZE];
ALIGN_VAR_16(pixel, pixel_buff[INPUT_SIZE]);
pixel pixel_out_c[OUTPUT_SIZE];
pixel pixel_out_vec[OUTPUT_SIZE];
@@ -45,6 +53,7 @@
bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
bool check_angular_primitive(const intra_pred_t ref[], const intra_pred_t opt[], int size);
bool check_allangs_primitive(const intra_allangs_t ref, const intra_allangs_t opt, int size);
+ bool check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt);
public:
More information about the x265-devel
mailing list