[x265] [PATCH] testbench: added new optimized c primitive for psyCost_ss, suitable to write asm code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Mon Dec 15 10:41:13 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1418636427 -19800
# Mon Dec 15 15:10:27 2014 +0530
# Node ID afe83786dadbd923df476dcb4256fc7a0aead2a4
# Parent ff352d647f4b3a8f0c249fc7a8f4eb3645aaa974
testbench: added new optimized c primitive for psyCost_ss, suitable to write asm code
In the new primitive, sa8d_8x8 and sad_8x8 are combined to avoid redundant loads, and the unnecessary zeroBuffer is removed.
The testbench checks the new C code against the old C code for correctness.
diff -r ff352d647f4b -r afe83786dadb source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Dec 15 14:16:25 2014 +0530
+++ b/source/common/pixel.cpp Mon Dec 15 15:10:27 2014 +0530
@@ -945,6 +945,121 @@
}
}
+/* Computes one energy value per 8x8 tile of a dim x dim block of int16_t samples.
+ * For every tile the rounded Hadamard-based sum ((sum + 2) >> 2) has a quarter of
+ * the tile's sum of absolute sample values (sum1 >> 2) subtracted from it. Per the
+ * patch description this merges the sa8d_8x8 and sad_8x8 passes so that each
+ * sample is loaded only once.
+ *
+ * src    - top-left sample of the block
+ * stride - distance, in samples, between vertically adjacent rows of src
+ * energy - output array; one value is written per 8x8 tile, tiles visited
+ *          left to right within a tile row, tile rows from top to bottom
+ * dim    - width and height of the block; both loops advance in steps of 8 */
+void psy_acEnergy_ss_8x8(const int16_t* src, intptr_t stride, int* energy, int dim)
+{
+ int n = 0; // next free slot in energy[]
+ const int16_t* tmpSrc = src; // keeps the block origin; src is re-pointed per tile
+
+ for (int k = 0; k < dim; k += 8) // k: row offset of the current tile
+ {
+ for (int j = 0; j < dim; j += 8) // j: column offset of the current tile
+ {
+ src = tmpSrc + k * stride + j;
+ ssum2_t tmp[8][4]; // row-pass results: 8 rows x 4 packed values
+ ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
+ ssum2_t sum = 0, sum1 = 0; // sum: transform magnitude total, sum1: total of |sample|
+
+ // Row pass: walk the 8 rows of the tile, accumulating |sample| into sum1
+ // while the same loaded values feed the transform.
+ for (int i = 0; i < 8; i++, src += stride)
+ {
+ a0 = src[0];
+ a1 = src[1];
+ sum1 += abs(a0) + abs(a1);
+ // Pack a pair: the sum sits in the low part, the difference is shifted
+ // up by BITS_PER_SUM so both travel in a single ssum2_t.
+ b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
+ a2 = src[2];
+ a3 = src[3];
+ sum1 += abs(a2) + abs(a3);
+ b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+ a4 = src[4];
+ a5 = src[5];
+ sum1 += abs(a4) + abs(a5);
+ b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
+ a6 = src[6];
+ a7 = src[7];
+ sum1 += abs(a6) + abs(a7);
+ b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
+ // HADAMARD4 is defined outside this hunk; presumably a 4-point Hadamard
+ // butterfly that writes its first four arguments from the last four.
+ HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
+ }
+
+ // Column pass: each of the 4 packed columns is combined over all 8 rows.
+ for (int i = 0; i < 4; i++)
+ {
+ HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+ HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
+ // abs2 is defined outside this hunk; presumably a per-lane absolute
+ // value on the packed representation -- TODO confirm.
+ b0 = abs2(a0 + a4) + abs2(a0 - a4);
+ b0 += abs2(a1 + a5) + abs2(a1 - a5);
+ b0 += abs2(a2 + a6) + abs2(a2 - a6);
+ b0 += abs2(a3 + a7) + abs2(a3 - a7);
+ // Unpack: the cast keeps the low part, the shift brings down the high part.
+ sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
+ }
+
+ sum = (int)((sum + 2) >> 2); // divide by 4 with rounding
+ sum1 >>= 2; // divide by 4, truncating
+
+ energy[n++] = (sum - sum1);
+ }
+ }
+}
+
+/* Returns the energy value of a single 4x4 block of int16_t samples: the
+ * Hadamard-based sum halved (sum >> 1) minus a quarter of the sum of absolute
+ * sample values (sum1 >> 2).
+ *
+ * source  - top-left sample of the 4x4 block
+ * sstride - distance, in samples, between consecutive rows of source */
+int psy_acEnergy_ss_4x4(const int16_t* source, intptr_t sstride)
+{
+ ssum2_t tmp[4][2]; // row-pass results: 4 rows x 2 packed values
+ ssum2_t a0, a1, a2, a3, b0, b1;
+ ssum2_t sum = 0; // transform magnitude total
+ ssum2_t sum1 = 0; // total of |sample|
+
+ // Row pass: accumulate |sample| into sum1 and transform each row from the
+ // same four loaded values.
+ for (int i = 0; i < 4; i++, source += sstride)
+ {
+ a0 = source[0];
+ a1 = source[1];
+ a2 = source[2];
+ a3 = source[3];
+ sum1 += abs(a0) + abs(a1) + abs(a2) + abs(a3);
+ // Pack a pair: the sum sits in the low part, the difference is shifted up
+ // by BITS_PER_SUM so both travel in a single ssum2_t.
+ b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
+ b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+ tmp[i][0] = b0 + b1;
+ tmp[i][1] = b0 - b1;
+ }
+
+ // Column pass over the 2 packed columns.
+ for (int i = 0; i < 2; i++)
+ {
+ // HADAMARD4 and abs2 are defined outside this hunk; presumably a 4-point
+ // Hadamard butterfly and a per-lane absolute value -- TODO confirm.
+ HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+ a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+ // Unpack: the cast keeps the low part, the shift brings down the high part.
+ sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
+ }
+
+ sum = (int)(sum >> 1); // halve, truncating
+ return (int)(sum - (sum1 >> 2));
+}
+
+/* Cost between a source block and a reconstructed block of int16_t samples,
+ * expressed as the summed absolute difference of their energy values.
+ *
+ * size    - square block index; 0 selects the 4x4 path, any other value a
+ *           block of (1 << (size + 2)) samples per side processed in 8x8 tiles
+ * source  - top-left sample of the source block, rows sstride samples apart
+ * recon   - top-left sample of the reconstructed block, rows rstride samples apart
+ *
+ * Returns the accumulated absolute energy difference. */
+template<int size>
+int psyCost_ss_opt(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
+{
+    if (!size)
+    {
+        /* 4x4 is too small for sa8d, so it has its own energy helper */
+        const int srcEnergy = psy_acEnergy_ss_4x4(source, sstride);
+        const int recEnergy = psy_acEnergy_ss_4x4(recon, rstride);
+        return abs(srcEnergy - recEnergy);
+    }
+    else
+    {
+        const int width = 1 << (size + 2);
+        /* number of 8x8 tiles inside a width x width block */
+        const int tileCount = width >> (4 - size);
+
+        /* 64 entries: enough for the 8x8 tiles of a 64x64 block */
+        int srcEnergy[64];
+        int recEnergy[64];
+
+        /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
+        psy_acEnergy_ss_8x8(source, sstride, srcEnergy, width);
+        psy_acEnergy_ss_8x8(recon, rstride, recEnergy, width);
+
+        uint32_t cost = 0;
+        for (int tile = 0; tile < tileCount; tile++)
+            cost += abs(srcEnergy[tile] - recEnergy[tile]);
+
+        return cost;
+    }
+}
+
template<int bx, int by>
void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
@@ -1421,6 +1536,13 @@
p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
+ // psy_cost_ss[NUM_SQUARE_BLOCKS to NUM_SQUARE_BLOCKS*2] new optimized C code function assignment
+ p.psy_cost_ss[BLOCK_4x4 + NUM_SQUARE_BLOCKS] = psyCost_ss_opt<BLOCK_4x4>;
+ p.psy_cost_ss[BLOCK_8x8 + NUM_SQUARE_BLOCKS] = psyCost_ss_opt<BLOCK_8x8>;
+ p.psy_cost_ss[BLOCK_16x16 + NUM_SQUARE_BLOCKS] = psyCost_ss_opt<BLOCK_16x16>;
+ p.psy_cost_ss[BLOCK_32x32 + NUM_SQUARE_BLOCKS] = psyCost_ss_opt<BLOCK_32x32>;
+ p.psy_cost_ss[BLOCK_64x64 + NUM_SQUARE_BLOCKS] = psyCost_ss_opt<BLOCK_64x64>;
+
p.sa8d_inter[LUMA_4x4] = satd_4x4;
p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
p.sa8d_inter[LUMA_8x4] = satd_8x4;
diff -r ff352d647f4b -r afe83786dadb source/common/primitives.h
--- a/source/common/primitives.h Mon Dec 15 14:16:25 2014 +0530
+++ b/source/common/primitives.h Mon Dec 15 15:10:27 2014 +0530
@@ -216,7 +216,7 @@
// psy_cost_pp[NUM_SQUARE_BLOCKS to NUM_SQUARE_BLOCKS*2] new optmized c code function assignment
// TO-DO: once asm code completed, we can remove * 2
pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS * 2]; // difference in AC energy between two blocks
- pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
+ pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS * 2];
dct_t dct[NUM_DCTS];
idct_t idct[NUM_IDCTS];
diff -r ff352d647f4b -r afe83786dadb source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Dec 15 14:16:25 2014 +0530
+++ b/source/test/pixelharness.cpp Mon Dec 15 15:10:27 2014 +0530
@@ -971,6 +971,29 @@
return true;
}
+/* Runs ref and opt on the same randomly chosen pairs of short_test_buff
+ * entries for ITERS iterations and compares their results.
+ * Returns false at the first mismatch, true when every iteration agrees. */
+bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
+{
+    intptr_t bufStride = STRIDE;
+
+    /* offset moves the start position inside the buffers by INCR per iteration */
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        int srcIdx = rand() % TEST_CASES;
+        int recIdx = rand() % TEST_CASES;
+
+        /* opt goes through the harness' checked() wrapper, ref is called directly */
+        int optCost = (int)checked(opt, short_test_buff[srcIdx] + offset, bufStride, short_test_buff[recIdx] + offset, bufStride);
+        int refCost = ref(short_test_buff[srcIdx] + offset, bufStride, short_test_buff[recIdx] + offset, bufStride);
+
+        if (optCost != refCost)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.satd[part])
@@ -1322,6 +1345,15 @@
return false;
}
}
+
+ if (ref.psy_cost_ss[i])
+ {
+ if (!check_psyCost_ss(ref.psy_cost_ss[i], ref.psy_cost_ss[NUM_SQUARE_BLOCKS + i]))
+ {
+ printf("\npsyCost_ss[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
}
if (opt.weight_pp)
diff -r ff352d647f4b -r afe83786dadb source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Dec 15 14:16:25 2014 +0530
+++ b/source/test/pixelharness.h Mon Dec 15 15:10:27 2014 +0530
@@ -93,6 +93,7 @@
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
+ bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
public:
More information about the x265-devel
mailing list