[x265] [PATCH] psyCost_pp: C code optimization, suitable for ASM conversion
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Fri Dec 12 12:05:17 CET 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1418382296 -19800
# Node ID d299428772c54533124720ee3102a3b8fa1e379f
# Parent b1c2ef980dfe59c486454a8838c2c1bb74bf4d32
psyCost_pp: C code optimization, suitable for ASM conversion
psy_acEnergy_pp_4x4 is a combined version of satd_4x4 and sad<4, 4>, eliminating unnecessary load operations and the extra arguments (zeroBuf and 0).
It also replaces the SAD operation with a low-cost sum operation, as the second pixel buffer is zero.
diff -r b1c2ef980dfe -r d299428772c5 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Dec 11 16:52:06 2014 -0600
+++ b/source/common/pixel.cpp Fri Dec 12 16:34:56 2014 +0530
@@ -801,6 +801,37 @@
#pragma warning(disable: 4127) // conditional expression is constant
#endif
+int psy_acEnergy_pp_4x4(const pixel* pix1, intptr_t stride_pix1) /* Energy measure of one 4x4 pixel block.
+ * Reads four rows of four pixels starting at pix1, with consecutive rows stride_pix1 elements apart,
+ * and returns (transformed absolute sum >> 1) - (plain pixel sum >> 2).
+ * It stands in for the expression
+ *     satd_4x4(p, s, zeroBuf, 0) - (sad<4, 4>(p, s, zeroBuf, 0) >> 2)
+ * that psyCost_pp used for 4x4 blocks, without needing the zero buffer or a second pass over the pixels. */
+{
+ sum2_t tmp[4][2]; // per-row results of the first loop, consumed column-wise by the second loop
+ sum2_t a0, a1, a2, a3, b0, b1;
+ sum2_t sum = 0; // running total of the absolute transformed values
+ sum2_t sum1 = 0; // running total of the 16 raw pixel values
+
+ for (int i = 0; i < 4; i++, pix1 += stride_pix1) // one pixel row per iteration
+ {
+ a0 = pix1[0];
+ a1 = pix1[1];
+ a2 = pix1[2];
+ a3 = pix1[3];
+ sum1 = sum1 + (a0 + a1 + a2 + a3); // row total; replaces the SAD against an all-zero buffer
+ b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); // pair sum plus pair difference shifted up by BITS_PER_SUM, carried in one sum2_t
+ b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); // same packing for the second pixel pair
+ tmp[i][0] = b0 + b1;
+ tmp[i][1] = b0 - b1;
+ }
+
+ for (int i = 0; i < 2; i++) // combine the four rows, one packed column at a time
+ {
+ HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); // macro defined elsewhere; presumably a 4-point Hadamard butterfly writing a0..a3 - confirm
+ a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); // abs2 is defined elsewhere; presumably absolute value of each packed field - confirm
+ sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); // add the part that fits in sum_t and the part stored above BITS_PER_SUM
+ }
+
+ sum = (int)(sum >> 1); // halve the total; the int value is converted back to sum2_t by the assignment
+ return (int)(sum - (sum1 >> 2)); // subtract a quarter of the raw pixel sum
+}
+
template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
@@ -827,9 +858,9 @@
}
else
{
- /* 4x4 is too small for sa8d */
- int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
- int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
+ /* 4x4 is too small for 8x8 */
+ int sourceEnergy = psy_acEnergy_pp_4x4(source, sstride);
+ int reconEnergy = psy_acEnergy_pp_4x4(recon, rstride);
return abs(sourceEnergy - reconEnergy);
}
}
More information about the x265-devel
mailing list