[x265] [PATCH] sse: fix data type for sse functions
Divya Manivannan
divya at multicorewareinc.com
Wed Sep 16 11:11:29 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1442392516 -19800
# Wed Sep 16 14:05:16 2015 +0530
# Node ID ee39c10fd573444710aca40ec3bd572aac8b3cd0
# Parent 365f7ed4d89628d49cd6af8d81d4edc01f73ffad
sse: fix data type for sse functions
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/common.h
--- a/source/common/common.h Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/common.h Wed Sep 16 14:05:16 2015 +0530
@@ -134,12 +134,6 @@
typedef int32_t ssum2_t; // Signed sum
#endif // if HIGH_BIT_DEPTH
-#if X265_DEPTH <= 10
-typedef uint32_t sse_ret_t;
-#else
-typedef uint64_t sse_ret_t;
-#endif
-
#ifndef NULL
#define NULL 0
#endif
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/pixel.cpp Wed Sep 16 14:05:16 2015 +0530
@@ -117,9 +117,9 @@
}
template<int lx, int ly, class T1, class T2>
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+uint64_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
- sse_ret_t sum = 0;
+ uint64_t sum = 0;
int tmp;
for (int y = 0; y < ly; y++)
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/primitives.h
--- a/source/common/primitives.h Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/primitives.h Wed Sep 16 14:05:16 2015 +0530
@@ -112,8 +112,8 @@
typedef int (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
typedef int (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef uint64_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef uint64_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/pixel.h Wed Sep 16 14:05:16 2015 +0530
@@ -37,7 +37,7 @@
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
#define DECL_PIXELS(cpu) \
- FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+ FUNCDEF_PU(uint64_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
@@ -46,7 +46,7 @@
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
- FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+ FUNCDEF_CHROMA_PU(uint64_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/rdcost.h
--- a/source/encoder/rdcost.h Tue Sep 08 16:38:01 2015 +0530
+++ b/source/encoder/rdcost.h Wed Sep 16 14:05:16 2015 +0530
@@ -88,15 +88,10 @@
m_lambda = (uint64_t)floor(256.0 * lambda);
}
- inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
+ inline uint64_t calcRdCost(uint64_t distortion, uint32_t bits) const
{
-#if X265_DEPTH <= 10
X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
- "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
-#else
- X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
- "calcRdCost wrap detected dist: "X265_LL", bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
-#endif
+ "calcRdCost wrap detected dist: "X265_LL", bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
return distortion + ((bits * m_lambda2 + 128) >> 8);
}
@@ -113,7 +108,7 @@
}
/* return the RD cost of this prediction, including the effect of psy-rd */
- inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
+ inline uint64_t calcPsyRdCost(uint64_t distortion, uint32_t bits, uint32_t psycost) const
{
return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
}
@@ -125,11 +120,11 @@
return sadCost + ((bits * m_lambda + 128) >> 8);
}
- inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
+ inline uint64_t scaleChromaDist(uint32_t plane, uint64_t dist) const
{
X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
"scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
- return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
+ return (uint64_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
}
inline uint32_t getCost(uint32_t bits) const
diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/search.cpp
--- a/source/encoder/search.cpp Tue Sep 08 16:38:01 2015 +0530
+++ b/source/encoder/search.cpp Wed Sep 16 14:05:16 2015 +0530
@@ -531,7 +531,7 @@
// no residual coded, recon = pred
primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
- uint32_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
+ uint64_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -800,7 +800,7 @@
uint32_t qtLayer = log2TrSize - 2;
uint32_t stride = mode.fencYuv->m_csize;
const uint32_t sizeIdxC = log2TrSizeC - 2;
- uint32_t outDist = 0;
+ uint64_t outDist = 0;
uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
@@ -960,7 +960,7 @@
primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- uint32_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
+ uint64_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
@@ -2549,7 +2549,7 @@
uint32_t tqBypass = cu.m_tqBypass[0];
if (!tqBypass)
{
- uint32_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
+ uint64_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
@@ -2620,8 +2620,8 @@
reconYuv->copyFromYuv(*predYuv);
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
- uint32_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
- uint32_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ uint64_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+ uint64_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
if (m_rdCost.m_psyRd)
interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
@@ -2879,7 +2879,7 @@
// non-zero cost calculation for luma - This is an approximation
// finally we have to encode correct cbf after comparing with null cost
- const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+ const uint64_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
@@ -2977,9 +2977,9 @@
// non-zero cost calculation for luma, same as luma - This is an approximation
// finally we have to encode correct cbf after comparing with null cost
- uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ uint64_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
- uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+ uint64_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
{
@@ -3039,7 +3039,7 @@
if (checkTransformSkipY)
{
- uint32_t nonZeroDistY = 0;
+ uint64_t nonZeroDistY = 0;
uint32_t nonZeroPsyEnergyY = 0;
uint64_t singleCostY = MAX_INT64;
@@ -3092,8 +3092,8 @@
if (bCodeChroma && checkTransformSkipC)
{
- uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
- uint64_t singleCostC = MAX_INT64;
+ uint32_t nonZeroPsyEnergyC = 0;
+ uint64_t singleCostC = MAX_INT64, nonZeroDistC = 0;
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
@@ -3131,7 +3131,7 @@
m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
- uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
+ uint64_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
if (m_rdCost.m_psyRd)
{
diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/search.h
--- a/source/encoder/search.h Tue Sep 08 16:38:01 2015 +0530
+++ b/source/encoder/search.h Wed Sep 16 14:05:16 2015 +0530
@@ -107,12 +107,12 @@
uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits)
uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits)
+ uint64_t resEnergy; // sum of partition residual energy after motion prediction
+ uint64_t lumaDistortion;
+ uint64_t chromaDistortion;
+ uint64_t distortion; // sum of partition SSE distortion
uint32_t sa8dBits; // signal bits used in sa8dCost calculation
uint32_t psyEnergy; // sum of partition psycho-visual energy difference
- sse_ret_t resEnergy; // sum of partition residual energy after motion prediction
- sse_ret_t lumaDistortion;
- sse_ret_t chromaDistortion;
- sse_ret_t distortion; // sum of partition SSE distortion
uint32_t totalBits; // sum of partition bits (mv + coeff)
uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
uint32_t coeffBits; // Texture bits (DCT Coeffs)
@@ -137,19 +137,12 @@
/* set costs to invalid data, catch uninitialized re-use */
rdCost = UINT64_MAX / 2;
sa8dCost = UINT64_MAX / 2;
- sa8dBits = MAX_UINT / 2;
- psyEnergy = MAX_UINT / 2;
-#if X265_DEPTH <= 10
- resEnergy = MAX_UINT / 2;
- lumaDistortion = MAX_UINT / 2;
- chromaDistortion = MAX_UINT / 2;
- distortion = MAX_UINT / 2;
-#else
resEnergy = UINT64_MAX / 2;
lumaDistortion = UINT64_MAX / 2;
chromaDistortion = UINT64_MAX / 2;
distortion = UINT64_MAX / 2;
-#endif
+ sa8dBits = MAX_UINT / 2;
+ psyEnergy = MAX_UINT / 2;
totalBits = MAX_UINT / 2;
mvBits = MAX_UINT / 2;
coeffBits = MAX_UINT / 2;
@@ -157,31 +150,17 @@
bool ok() const
{
-#if X265_DEPTH <= 10
- return !(rdCost >= UINT64_MAX / 2 ||
- sa8dCost >= UINT64_MAX / 2 ||
- sa8dBits >= MAX_UINT / 2 ||
- psyEnergy >= MAX_UINT / 2 ||
- resEnergy >= MAX_UINT / 2 ||
- lumaDistortion >= MAX_UINT / 2 ||
- chromaDistortion >= MAX_UINT / 2 ||
- distortion >= MAX_UINT / 2 ||
- totalBits >= MAX_UINT / 2 ||
- mvBits >= MAX_UINT / 2 ||
- coeffBits >= MAX_UINT / 2);
-#else
return !(rdCost >= UINT64_MAX / 2 ||
sa8dCost >= UINT64_MAX / 2 ||
- sa8dBits >= MAX_UINT / 2 ||
- psyEnergy >= MAX_UINT / 2 ||
resEnergy >= UINT64_MAX / 2 ||
lumaDistortion >= UINT64_MAX / 2 ||
chromaDistortion >= UINT64_MAX / 2 ||
distortion >= UINT64_MAX / 2 ||
+ sa8dBits >= MAX_UINT / 2 ||
+ psyEnergy >= MAX_UINT / 2 ||
totalBits >= MAX_UINT / 2 ||
mvBits >= MAX_UINT / 2 ||
coeffBits >= MAX_UINT / 2);
-#endif
}
void addSubCosts(const Mode& subMode)
@@ -416,8 +395,8 @@
struct Cost
{
uint64_t rdcost;
+ uint64_t distortion;
uint32_t bits;
- uint32_t distortion;
uint32_t energy;
Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
};
diff -r 365f7ed4d896 -r ee39c10fd573 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Sep 08 16:38:01 2015 +0530
+++ b/source/test/pixelharness.cpp Wed Sep 16 14:05:16 2015 +0530
@@ -103,8 +103,8 @@
{
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
- sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+ uint64_t vres = (uint64_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+ uint64_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
if (vres != cres)
return false;
@@ -124,8 +124,8 @@
{
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
- sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+ uint64_t vres = (uint64_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+ uint64_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
if (vres != cres)
return false;
More information about the x265-devel
mailing list