[x265] [PATCH] sse: fix data type for sse functions

Wed Sep 16 11:11:29 CEST 2015

# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1442392516 -19800
#      Wed Sep 16 14:05:16 2015 +0530
# Node ID ee39c10fd573444710aca40ec3bd572aac8b3cd0
# Parent  365f7ed4d89628d49cd6af8d81d4edc01f73ffad
sse: fix data type for sse functions

diff -r 365f7ed4d896 -r ee39c10fd573 source/common/common.h

--- a/source/common/common.h	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/common.h	Wed Sep 16 14:05:16 2015 +0530
@@ -134,12 +134,6 @@
 typedef int32_t  ssum2_t; // Signed sum
 #endif // if HIGH_BIT_DEPTH
 
-#if X265_DEPTH <= 10
-typedef uint32_t sse_ret_t;
-#else
-typedef uint64_t sse_ret_t;
-#endif
-
 #ifndef NULL
 #define NULL 0
 #endif
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/pixel.cpp	Wed Sep 16 14:05:16 2015 +0530
@@ -117,9 +117,9 @@
 }
 
 template<int lx, int ly, class T1, class T2>
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+uint64_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
-    sse_ret_t sum = 0;
+    uint64_t sum = 0;
     int tmp;
 
     for (int y = 0; y < ly; y++)
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/primitives.h
--- a/source/common/primitives.h	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/primitives.h	Wed Sep 16 14:05:16 2015 +0530
@@ -112,8 +112,8 @@
 
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef uint64_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef uint64_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
 typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
diff -r 365f7ed4d896 -r ee39c10fd573 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/pixel.h	Wed Sep 16 14:05:16 2015 +0530
@@ -37,7 +37,7 @@
 pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 #define DECL_PIXELS(cpu) \
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_PU(uint64_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
@@ -46,7 +46,7 @@
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(uint64_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/rdcost.h
--- a/source/encoder/rdcost.h	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/encoder/rdcost.h	Wed Sep 16 14:05:16 2015 +0530
@@ -88,15 +88,10 @@
         m_lambda = (uint64_t)floor(256.0 * lambda);
     }
 
-    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
+    inline uint64_t calcRdCost(uint64_t distortion, uint32_t bits) const
     {
-#if X265_DEPTH <= 10
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
-                   "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
-#else
-        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
-            "calcRdCost wrap detected dist: "X265_LL", bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
-#endif
+                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n", distortion, bits, m_lambda2);
         return distortion + ((bits * m_lambda2 + 128) >> 8);
     }
 
@@ -113,7 +108,7 @@
     }
 
     /* return the RD cost of this prediction, including the effect of psy-rd */
-    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
+    inline uint64_t calcPsyRdCost(uint64_t distortion, uint32_t bits, uint32_t psycost) const
     {
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
     }
@@ -125,11 +120,11 @@
         return sadCost + ((bits * m_lambda + 128) >> 8);
     }
 
-    inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
+    inline uint64_t scaleChromaDist(uint32_t plane, uint64_t dist) const
     {
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
-                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
+                   "scaleChromaDist wrap detected dist: " X265_LL ", lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
-        return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
+        return (uint64_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
     }
 
     inline uint32_t getCost(uint32_t bits) const
diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/encoder/search.cpp	Wed Sep 16 14:05:16 2015 +0530
@@ -531,7 +531,7 @@
             // no residual coded, recon = pred
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
 
-        uint32_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
+        uint64_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
 
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -800,7 +800,7 @@
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
-    uint32_t outDist = 0;
+    uint64_t outDist = 0;
 
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
@@ -960,7 +960,7 @@
                     primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
-                uint32_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
+                uint64_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
                 tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
 
                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
@@ -2549,7 +2549,7 @@
     uint32_t tqBypass = cu.m_tqBypass[0];
     if (!tqBypass)
     {
-        uint32_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
+        uint64_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
         cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
         cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
 
@@ -2620,8 +2620,8 @@
         reconYuv->copyFromYuv(*predYuv);
 
     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
-    uint32_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
-    uint32_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+    uint64_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    uint64_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
     bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
     if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
@@ -2879,7 +2879,7 @@
 
             // non-zero cost calculation for luma - This is an approximation
             // finally we have to encode correct cbf after comparing with null cost
-            const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+            const uint64_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
             uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
             uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
             if (m_rdCost.m_psyRd)
@@ -2977,9 +2977,9 @@
 
                         // non-zero cost calculation for luma, same as luma - This is an approximation
                         // finally we have to encode correct cbf after comparing with null cost
-                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+                        uint64_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
                         uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
-                        uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+                        uint64_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
                         uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
                         if (m_rdCost.m_psyRd)
                         {
@@ -3039,7 +3039,7 @@
 
         if (checkTransformSkipY)
         {
-            uint32_t nonZeroDistY = 0;
+            uint64_t nonZeroDistY = 0;
             uint32_t nonZeroPsyEnergyY = 0;
             uint64_t singleCostY = MAX_INT64;
 
@@ -3092,8 +3092,8 @@
 
         if (bCodeChroma && checkTransformSkipC)
         {
-            uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
-            uint64_t singleCostC = MAX_INT64;
+            uint32_t nonZeroPsyEnergyC = 0;
+            uint64_t singleCostC = MAX_INT64, nonZeroDistC = 0;
             uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
 
@@ -3131,7 +3131,7 @@
 
                         m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
-                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
+                        uint64_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
                         nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
                         if (m_rdCost.m_psyRd)
                         {
diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/search.h
--- a/source/encoder/search.h	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/encoder/search.h	Wed Sep 16 14:05:16 2015 +0530
@@ -107,12 +107,12 @@
 
     uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
     uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
+    uint64_t   resEnergy;  // sum of partition residual energy after motion prediction
+    uint64_t   lumaDistortion;
+    uint64_t   chromaDistortion;
+    uint64_t   distortion; // sum of partition SSE distortion
     uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
     uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
-    sse_ret_t  resEnergy;  // sum of partition residual energy after motion prediction
-    sse_ret_t  lumaDistortion;
-    sse_ret_t  chromaDistortion;
-    sse_ret_t  distortion; // sum of partition SSE distortion
     uint32_t   totalBits;  // sum of partition bits (mv + coeff)
     uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
     uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
@@ -137,19 +137,12 @@
         /* set costs to invalid data, catch uninitialized re-use */
         rdCost = UINT64_MAX / 2;
         sa8dCost = UINT64_MAX / 2;
-        sa8dBits = MAX_UINT / 2;
-        psyEnergy = MAX_UINT / 2;
-#if X265_DEPTH <= 10
-        resEnergy = MAX_UINT / 2;
-        lumaDistortion = MAX_UINT / 2;
-        chromaDistortion = MAX_UINT / 2;
-        distortion = MAX_UINT / 2;
-#else
         resEnergy = UINT64_MAX / 2;
         lumaDistortion = UINT64_MAX / 2;
         chromaDistortion = UINT64_MAX / 2;
         distortion = UINT64_MAX / 2;
-#endif
+        sa8dBits = MAX_UINT / 2;
+        psyEnergy = MAX_UINT / 2;
         totalBits = MAX_UINT / 2;
         mvBits = MAX_UINT / 2;
         coeffBits = MAX_UINT / 2;
@@ -157,31 +150,17 @@
 
     bool ok() const
     {
-#if X265_DEPTH <= 10
-        return !(rdCost >= UINT64_MAX / 2 ||
-            sa8dCost >= UINT64_MAX / 2 ||
-            sa8dBits >= MAX_UINT / 2 ||
-            psyEnergy >= MAX_UINT / 2 ||
-            resEnergy >= MAX_UINT / 2 ||
-            lumaDistortion >= MAX_UINT / 2 ||
-            chromaDistortion >= MAX_UINT / 2 ||
-            distortion >= MAX_UINT / 2 ||
-            totalBits >= MAX_UINT / 2 ||
-            mvBits >= MAX_UINT / 2 ||
-            coeffBits >= MAX_UINT / 2);
-#else
         return !(rdCost >= UINT64_MAX / 2 ||
                  sa8dCost >= UINT64_MAX / 2 ||
-                 sa8dBits >= MAX_UINT / 2 ||
-                 psyEnergy >= MAX_UINT / 2 ||
                  resEnergy >= UINT64_MAX / 2 ||
                  lumaDistortion >= UINT64_MAX / 2 ||
                  chromaDistortion >= UINT64_MAX / 2 ||
                  distortion >= UINT64_MAX / 2 ||
+                 sa8dBits >= MAX_UINT / 2 ||
+                 psyEnergy >= MAX_UINT / 2 ||
                  totalBits >= MAX_UINT / 2 ||
                  mvBits >= MAX_UINT / 2 ||
                  coeffBits >= MAX_UINT / 2);
-#endif
     }
 
     void addSubCosts(const Mode& subMode)
@@ -416,8 +395,8 @@
     struct Cost
     {
         uint64_t rdcost;
+        uint64_t distortion;
         uint32_t bits;
-        uint32_t distortion;
         uint32_t energy;
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
diff -r 365f7ed4d896 -r ee39c10fd573 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/test/pixelharness.cpp	Wed Sep 16 14:05:16 2015 +0530
@@ -103,8 +103,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        uint64_t vres = (uint64_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        uint64_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -124,8 +124,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        uint64_t vres = (uint64_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        uint64_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;