[x265] [PATCH] sse: fix data type for sse functions

Wed Sep 16 20:44:36 CEST 2015

On 09/16, Divya Manivannan wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1442392516 -19800
> #      Wed Sep 16 14:05:16 2015 +0530
> # Node ID ee39c10fd573444710aca40ec3bd572aac8b3cd0
> # Parent  365f7ed4d89628d49cd6af8d81d4edc01f73ffad
> sse: fix data type for sse functions

I meant to change only the data types of the counters in class Mode in
search.h (which must have higher dynamic range than just the return
value of one SSE call). And not to change the SSE function return values
themselves, but see below

> diff -r 365f7ed4d896 -r ee39c10fd573 source/common/common.h
> --- a/source/common/common.h	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/common/common.h	Wed Sep 16 14:05:16 2015 +0530
> @@ -134,12 +134,6 @@
>  typedef int32_t  ssum2_t; // Signed sum
>  #endif // if HIGH_BIT_DEPTH
>  
> -#if X265_DEPTH <= 10
> -typedef uint32_t sse_ret_t;
> -#else
> -typedef uint64_t sse_ret_t;
> -#endif
> -
>  #ifndef NULL
>  #define NULL 0
>  #endif
> diff -r 365f7ed4d896 -r ee39c10fd573 source/common/pixel.cpp
> --- a/source/common/pixel.cpp	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/common/pixel.cpp	Wed Sep 16 14:05:16 2015 +0530
> @@ -117,9 +117,9 @@
>  }
>  
>  template<int lx, int ly, class T1, class T2>
> -sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
> +uint64_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
>  {
> -    sse_ret_t sum = 0;
> +    uint64_t sum = 0;
>      int tmp;
>  
>      for (int y = 0; y < ly; y++)
> diff -r 365f7ed4d896 -r ee39c10fd573 source/common/primitives.h
> --- a/source/common/primitives.h	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/common/primitives.h	Wed Sep 16 14:05:16 2015 +0530
> @@ -112,8 +112,8 @@
>  
>  typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
>  typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
> -typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
> -typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
> +typedef uint64_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
> +typedef uint64_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
>  typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
>  typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
>  typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> diff -r 365f7ed4d896 -r ee39c10fd573 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/common/x86/pixel.h	Wed Sep 16 14:05:16 2015 +0530
> @@ -37,7 +37,7 @@
>  pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
>  
>  #define DECL_PIXELS(cpu) \
> -    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
> +    FUNCDEF_PU(uint64_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>      FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>      FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
>      FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
> @@ -46,7 +46,7 @@
>      FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
>      FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>      FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
> -    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
> +    FUNCDEF_CHROMA_PU(uint64_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
>      FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
>      FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
>      FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
> diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/rdcost.h
> --- a/source/encoder/rdcost.h	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/encoder/rdcost.h	Wed Sep 16 14:05:16 2015 +0530
> @@ -88,15 +88,10 @@
>          m_lambda = (uint64_t)floor(256.0 * lambda);
>      }
>  
> -    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
> +    inline uint64_t calcRdCost(uint64_t distortion, uint32_t bits) const
>      {
> -#if X265_DEPTH <= 10
>          X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
> -                   "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
> -#else
> -        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
> -            "calcRdCost wrap detected dist: "X265_LL", bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
> -#endif
> +                   "calcRdCost wrap detected dist: "X265_LL", bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
>          return distortion + ((bits * m_lambda2 + 128) >> 8);
>      }
>  
> @@ -113,7 +108,7 @@
>      }
>  
>      /* return the RD cost of this prediction, including the effect of psy-rd */
> -    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
> +    inline uint64_t calcPsyRdCost(uint64_t distortion, uint32_t bits, uint32_t psycost) const
>      {
>          return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
>      }
> @@ -125,11 +120,11 @@
>          return sadCost + ((bits * m_lambda + 128) >> 8);
>      }
>  
> -    inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
> +    inline uint64_t scaleChromaDist(uint32_t plane, uint64_t dist) const
>      {
>          X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
>                     "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
> -        return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
> +        return (uint64_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
>      }
>  
>      inline uint32_t getCost(uint32_t bits) const
> diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/encoder/search.cpp	Wed Sep 16 14:05:16 2015 +0530
> @@ -531,7 +531,7 @@
>              // no residual coded, recon = pred
>              primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
>  
> -        uint32_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
> +        uint64_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
>  
>          cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
>          cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
> @@ -800,7 +800,7 @@
>      uint32_t qtLayer = log2TrSize - 2;
>      uint32_t stride = mode.fencYuv->m_csize;
>      const uint32_t sizeIdxC = log2TrSizeC - 2;
> -    uint32_t outDist = 0;
> +    uint64_t outDist = 0;
>  
>      uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
>      const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
> @@ -960,7 +960,7 @@
>                      primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
>                      cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
>                  }
> -                uint32_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
> +                uint64_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
>                  tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
>  
>                  cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
> @@ -2549,7 +2549,7 @@
>      uint32_t tqBypass = cu.m_tqBypass[0];
>      if (!tqBypass)
>      {
> -        uint32_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
> +        uint64_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
>          cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
>          cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
>  
> @@ -2620,8 +2620,8 @@
>          reconYuv->copyFromYuv(*predYuv);
>  
>      // update with clipped distortion and cost (qp estimation loop uses unclipped values)
> -    uint32_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
> -    uint32_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
> +    uint64_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
> +    uint64_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
>      bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
>      if (m_rdCost.m_psyRd)
>          interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
> @@ -2879,7 +2879,7 @@
>  
>              // non-zero cost calculation for luma - This is an approximation
>              // finally we have to encode correct cbf after comparing with null cost
> -            const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
> +            const uint64_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
>              uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
>              uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
>              if (m_rdCost.m_psyRd)
> @@ -2977,9 +2977,9 @@
>  
>                          // non-zero cost calculation for luma, same as luma - This is an approximation
>                          // finally we have to encode correct cbf after comparing with null cost
> -                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
> +                        uint64_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
>                          uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
> -                        uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
> +                        uint64_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
>                          uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
>                          if (m_rdCost.m_psyRd)
>                          {
> @@ -3039,7 +3039,7 @@
>  
>          if (checkTransformSkipY)
>          {
> -            uint32_t nonZeroDistY = 0;
> +            uint64_t nonZeroDistY = 0;
>              uint32_t nonZeroPsyEnergyY = 0;
>              uint64_t singleCostY = MAX_INT64;
>  
> @@ -3092,8 +3092,8 @@
>  
>          if (bCodeChroma && checkTransformSkipC)
>          {
> -            uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
> -            uint64_t singleCostC = MAX_INT64;
> +            uint32_t nonZeroPsyEnergyC = 0;
> +            uint64_t singleCostC = MAX_INT64, nonZeroDistC = 0;
>              uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
>              uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
>  
> @@ -3131,7 +3131,7 @@
>  
>                          m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
>                                                  log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
> -                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
> +                        uint64_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);

but yes in general we should not be using uint32_t anywhere to hold the
distortion returned from SSE primitives.

it should be sse_ret_t if the returned value is not used in subsequent
math that might bust the dynamic range, or uint64_t if it does. I would
be ok with making them all uint64_t just to be safe.

>                          nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
>                          if (m_rdCost.m_psyRd)
>                          {
> diff -r 365f7ed4d896 -r ee39c10fd573 source/encoder/search.h
> --- a/source/encoder/search.h	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/encoder/search.h	Wed Sep 16 14:05:16 2015 +0530
> @@ -107,12 +107,12 @@
>  
>      uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
>      uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
> +    uint64_t   resEnergy;  // sum of partition residual energy after motion prediction
> +    uint64_t   lumaDistortion;
> +    uint64_t   chromaDistortion;
> +    uint64_t   distortion; // sum of partition SSE distortion
>      uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
>      uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
> -    sse_ret_t  resEnergy;  // sum of partition residual energy after motion prediction
> -    sse_ret_t  lumaDistortion;
> -    sse_ret_t  chromaDistortion;
> -    sse_ret_t  distortion; // sum of partition SSE distortion
>      uint32_t   totalBits;  // sum of partition bits (mv + coeff)
>      uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
>      uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
> @@ -137,19 +137,12 @@
>          /* set costs to invalid data, catch uninitialized re-use */
>          rdCost = UINT64_MAX / 2;
>          sa8dCost = UINT64_MAX / 2;
> -        sa8dBits = MAX_UINT / 2;
> -        psyEnergy = MAX_UINT / 2;
> -#if X265_DEPTH <= 10
> -        resEnergy = MAX_UINT / 2;
> -        lumaDistortion = MAX_UINT / 2;
> -        chromaDistortion = MAX_UINT / 2;
> -        distortion = MAX_UINT / 2;
> -#else
>          resEnergy = UINT64_MAX / 2;
>          lumaDistortion = UINT64_MAX / 2;
>          chromaDistortion = UINT64_MAX / 2;
>          distortion = UINT64_MAX / 2;
> -#endif
> +        sa8dBits = MAX_UINT / 2;
> +        psyEnergy = MAX_UINT / 2;
>          totalBits = MAX_UINT / 2;
>          mvBits = MAX_UINT / 2;
>          coeffBits = MAX_UINT / 2;
> @@ -157,31 +150,17 @@
>  
>      bool ok() const
>      {
> -#if X265_DEPTH <= 10
> -        return !(rdCost >= UINT64_MAX / 2 ||
> -            sa8dCost >= UINT64_MAX / 2 ||
> -            sa8dBits >= MAX_UINT / 2 ||
> -            psyEnergy >= MAX_UINT / 2 ||
> -            resEnergy >= MAX_UINT / 2 ||
> -            lumaDistortion >= MAX_UINT / 2 ||
> -            chromaDistortion >= MAX_UINT / 2 ||
> -            distortion >= MAX_UINT / 2 ||
> -            totalBits >= MAX_UINT / 2 ||
> -            mvBits >= MAX_UINT / 2 ||
> -            coeffBits >= MAX_UINT / 2);
> -#else
>          return !(rdCost >= UINT64_MAX / 2 ||
>                   sa8dCost >= UINT64_MAX / 2 ||
> -                 sa8dBits >= MAX_UINT / 2 ||
> -                 psyEnergy >= MAX_UINT / 2 ||
>                   resEnergy >= UINT64_MAX / 2 ||
>                   lumaDistortion >= UINT64_MAX / 2 ||
>                   chromaDistortion >= UINT64_MAX / 2 ||
>                   distortion >= UINT64_MAX / 2 ||
> +                 sa8dBits >= MAX_UINT / 2 ||
> +                 psyEnergy >= MAX_UINT / 2 ||
>                   totalBits >= MAX_UINT / 2 ||
>                   mvBits >= MAX_UINT / 2 ||
>                   coeffBits >= MAX_UINT / 2);
> -#endif
>      }
>  
>      void addSubCosts(const Mode& subMode)
> @@ -416,8 +395,8 @@
>      struct Cost
>      {
>          uint64_t rdcost;
> +        uint64_t distortion;
>          uint32_t bits;
> -        uint32_t distortion;
>          uint32_t energy;
>          Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
>      };
> diff -r 365f7ed4d896 -r ee39c10fd573 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Tue Sep 08 16:38:01 2015 +0530
> +++ b/source/test/pixelharness.cpp	Wed Sep 16 14:05:16 2015 +0530
> @@ -103,8 +103,8 @@
>      {
>          int index1 = rand() % TEST_CASES;
>          int index2 = rand() % TEST_CASES;
> -        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
> -        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
> +        uint64_t vres = (uint64_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
> +        uint64_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
>          if (vres != cres)
>              return false;
>  
> @@ -124,8 +124,8 @@
>      {
>          int index1 = rand() % TEST_CASES;
>          int index2 = rand() % TEST_CASES;
> -        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
> -        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
> +        uint64_t vres = (uint64_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
> +        uint64_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
>          if (vres != cres)
>              return false;
>  
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho