[x265] [PATCH] search: made a function for null cost calculation in xEstimateResidualQT()

Wed Nov 5 21:49:35 CET 2014

On 11/05, ashok at multicorewareinc.com wrote:
> # HG changeset patch
> # User Ashok Kumar Mishra<ashok at multicorewareinc.com>
> # Date 1415184822 -19800
> #      Wed Nov 05 16:23:42 2014 +0530
> # Node ID 18344f74ded0e192bc7177a217e9112c9de31983
> # Parent  2a8f3d5820a6ebe0937ce73fa81154c263df2ae9
> search: made a function for null cost calculation in xEstimateResidualQT()

This patch changes the encoder's outputs quite a bit, and unambiguously
for the better. It seems there was a bug that this patch fixed.

parkjoy medium preset before:

(2.55 fps), 16644.94 kb/s, SSIM Mean Y: 0.8785235 ( 9.155 dB)

after:

(2.52 fps), 16425.97 kb/s, SSIM Mean Y: 0.8801361 ( 9.213 dB)

This needs to be understood; there's a good chance that there are
similar bugs still lurking in this key function.

> diff -r 2a8f3d5820a6 -r 18344f74ded0 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Tue Nov 04 09:46:14 2014 +0530
> +++ b/source/encoder/search.cpp	Wed Nov 05 16:23:42 2014 +0530
> @@ -2714,6 +2714,17 @@
>      }
>  }
>  
> +uint64_t Search::deriveNullCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
> +{
> +    m_entropyCoder.resetBits();
> +    m_entropyCoder.codeQtCbfZero(compId, tuDepth);
> +    const uint32_t nullBits = m_entropyCoder.getNumberOfWrittenBits();
> +    if (m_rdCost.m_psyRd)
> +        return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
> +    else
> +        return m_rdCost.calcRdCost(dist, nullBits);
> +}

FWIW: to avoid the extra function-call overhead, this would be better as
a macro, or at least as an inline function.

Would it be possible to do something like this and get the same costs?

int cbf0Bits[3];
m_entropyCoder.resetBits();
m_entropyCoder.codeQtCbfZero(0, tuDepth);
cbf0Bits[0] = m_entropyCoder.getNumberOfWrittenBits();
m_entropyCoder.codeQtCbfZero(1, tuDepth);
cbf0Bits[1] = m_entropyCoder.getNumberOfWrittenBits() - cbf0Bits[0];
m_entropyCoder.codeQtCbfZero(2, tuDepth);
cbf0Bits[2] = m_entropyCoder.getNumberOfWrittenBits() - (cbf0Bits[0] + cbf0Bits[1]);

Also, see how Entropy::bitsIntraModeMPM() measures bits without changing
the state, so it doesn't require re-loading the contexts. That might be
generally useful in a few places here.

>  void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
>  {
>      CUData& cu = mode.cu;
> @@ -2828,9 +2839,6 @@
>              }
>          }
>  
> -        const uint32_t numCoeffY = 1 << (log2TrSize * 2);
> -        const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
> -
>          X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
>          uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
>          uint32_t psyEnergyY = 0;
> @@ -2861,19 +2869,15 @@
>                      singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
>                  else
>                      singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
> -                m_entropyCoder.resetBits();
> -                m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
> -                const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
> -                uint64_t nullCostY = 0;
> -                if (m_rdCost.m_psyRd)
> -                    nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
> -                else
> -                    nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
> +
> +                uint64_t nullCostY = deriveNullCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
>                  if (nullCostY < singleCostY)
>                  {
>                      cbfFlag[TEXT_LUMA][0] = 0;
>  #if CHECKED_BUILD || _DEBUG
> +                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
>                      memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
> +                    primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
>  #endif
>                      if (checkTransformSkipY)
>                          minCost[TEXT_LUMA][0] = nullCostY;
> @@ -2887,21 +2891,16 @@
>                  }
>              }
>          }
> -        else if (checkTransformSkipY)
> +        else
>          {
> -            m_entropyCoder.resetBits();
> -            m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
> -            const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
> -            if (m_rdCost.m_psyRd)
> -                minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
> -            else
> -                minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
> +            if (checkTransformSkipY)
> +                minCost[TEXT_LUMA][0] = deriveNullCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
> +            primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
>          }
>  
>          singleDistComp[TEXT_LUMA][0] = distY;
>          singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
> -        if (!cbfFlag[TEXT_LUMA][0])
> -            primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
> +

Was it these changes that altered the output?

>          cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
>  
>          if (bCodeChroma)
> @@ -2945,19 +2944,16 @@
>                                  singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
>                              else
>                                  singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
> -                            m_entropyCoder.resetBits();
> -                            m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
> -                            const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
> -                            uint64_t nullCostC = 0;
> -                            if (m_rdCost.m_psyRd)
> -                                nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
> -                            else
> -                                nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
> +
> +                            uint64_t nullCostC = deriveNullCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
> +
>                              if (nullCostC < singleCostC)
>                              {
>                                  cbfFlag[chromaId][tuIterator.section] = 0;
>  #if CHECKED_BUILD || _DEBUG
> +                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
>                                  memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
> +                                primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
>  #endif
>                                  if (checkTransformSkipC)
>                                      minCost[chromaId][tuIterator.section] = nullCostC;
> @@ -2971,23 +2967,16 @@
>                              }
>                          }
>                      }
> -                    else if (checkTransformSkipC)
> +                    else
>                      {
> -                        m_entropyCoder.resetBits();
> -                        m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
> -                        const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
> -                        if (m_rdCost.m_psyRd)
> -                            minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
> -                        else
> -                            minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
> +                        if (checkTransformSkipC)
> +                            minCost[chromaId][tuIterator.section] = deriveNullCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
> +                        primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
>                      }
>  
>                      singleDistComp[chromaId][tuIterator.section] = distC;
>                      singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
>  
> -                    if (!cbfFlag[chromaId][tuIterator.section])
> -                        primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
> -
>                      cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
>                  }
>                  while (tuIterator.isNextSection());
> @@ -3042,6 +3031,7 @@
>                  singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
>                  cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
>                  bestTransformMode[TEXT_LUMA][0] = 1;
> +                uint32_t numCoeffY = 1 << (log2TrSize << 1);
>                  memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
>                  primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
>              }
> @@ -3112,6 +3102,7 @@
>                          singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
>                          cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
>                          bestTransformMode[chromaId][tuIterator.section] = 1;
> +                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
>                          memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
>                          primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
>                      }
> diff -r 2a8f3d5820a6 -r 18344f74ded0 source/encoder/search.h
> --- a/source/encoder/search.h	Tue Nov 04 09:46:14 2014 +0530
> +++ b/source/encoder/search.h	Wed Nov 05 16:23:42 2014 +0530
> @@ -217,6 +217,7 @@
>          Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
>      };
>  
> +    uint64_t deriveNullCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
>      void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]);
>  
>      // estimate bit cost of residual QT
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho