[x265] [PATCH] Search: remove redundant encode coefficients in intra for performance

Mon Sep 15 13:11:31 CEST 2014

Sorry, the output mismatch was due to asm. Pushed.

On Sun, Sep 14, 2014 at 4:35 PM, Deepthi Nandakumar <
deepthi at multicorewareinc.com> wrote:

> This significantly changes outputs for P and B frames: higher bitrates and
> higher SSIM. Let's do full regression testing on this, and compare the
> bitrate/SSIM for all combinations to be reasonably sure there are no bugs.
>
> On Fri, Sep 12, 2014 at 7:47 PM, <ashok at multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User Ashok Kumar Mishra<ashok at multicorewareinc.com>
>> # Date 1410341620 -19800
>> #      Wed Sep 10 15:03:40 2014 +0530
>> # Node ID d8be3c38915d4a628b804522da8946a152041203
>> # Parent  cd8fd0afd4e873fc940ae3384fac4deed3ec7b4f
>> Search: remove redundant encode coefficients in intra for performance
>>
>> diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/analysis.cpp
>> --- a/source/encoder/analysis.cpp       Thu Sep 11 17:25:40 2014 -0700
>> +++ b/source/encoder/analysis.cpp       Wed Sep 10 15:03:40 2014 +0530
>> @@ -1840,6 +1840,7 @@
>>  void Analysis::encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv,
>> TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv)
>>  {
>>      uint64_t puCost = 0;
>> +    uint32_t puBits = 0;
>>      uint32_t depth = cu->getDepth(0);
>>      uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
>>
>> @@ -1851,7 +1852,7 @@
>>      uint32_t tuDepthRange[2];
>>      cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
>>
>> -    uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv,
>> predYuv, outResiYuv, false, puCost, tuDepthRange);
>> +    uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv,
>> predYuv, outResiYuv, false, puCost, puBits, tuDepthRange);
>>      xSetIntraResultQT(cu, initTrDepth, 0, outReconYuv);
>>
>>      //=== update PU data ====
>> diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/search.cpp
>> --- a/source/encoder/search.cpp Thu Sep 11 17:25:40 2014 -0700
>> +++ b/source/encoder/search.cpp Wed Sep 10 15:03:40 2014 +0530
>> @@ -111,47 +111,6 @@
>>      return false;
>>  }
>>
>> -void Search::xEncSubdivCbfQTLuma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t depthRange[2])
>> -{
>> -    uint32_t fullDepth  = cu->getDepth(0) + trDepth;
>> -    uint32_t trMode     = cu->getTransformIdx(absPartIdx);
>> -    uint32_t subdiv     = (trMode > trDepth ? 1 : 0);
>> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
>> -
>> -    if (cu->getPredictionMode(0) == MODE_INTRA &&
>> cu->getPartitionSize(0) == SIZE_NxN && trDepth == 0)
>> -    {
>> -        X265_CHECK(subdiv, "subdivision not present\n");
>> -    }
>> -    else if (log2TrSize > *(depthRange + 1))
>> -    {
>> -        X265_CHECK(subdiv, "subdivision not present\n");
>> -    }
>> -    else if (log2TrSize == cu->m_slice->m_sps->quadtreeTULog2MinSize)
>> -    {
>> -        X265_CHECK(!subdiv, "subdivision present\n");
>> -    }
>> -    else if (log2TrSize == *depthRange)
>> -    {
>> -        X265_CHECK(!subdiv, "subdivision present\n");
>> -    }
>> -    else
>> -    {
>> -        X265_CHECK(log2TrSize > *depthRange, "transform size too
>> small\n");
>> -        m_entropyCoder->codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
>> -    }
>> -
>> -    if (subdiv)
>> -    {
>> -        uint32_t qtPartNum = cu->m_pic->getNumPartInCU() >> ((fullDepth
>> + 1) << 1);
>> -        for (uint32_t part = 0; part < 4; part++)
>> -            xEncSubdivCbfQTLuma(cu, trDepth + 1, absPartIdx + part *
>> qtPartNum, depthRange);
>> -
>> -        return;
>> -    }
>> -
>> -    m_entropyCoder->codeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
>> -}
>> -
>>  void Search::xEncSubdivCbfQTChroma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t
>> height)
>>  {
>>      uint32_t fullDepth  = cu->getDepth(0) + trDepth;
>> @@ -183,32 +142,6 @@
>>      }
>>  }
>>
>> -void Search::xEncCoeffQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t
>> absPartIdx)
>> -{
>> -    const TextType ttype = TEXT_LUMA;
>> -
>> -    if (!cu->getCbf(absPartIdx, ttype, trDepth))
>> -        return;
>> -
>> -    uint32_t fullDepth = cu->getDepth(0) + trDepth;
>> -    uint32_t trMode    = cu->getTransformIdx(absPartIdx);
>> -
>> -    if (trMode > trDepth)
>> -    {
>> -        uint32_t qtPartNum = cu->m_pic->getNumPartInCU() >> ((fullDepth
>> + 1) << 1);
>> -        for (uint32_t part = 0; part < 4; part++)
>> -            xEncCoeffQTLuma(cu, trDepth + 1, absPartIdx + part *
>> qtPartNum);
>> -
>> -        return;
>> -    }
>> -
>> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
>> -    uint32_t qtLayer    = log2TrSize - 2;
>> -    uint32_t coeffOffset = absPartIdx << LOG2_UNIT_SIZE * 2;
>> -    coeff_t* coeff = m_qtTempCoeff[ttype][qtLayer] + coeffOffset;
>> -    m_entropyCoder->codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize,
>> ttype);
>> -}
>> -
>>  void Search::xEncCoeffQTChroma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, TextType ttype)
>>  {
>>      if (!cu->getCbf(absPartIdx, ttype, trDepth))
>> @@ -316,15 +249,6 @@
>>      }
>>  }
>>
>> -uint32_t Search::xGetIntraBitsQTLuma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t depthRange[2])
>> -{
>> -    m_entropyCoder->resetBits();
>> -    xEncIntraHeaderLuma(cu, trDepth, absPartIdx);
>> -    xEncSubdivCbfQTLuma(cu, trDepth, absPartIdx, depthRange);
>> -    xEncCoeffQTLuma(cu, trDepth, absPartIdx);
>> -    return m_entropyCoder->getNumberOfWrittenBits();
>> -}
>> -
>>  uint32_t Search::xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t absPartIdxStep)
>>  {
>>      int cuSize = 1 << cu->getLog2CUSize(absPartIdx);
>> @@ -340,7 +264,14 @@
>>  {
>>      m_entropyCoder->resetBits();
>>      xEncIntraHeaderLuma(cu, trDepth, absPartIdx);
>> -    xEncSubdivCbfQTLuma(cu, trDepth, absPartIdx, depthRange);
>> +
>> +    //Transform subdiv flag
>> +    if (log2TrSize != *depthRange)
>> +        m_entropyCoder->codeTransformSubdivFlag(0, 5 - log2TrSize);
>> +
>> +    //===== Cbfs =====
>> +    uint32_t trMode = cu->getTransformIdx(absPartIdx);
>> +    m_entropyCoder->codeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
>>
>>      if (cu->getCbf(absPartIdx, TEXT_LUMA, trDepth))
>>          m_entropyCoder->codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize,
>> TEXT_LUMA);
>> @@ -463,7 +394,7 @@
>>
>>  /* returns distortion. TODO reorder params */
>>  uint32_t Search::xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
>> -                                     ShortYuv* resiYuv, bool
>> bAllowRQTSplit, uint64_t& rdCost, uint32_t depthRange[2])
>> +                                     ShortYuv* resiYuv, bool
>> bAllowRQTSplit, uint64_t& rdCost, uint32_t& rdBits, uint32_t depthRange[2])
>>  {
>>      uint32_t fullDepth   = cu->getDepth(0) + trDepth;
>>      uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
>> @@ -490,8 +421,9 @@
>>      if (!bAllowRQTSplit && noSplitIntraMaxTuSize)
>>          bCheckSplit = false;
>>
>> -    uint64_t singleCost   = MAX_INT64;
>> -    uint32_t singleDistY  = 0;
>> +    uint64_t singleCost  = MAX_INT64;
>> +    uint32_t singleDistY = 0;
>> +    uint32_t singleBits  = 0;
>>      uint32_t singlePsyEnergyY = 0;
>>      uint32_t singleCbfY   = 0;
>>      int      bestModeId   = 0;
>> @@ -580,7 +512,7 @@
>>                      break;
>>                  else
>>                  {
>> -                    uint32_t singleBits = xGetIntraBitsLuma(cu, trDepth,
>> absPartIdx, log2TrSize, coeff, depthRange);
>> +                    singleBits = xGetIntraBitsLuma(cu, trDepth,
>> absPartIdx, log2TrSize, coeff, depthRange);
>>                      if (m_rdCost.m_psyRd)
>>                          singleCostTmp =
>> m_rdCost.calcPsyRdCost(singleDistYTmp, singleBits, singlePsyEnergyYTmp);
>>                      else
>> @@ -634,7 +566,7 @@
>>              }
>>              cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA,
>> absPartIdx, fullDepth);
>>
>> -            uint32_t singleBits = xGetIntraBitsLuma(cu, trDepth,
>> absPartIdx, log2TrSize, coeffY, depthRange);
>> +            singleBits = xGetIntraBitsLuma(cu, trDepth, absPartIdx,
>> log2TrSize, coeffY, depthRange);
>>              if (m_param->rdPenalty && (log2TrSize == 5) && !isIntraSlice)
>>                  singleBits *= 4;
>>
>> @@ -663,23 +595,30 @@
>>          uint32_t qPartsDiv     = cu->m_pic->getNumPartInCU() >>
>> ((fullDepth + 1) << 1);
>>          uint32_t absPartIdxSub = absPartIdx;
>>          uint32_t splitCbfY     = 0;
>> +        uint32_t splitBits     = 0;
>>
>>          for (uint32_t part = 0; part < 4; part++, absPartIdxSub +=
>> qPartsDiv)
>>          {
>>              cu->m_psyEnergy = 0;
>> -            splitDistY += xRecurIntraCodingQT(cu, trDepth + 1,
>> absPartIdxSub, fencYuv, predYuv, resiYuv, bAllowRQTSplit, splitCost,
>> depthRange);
>> +            splitDistY += xRecurIntraCodingQT(cu, trDepth + 1,
>> absPartIdxSub, fencYuv, predYuv, resiYuv, bAllowRQTSplit, splitCost,
>> splitBits, depthRange);
>>              splitPsyEnergyY += cu->m_psyEnergy;
>>              splitCbfY |= cu->getCbf(absPartIdxSub, TEXT_LUMA, trDepth +
>> 1);
>>          }
>> +
>> +        if (bCheckFull)
>> +        {
>> +            m_entropyCoder->resetBits();
>> +
>> +            //subdiv
>> +            if (log2TrSize != *depthRange)
>> +                m_entropyCoder->codeTransformSubdivFlag(1, 5 -
>> log2TrSize);
>> +
>> +             splitBits += m_entropyCoder->getNumberOfWrittenBits();
>> +        }
>>
>>          for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
>>              cu->getCbf(TEXT_LUMA)[absPartIdx + offs] |= (splitCbfY <<
>> trDepth);
>>
>> -        // restore context states
>> -
>> m_entropyCoder->load(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
>> -
>> -        // determine rate and r-d cost
>> -        uint32_t splitBits = xGetIntraBitsQTLuma(cu, trDepth,
>> absPartIdx, depthRange);
>>          if (m_rdCost.m_psyRd)
>>              splitCost = m_rdCost.calcPsyRdCost(splitDistY, splitBits,
>> splitPsyEnergyY);
>>          else
>> @@ -689,6 +628,7 @@
>>          {
>>              outDist  += splitDistY;
>>              rdCost   += splitCost;
>> +            rdBits   += splitBits;
>>              cu->m_psyEnergy = splitPsyEnergyY;
>>              return outDist;
>>          }
>> @@ -717,6 +657,7 @@
>>      }
>>
>>      rdCost += singleCost;
>> +    rdBits += singleBits;
>>      cu->m_psyEnergy = singlePsyEnergyY;
>>      return outDist + singleDistY;
>>  }
>> @@ -1416,6 +1357,7 @@
>>          uint32_t bestPUDistY = 0;
>>          uint64_t bestPUCost  = MAX_INT64;
>>          uint32_t puDistY;
>> +        uint32_t puBits;
>>          uint64_t puCost;
>>          for (int mode = 0; mode < numModesForFullRD; mode++)
>>          {
>> @@ -1427,7 +1369,8 @@
>>
>>              // determine residual for partition
>>              puCost = 0;
>> -            puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset,
>> fencYuv, predYuv, resiYuv, false, puCost, depthRange);
>> +            puBits = 0;
>> +            puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset,
>> fencYuv, predYuv, resiYuv, false, puCost, puBits, depthRange);
>>
>>              // check r-d cost
>>              if (puCost < bestPUCost)
>> @@ -1446,7 +1389,8 @@
>>
>>          // determine residual for partition
>>          puCost = 0;
>> -        puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset,
>> fencYuv, predYuv, resiYuv, true, puCost, depthRange);
>> +        puBits = 0;
>> +        puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset,
>> fencYuv, predYuv, resiYuv, true, puCost, puBits, depthRange);
>>
>>          overallDistY += (puCost >= bestPUCost) ? bestPUDistY : puDistY;
>>
>> diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/search.h
>> --- a/source/encoder/search.h   Thu Sep 11 17:25:40 2014 -0700
>> +++ b/source/encoder/search.h   Wed Sep 10 15:03:40 2014 +0530
>> @@ -129,14 +129,11 @@
>>      void xSetResidualQTData(TComDataCU* cu, uint32_t absPartIdx,
>> ShortYuv* resiYuv, uint32_t depth, bool bSpatial);
>>      void xSetIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t
>> absPartIdx, TComYuv* reconYuv);
>>
>> -    void xEncSubdivCbfQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t
>> absPartIdx, uint32_t depthRange[2]);
>>      void xEncSubdivCbfQTChroma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx,  uint32_t absPartIdxStep, uint32_t width, uint32_t
>> height);
>> -
>> -    void xEncCoeffQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t
>> absPartIdx);
>>      void xEncCoeffQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t
>> absPartIdx, TextType ttype);
>>      void xEncIntraHeaderLuma(TComDataCU* cu, uint32_t trDepth, uint32_t
>> absPartIdx);
>>      void xEncIntraHeaderChroma(TComDataCU* cu, uint32_t absPartIdx);
>> -    uint32_t xGetIntraBitsQTLuma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t depthRange[2]);
>> +
>>      uint32_t xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t absPartIdxStep);
>>      uint32_t xGetIntraBitsLuma(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, uint32_t log2TrSize, coeff_t* coeff, uint32_t
>> depthRange[2]);
>>      uint32_t xGetIntraBitsChroma(TComDataCU* cu, uint32_t absPartIdx,
>> uint32_t log2TrSizeC, uint32_t chromaId, coeff_t* coeff);
>> @@ -147,7 +144,7 @@
>>                                   uint64_t &rdCost, uint32_t &outBits,
>> uint32_t *zeroDist, uint32_t tuDepthRange[2]);
>>
>>      uint32_t xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
>> -                                 ShortYuv* resiYuv, bool bAllowRQTSplit,
>> uint64_t& dRDCost, uint32_t depthRange[2]);
>> +                                 ShortYuv* resiYuv, bool bAllowRQTSplit,
>> uint64_t& dRDCost, uint32_t& puBits, uint32_t depthRange[2]);
>>
>>      uint32_t xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth,
>> uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv);
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140915/af2a7ddf/attachment-0001.html>