[x265] [PATCH 3 of 3 RFC] rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr

Steve Borho steve at borho.org
Fri Jul 11 22:59:18 CEST 2014


On Fri, Jul 11, 2014 at 6:24 AM,  <santhoshini at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Santhoshini Sekar <santhoshini at multicorewareinc.com>
> # Date 1405077594 -19800
> #      Fri Jul 11 16:49:54 2014 +0530
> # Node ID 070c3f30547aca9af4f8a708b6ae4a108510aad5
> # Parent  7acd78cdabfee453ba3b44b034eb2c87e587c7e6
> rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr

More explanation of why and how in the commit message would be helpful.

> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp        Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/encoder.cpp        Fri Jul 11 16:49:54 2014 +0530
> @@ -311,20 +311,6 @@
>      else
>          m_lookahead->flush();
>
> -    if (m_param->rc.rateControlMode == X265_RC_ABR)
> -    {
> -        // delay frame parallelism for non-VBV ABR
> -        if (m_pocLast == 0 && !m_param->rc.vbvBufferSize && !m_param->rc.vbvMaxBitrate)
> -            m_param->frameNumThreads = 1;
> -        else if (m_param->frameNumThreads != m_totalFrameThreads)
> -        {
> -            // re-enable frame parallelism after the first few P frames are encoded
> -            uint32_t frameCnt = (uint32_t)((0.5 * m_param->fpsNum / m_param->fpsDenom) / (m_param->bframes + 1));
> -            if (m_analyzeP.m_numPics > frameCnt)
> -                m_param->frameNumThreads = m_totalFrameThreads;
> -        }
> -    }
> -

\o/

When this is all said and done, m_totalFrameThreads should be removed

>      FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
>      m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
>      int ret = 0;
> @@ -393,26 +379,11 @@
>              if (bChroma)
>                  m_numChromaWPBiFrames++;
>          }
> -
> -        uint64_t bytes = 0;
> -        for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> +        if (m_aborted == true)
>          {
> -            int type = m_nalList.m_nal[i].type;
> -
> -            // exclude SEI
> -            if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> -            {
> -                bytes += m_nalList.m_nal[i].sizeBytes;
> -                // and exclude start code prefix
> -                bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> -            }
> -        }
> -        if (m_rateControl->rateControlEnd(out, bytes << 3, &curEncoder->m_rce, &curEncoder->m_frameStats) < 0)
> -        {
> -            m_aborted = true;
>              return -1;
>          }
> -        finishFrameStats(out, curEncoder, bytes << 3);
> +        finishFrameStats(out, curEncoder, curEncoder->m_accessUnitBits);
>          // Allow this frame to be recycled if no frame encoders are using it for reference
>          if (!pic_out)
>          {
> @@ -465,13 +436,16 @@
>          // determine references, setup RPS, etc
>          m_dpb->prepareEncode(fenc);
>
> -        // set slice QP
> -        m_rateControl->rateControlStart(fenc, m_lookahead, &curEncoder->m_rce, this);
>
>          // Allow FrameEncoder::compressFrame() to start in a worker thread
>          curEncoder->m_enable.trigger();
>      }
> -
> +    else if (!fenc && m_encodedFrameNum > 0)
> +    {
> +        RateControlEntry rce;
> +        rce.encodeOrder = m_encodedFrameNum++;
> +        m_rateControl->rateControlStart(NULL, m_lookahead, &rce, this);
> +    }

Has this been tested with very short encodes? I worry we'll introduce
new deadlocks.

>      return ret;
>  }
>
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/encoder.h
> --- a/source/encoder/encoder.h  Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/encoder.h  Fri Jul 11 16:49:54 2014 +0530
> @@ -71,7 +71,6 @@
>  {
>  private:
>
> -    bool               m_aborted;          // fatal error detected
>      int                m_pocLast;          ///< time index (POC)
>      int                m_encodedFrameNum;
>      int                m_outputCount;
> @@ -82,7 +81,6 @@
>      int64_t            m_prevReorderedPts[2];
>
>      ThreadPool*        m_threadPool;
> -    Lookahead*         m_lookahead;
>      FrameEncoder*      m_frameEncoder;
>      DPB*               m_dpb;
>
> @@ -91,14 +89,6 @@
>      int                m_curEncoder;
>
>
> -    /* Collect statistics globally */
> -    EncStats           m_analyzeAll;
> -    EncStats           m_analyzeI;
> -    EncStats           m_analyzeP;
> -    EncStats           m_analyzeB;
> -    FILE*              m_csvfpt;
> -    int64_t            m_encodeStartTime;
> -
>      // quality control
>      TComScalingList    m_scalingList;      ///< quantization matrix information
>
> @@ -119,6 +109,17 @@
>      Level::Tier        m_levelTier;
>      Level::Name        m_level;
>
> +    /* Collect statistics globally */
> +    EncStats           m_analyzeAll;
> +    EncStats           m_analyzeI;
> +    EncStats           m_analyzeP;
> +    EncStats           m_analyzeB;
> +    FILE*              m_csvfpt;
> +    int64_t            m_encodeStartTime;
> +
> +    Lookahead*         m_lookahead;
> +
> +    bool               m_aborted;          // fatal error detected
>      bool               m_nonPackedConstraintFlag;
>      bool               m_frameOnlyConstraintFlag;
>
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp   Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/frameencoder.cpp   Fri Jul 11 16:49:54 2014 +0530
> @@ -429,26 +429,6 @@
>          m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
>      }
>
> -    int qp = slice->getSliceQp();
> -
> -    int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> -    int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> -
> -    double lambda = x265_lambda2_tab[qp];
> -    /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> -    double chromaLambda = x265_lambda2_tab[qpCb];
> -
> -    // NOTE: set SAO lambda every Frame
> -    m_frameFilter.m_sao.lumaLambda = lambda;
> -    m_frameFilter.m_sao.chromaLambda = chromaLambda;
> -
> -    // Clip qps back to 0-51 range before encoding
> -    qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> -    slice->setSliceQp(qp);
> -    m_frame->m_avgQpAq = qp;
> -    slice->setSliceQpDelta(0);
> -    slice->setSliceQpDeltaCb(0);
> -    slice->setSliceQpDeltaCr(0);
>
>      switch (slice->getSliceType())
>      {
> @@ -601,6 +581,24 @@
>          }
>      }
>
> +    uint64_t bytes = 0;
> +    for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> +    {
> +        int type = m_nalList.m_nal[i].type;
> +
> +        // exclude SEI
> +        if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> +        {
> +            bytes += m_nalList.m_nal[i].sizeBytes;
> +            // and exclude start code prefix
> +            bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> +        }
> +    }
> +    m_accessUnitBits = bytes << 3;
> +    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
> +    {
> +        m_top->m_aborted = true;
> +    }
>      noiseReductionUpdate();
>
>      m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
> @@ -699,7 +697,28 @@
>      PPAScopeEvent(FrameEncoder_compressRows);
>      TComSlice* slice = m_frame->getSlice();
>
> -    // reset entropy coders
> +    //set slice QP

Put a space between // and the comment text.

> +    m_top->m_rateControl->rateControlStart(m_frame, m_top->m_lookahead, &m_rce, m_top);
> +    int qp = slice->getSliceQp();
> +
> +    int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> +    int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> +    double lambda = x265_lambda2_tab[qp];
> +    /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> +    double chromaLambda = x265_lambda2_tab[qpCb];
> +
> +    // NOTE: set SAO lambda every Frame
> +    m_frameFilter.m_sao.lumaLambda = lambda;
> +    m_frameFilter.m_sao.chromaLambda = chromaLambda;
> +
> +    // Clip qps back to 0-51 range before encoding
> +    qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> +    slice->setSliceQp(qp);
> +    m_frame->m_avgQpAq = qp;
> +    slice->setSliceQpDelta(0);
> +    slice->setSliceQpDeltaCb(0);
> +    slice->setSliceQpDeltaCr(0);
> +     //reset entropy coders
>      m_sbacCoder.resetEntropy(slice);
>      for (int i = 0; i < this->m_numRows; i++)
>      {
> @@ -1026,7 +1045,26 @@
>          }
>      }
>
> +    int rowCount;
> +

this bit of skullduggery below deserves a nice long comment

> +    if (m_top->m_analyzeAll.m_numPics <= 2 * (m_param->fpsNum / m_param->fpsDenom))
> +        rowCount = m_numRows/2 ;
> +    else
> +        rowCount = m_refLagRows;
> +
>      // this row of CTUs has been encoded
> +    if (row == rowCount)
> +    {
> +        int64_t bits = 0;
> +        for(uint32_t col = 0; col < rowCount * numCols; col++)
> +        {
> +            TComDataCU* cu = m_frame->getCU(col);
> +            bits += cu->m_totalBits;
> +        }
> +
> +        m_rce.rowTotalBits = bits;
> +        m_top->m_rateControl->rateControlUpdateStats(&m_rce);
> +    }

is it helpful to call rateControlUpdateStats() for non-referenced
frames? should we do this after 1 row or sooner for B frames?

>      // trigger row-wise loop filters
>      if (row >= m_filterRowDelay)
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h     Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/frameencoder.h     Fri Jul 11 16:49:54 2014 +0530
> @@ -137,6 +137,8 @@
>      FrameStats               m_frameStats;          // stats of current frame for multipass encodes
>      volatile bool            m_bAllRowsStop;
>      volatile int             m_vbvResetTriggerRow;
> +    Frame*                   m_frame;
> +    uint64_t                 m_accessUnitBits;
>
>  protected:
>
> @@ -155,7 +157,6 @@
>      NALList                  m_nalList;
>      ThreadLocalData          m_tld;
>
> -    Frame*                   m_frame;
>
>      int                      m_filterRowDelay;
>      int                      m_filterRowDelayCus;
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp    Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/ratecontrol.cpp    Fri Jul 11 16:49:54 2014 +0530
> @@ -263,7 +263,6 @@
>      int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      int lowresCuHeight = ((m_param->sourceHeight / 2)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      m_ncu = lowresCuWidth * lowresCuHeight;
> -

the battle against rate control line feeds rages on

>      if (m_param->rc.cuTree)
>          m_qCompress = 1;
>      else
> @@ -541,74 +540,88 @@
>
>  void RateControl::rateControlStart(Frame* pic, Lookahead *l, RateControlEntry* rce, Encoder* enc)
>  {
> -    m_curSlice = pic->getSlice();
> -    m_sliceType = m_curSlice->getSliceType();
> -    rce->sliceType = m_sliceType;
> -    rce->isActive = true;
> -    if (m_sliceType == B_SLICE)
> -        rce->bframes = m_bframes;
> -    else
> -        m_bframes = pic->m_lowres.leadingBframes;
> +    int orderValue = m_startEndOrder.get();
> +    int startOrdinal = rce->encodeOrder * 2;
>
> -    rce->bLastMiniGopBFrame = pic->m_lowres.bLastMiniGopBFrame;
> -    rce->bufferRate = m_bufferRate;
> -    rce->poc = m_curSlice->getPOC();
> -    if (m_isVbv)
> -    {
> -        if (rce->rowPreds[0][0].count == 0)
> +    while (orderValue != startOrdinal && pic)
> +       orderValue = m_startEndOrder.waitForChange(orderValue);
> +
> +    ScopedLock scope(m_lock);

again, the lock is likely redundant now

> +    if (pic)

rather than changing indentation of dozens of lines here, I'd prefer
an early out like:

if (!pic) { m_startEndOrder.incr(); return; }

>          {
> -            for (int i = 0; i < 3; i++)
> +        m_curSlice = pic->getSlice();
> +        m_sliceType = m_curSlice->getSliceType();
> +        rce->sliceType = m_sliceType;
> +
> +        rce->isActive = true;
> +        if (m_sliceType == B_SLICE)
> +            rce->bframes = m_bframes;
> +        else
> +            m_bframes = pic->m_lowres.leadingBframes;
> +        rce->bLastMiniGopBFrame = pic->m_lowres.bLastMiniGopBFrame;
> +        rce->bufferRate = m_bufferRate;
> +        rce->poc = m_curSlice->getPOC();
> +        rce->rowCplxrSum = 0.0;
> +        rce->rowTotalBits = 0;
> +        if (m_isVbv)
> +        {
> +            if (rce->rowPreds[0][0].count == 0)
>              {
> -                for (int j = 0; j < 2; j++)
> +                for (int i = 0; i < 3; i++)
>                  {
> -                    rce->rowPreds[i][j].coeff = 0.25;
> -                    rce->rowPreds[i][j].count = 1.0;
> -                    rce->rowPreds[i][j].decay = 0.5;
> -                    rce->rowPreds[i][j].offset = 0.0;
> +                    for (int j = 0; j < 2; j++)
> +                    {
> +                        rce->rowPreds[i][j].coeff = 0.25;
> +                        rce->rowPreds[i][j].count = 1.0;
> +                        rce->rowPreds[i][j].decay = 0.5;
> +                        rce->rowPreds[i][j].offset = 0.0;
> +                    }
>                  }
>              }
> +            rce->rowPred[0] = &rce->rowPreds[m_sliceType][0];
> +            rce->rowPred[1] = &rce->rowPreds[m_sliceType][1];
> +            updateVbvPlan(enc);
> +            rce->bufferFill = m_bufferFill;
>          }
> -        rce->rowPred[0] = &rce->rowPreds[m_sliceType][0];
> -        rce->rowPred[1] = &rce->rowPreds[m_sliceType][1];
> -        updateVbvPlan(enc);
> -        rce->bufferFill = m_bufferFill;
> -    }
> -    if (m_isAbr) //ABR,CRF
> -    {
> -        m_currentSatd = l->getEstimatedPictureCost(pic) >> (X265_DEPTH - 8);
> -        /* Update rce for use in rate control VBV later */
> -        rce->lastSatd = m_currentSatd;
> -        double q = x265_qScale2qp(rateEstimateQscale(pic, rce));
> -        q = Clip3((double)MIN_QP, (double)MAX_MAX_QP, q);
> -        m_qp = int(q + 0.5);
> -        rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = q;
> -        /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
> -        rce->qRceq = m_lastRceq;
> -        accumPQpUpdate();
> -    }
> -    else //CQP
> -    {
> -        if (m_sliceType == B_SLICE && m_curSlice->isReferenced())
> -            m_qp = (m_qpConstant[B_SLICE] + m_qpConstant[P_SLICE]) / 2;
> -        else
> -            m_qp = m_qpConstant[m_sliceType];
> -        pic->m_avgQpAq = pic->m_avgQpRc = m_qp;
> -    }
> -    if (m_sliceType != B_SLICE)
> -    {
> -        m_lastNonBPictType = m_sliceType;
> -        m_leadingNoBSatd = m_currentSatd;
> -    }
> -    rce->leadingNoBSatd = m_leadingNoBSatd;
> -    if (pic->m_forceqp)
> -    {
> -        m_qp = int32_t(pic->m_forceqp + 0.5) - 1;
> -        m_qp = Clip3(MIN_QP, MAX_MAX_QP, m_qp);
> -        rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = m_qp;
> -    }
> -    m_framesDone++;
> -    /* set the final QP to slice structure */
> -    m_curSlice->setSliceQp(m_qp);
> +        if (m_isAbr) //ABR,CRF
> +        {
> +            m_currentSatd = l->getEstimatedPictureCost(pic) >> (X265_DEPTH - 8);
> +            /* Update rce for use in rate control VBV later */
> +            rce->lastSatd = m_currentSatd;
> +            double q = x265_qScale2qp(rateEstimateQscale(pic, rce));
> +            q = Clip3((double)MIN_QP, (double)MAX_MAX_QP, q);
> +            m_qp = int(q + 0.5);
> +            rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = q;
> +            /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
> +            rce->qRceq = m_lastRceq;
> +            accumPQpUpdate();
> +        }
> +        else //CQP
> +        {
> +            if (m_sliceType == B_SLICE && m_curSlice->isReferenced())
> +                m_qp = (m_qpConstant[B_SLICE] + m_qpConstant[P_SLICE]) / 2;
> +            else
> +                m_qp = m_qpConstant[m_sliceType];
> +            pic->m_avgQpAq = pic->m_avgQpRc = m_qp;
> +        }
> +        if (m_sliceType != B_SLICE)
> +        {
> +            m_lastNonBPictType = m_sliceType;
> +            m_leadingNoBSatd = m_currentSatd;
> +        }
> +        rce->leadingNoBSatd = m_leadingNoBSatd;
> +        if (pic->m_forceqp)
> +        {
> +            m_qp = int32_t(pic->m_forceqp + 0.5) - 1;
> +            m_qp = Clip3(MIN_QP, MAX_MAX_QP, m_qp);
> +            rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = m_qp;
> +        }
> +        m_framesDone++;
> +        /* set the final QP to slice structure */
> +        m_curSlice->setSliceQp(m_qp);
> +        }

Fix the white-space, and add an explanation for why m_startEndOrder is not incremented here.

> +    else
> +        m_startEndOrder.incr();
>  }
>
>  void RateControl::accumPQpUpdate()
> @@ -1268,6 +1281,12 @@
>  /* After encoding one frame, update rate control state */
>  int RateControl::rateControlEnd(Frame* pic, int64_t bits, RateControlEntry* rce, FrameStats* stats)
>  {
> +    int orderValue = m_startEndOrder.get();
> +    int endOrdinal = (rce->encodeOrder + m_param->frameNumThreads) * 2 - 1;
> +    while (orderValue != endOrdinal)
> +            orderValue = m_startEndOrder.waitForChange(orderValue);
> +    ScopedLock scope(m_lock);
> +
>      int64_t actualBits = bits;
>      if (m_isAbr)
>      {
> @@ -1366,17 +1385,19 @@
>              }
>
>              if (rce->sliceType != B_SLICE)
> +            {
>                  /* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low
>                   * to improve short term compensation for next frame. */
> -                m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq;
> +                 m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq) - (rce->rowCplxrSum);
> +            }
>              else
>              {
>                  /* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
>                   * Not perfectly accurate with B-refs, but good enough. */
> -                m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor));
> +                m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor))) - (rce->rowCplxrSum);
>              }
>              m_wantedBitsWindow += m_frameDuration * m_bitrate;
> -            m_totalBits += bits;
> +            m_totalBits += bits - rce->rowTotalBits;
>          }
>      }
>
> @@ -1425,6 +1446,7 @@
>              rce->hrdTiming->dpbOutputTime = (double)rce->picTimingSEI->m_picDpbOutputDelay * time->getNumUnitsInTick() / time->getTimeScale() + rce->hrdTiming->cpbRemovalTime;
>          }
>      }

And the m_startEndOrder.incr() below needs a comment.

> +    m_startEndOrder.incr();
>      rce->isActive = false;
>      return 0;
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel



-- 
Steve Borho


More information about the x265-devel mailing list