[x265] [PATCH RFC] rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr

Steve Borho steve at borho.org
Thu Jul 17 19:26:21 CEST 2014


On 07/17, santhoshini at multicorewareinc.com wrote:
> # HG changeset patch
> # User Santhoshini Sekar <santhoshini at multicorewareinc.com>
> # Date 1405596502 -19800
> #      Thu Jul 17 16:58:22 2014 +0530
> # Node ID 92db1b9ee818ff75cea743cb4c4fedd0ab93a9a6
> # Parent  d850cbf81e0f4831d8dcf89db83561969e456205
> rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr
> 
> RateControl statistics are updated for every frame when refLagRows number of
> rows are completed in processRowEncoder. With this updated data rateControl
> predicts more accurate QP

I'm afraid this patch doesn't apply cleanly on tip.

> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/analysis.cpp	Thu Jul 17 16:58:22 2014 +0530
> @@ -65,7 +65,7 @@
>  
>      m_rdCost.setPsyRdScale(m_param->psyRd);
>      m_bEnableRDOQ = top->m_bEnableRDOQ;
> -    m_bFrameParallel = top->m_totalFrameThreads > 1;
> +    m_bFrameParallel = m_param->frameNumThreads > 1;
>      m_numLayers = top->m_quadtreeTULog2MaxSize - top->m_quadtreeTULog2MinSize + 1;
>  
>      return initSearch();
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/encoder.cpp	Thu Jul 17 16:58:22 2014 +0530
> @@ -150,7 +150,7 @@
>  
>      if (m_frameEncoder)
>      {
> -        for (int i = 0; i < m_totalFrameThreads; i++)
> +        for (int i = 0; i < m_param->frameNumThreads; i++)
>          {
>              // Ensure frame encoder is idle before destroying it
>              m_frameEncoder[i].getEncodedPicture(m_nalList);
> @@ -320,20 +320,6 @@
>      else
>          m_lookahead->flush();
>  
> -    if (m_param->rc.rateControlMode == X265_RC_ABR)
> -    {
> -        // delay frame parallelism for non-VBV ABR
> -        if (m_pocLast == 0 && !m_param->rc.vbvBufferSize && !m_param->rc.vbvMaxBitrate)
> -            m_param->frameNumThreads = 1;
> -        else if (m_param->frameNumThreads != m_totalFrameThreads)
> -        {
> -            // re-enable frame parallelism after the first few P frames are encoded
> -            uint32_t frameCnt = (uint32_t)((0.5 * m_param->fpsNum / m_param->fpsDenom) / (m_param->bframes + 1));
> -            if (m_analyzeP.m_numPics > frameCnt)
> -                m_param->frameNumThreads = m_totalFrameThreads;
> -        }
> -    }
> -
>      FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
>      m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
>      int ret = 0;
> @@ -402,26 +388,11 @@
>              if (bChroma)
>                  m_numChromaWPBiFrames++;
>          }
> -
> -        uint64_t bytes = 0;
> -        for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> +        if (m_aborted == true)
>          {
> -            int type = m_nalList.m_nal[i].type;
> -
> -            // exclude SEI
> -            if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> -            {
> -                bytes += m_nalList.m_nal[i].sizeBytes;
> -                // and exclude start code prefix
> -                bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> -            }
> -        }
> -        if (m_rateControl->rateControlEnd(out, bytes << 3, &curEncoder->m_rce, &curEncoder->m_frameStats) < 0)
> -        {
> -            m_aborted = true;
>              return -1;
>          }
> -        finishFrameStats(out, curEncoder, bytes << 3);
> +        finishFrameStats(out, curEncoder, curEncoder->m_accessUnitBits);
>          // Allow this frame to be recycled if no frame encoders are using it for reference
>          if (!pic_out)
>          {
> @@ -474,12 +445,17 @@
>          // determine references, setup RPS, etc
>          m_dpb->prepareEncode(fenc);
>  
> -        // set slice QP
> -        m_rateControl->rateControlStart(fenc, m_lookahead, &curEncoder->m_rce, this);
>  
>          // Allow FrameEncoder::compressFrame() to start in a worker thread
>          curEncoder->m_enable.trigger();
>      }
> +    else if (!fenc && m_encodedFrameNum > 0)
> +    {
> +        // faked rateControlStart calls to avoid rateControlEnd of last frameNumThreads parallel frames from waiting
> +        RateControlEntry rce;
> +        rce.encodeOrder = m_encodedFrameNum++;
> +        m_rateControl->rateControlStart(NULL, m_lookahead, &rce, this);
> +    }
>  
>      return ret;
>  }
> @@ -1229,7 +1205,6 @@
>      {
>          x265_log(p, X265_LOG_INFO, "Warning: picture-based SAO used with frame parallelism\n");
>      }
> -    m_totalFrameThreads = m_param->frameNumThreads;
>  
>      if (p->keyframeMax < 0)
>      {
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/encoder.h
> --- a/source/encoder/encoder.h	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/encoder.h	Thu Jul 17 16:58:22 2014 +0530
> @@ -71,7 +71,6 @@
>  {
>  private:
>  
> -    bool               m_aborted;          // fatal error detected
>      int                m_pocLast;          ///< time index (POC)
>      int                m_encodedFrameNum;
>      int                m_outputCount;
> @@ -82,7 +81,6 @@
>      int64_t            m_prevReorderedPts[2];
>  
>      ThreadPool*        m_threadPool;
> -    Lookahead*         m_lookahead;
>      FrameEncoder*      m_frameEncoder;
>      DPB*               m_dpb;
>  
> @@ -90,15 +88,6 @@
>  
>      int                m_curEncoder;
>  
> -
> -    /* Collect statistics globally */
> -    EncStats           m_analyzeAll;
> -    EncStats           m_analyzeI;
> -    EncStats           m_analyzeP;
> -    EncStats           m_analyzeB;
> -    FILE*              m_csvfpt;
> -    int64_t            m_encodeStartTime;
> -
>      // quality control
>      TComScalingList    m_scalingList;      ///< quantization matrix information
>  
> @@ -141,6 +130,18 @@
>      //====== Tool list ========
>      int                m_lastBPSEI;
>  
> +    /* Collect statistics globally */
> +    EncStats           m_analyzeAll;
> +    EncStats           m_analyzeI;
> +    EncStats           m_analyzeP;
> +    EncStats           m_analyzeB;
> +    FILE*              m_csvfpt;
> +    int64_t            m_encodeStartTime;
> +
> +    Lookahead*         m_lookahead;
> +
> +    bool               m_aborted;          // fatal error detected
> +
>      uint32_t           m_log2ParallelMergeLevelMinus2; ///< Parallel merge estimation region
>  
>      int                m_useScalingListId; ///< Using quantization matrix i.e. 0=off, 1=default.
> @@ -165,8 +166,6 @@
>      Window             m_conformanceWindow;
>      Window             m_defaultDisplayWindow;
>  
> -    int                m_totalFrameThreads;
> -
>      uint32_t           m_numDelayedPic;
>  
>      Encoder();
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/frameencoder.cpp	Thu Jul 17 16:58:22 2014 +0530
> @@ -445,26 +445,6 @@
>          m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
>      }
>  
> -    int qp = slice->getSliceQp();
> -
> -    int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> -    int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> -    
> -    double lambda = x265_lambda2_tab[qp];
> -    /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> -    double chromaLambda = x265_lambda2_tab[qpCb];
> -
> -    // NOTE: set SAO lambda every Frame
> -    m_frameFilter.m_sao.lumaLambda = lambda;
> -    m_frameFilter.m_sao.chromaLambda = chromaLambda;
> -
> -    // Clip qps back to 0-51 range before encoding
> -    qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> -    slice->setSliceQp(qp);
> -    m_frame->m_avgQpAq = qp;
> -    slice->setSliceQpDelta(0);
> -    slice->setSliceQpDeltaCb(0);
> -    slice->setSliceQpDeltaCr(0);
>  
>      switch (slice->getSliceType())
>      {
> @@ -622,6 +602,23 @@
>          }
>      }
>  
> +    uint64_t bytes = 0;
> +    for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> +    {
> +        int type = m_nalList.m_nal[i].type;
> +
> +        // exclude SEI
> +        if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> +        {
> +            bytes += m_nalList.m_nal[i].sizeBytes;
> +            // and exclude start code prefix
> +            bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> +        }
> +    }
> +    m_accessUnitBits = bytes << 3;
> +    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
> +        m_top->m_aborted = true;
> +
>      noiseReductionUpdate();
>  
>      m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
> @@ -720,6 +717,27 @@
>      PPAScopeEvent(FrameEncoder_compressRows);
>      TComSlice* slice = m_frame->getSlice();
>  
> +    // set slice QP
> +    m_top->m_rateControl->rateControlStart(m_frame, m_top->m_lookahead, &m_rce, m_top);
> +    int qp = slice->getSliceQp();
> +
> +    int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> +    int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> +    double lambda = x265_lambda2_tab[qp];
> +    /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> +    double chromaLambda = x265_lambda2_tab[qpCb];
> +
> +    // NOTE: set SAO lambda every Frame
> +    m_frameFilter.m_sao.lumaLambda = lambda;
> +    m_frameFilter.m_sao.chromaLambda = chromaLambda;
> +
> +    // Clip qps back to 0-51 range before encoding
> +    qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> +    slice->setSliceQp(qp);
> +    m_frame->m_avgQpAq = qp;
> +    slice->setSliceQpDelta(0);
> +    slice->setSliceQpDeltaCb(0);
> +    slice->setSliceQpDeltaCr(0);
>      // reset entropy coders
>      m_sbacCoder.resetEntropy(slice);
>      for (int i = 0; i < this->m_numRows; i++)
> @@ -1047,7 +1065,34 @@
>          }
>      }
>  
> +    /* when a frame is half way through, update bits and complexity in rate control
> +     * for it to be available for the next frame's QScale calculation. This makes it 
> +     * more accurate with updated value */
> +    int rowCount = 0;
> +
> +    /* for the first two seconds update when the frame is half done and for rest
> +     * of the sequence update when refLagRows are completed */
> +    if (m_param->rc.rateControlMode == X265_RC_ABR)
> +    {
> +        if (m_top->m_analyzeAll.m_numPics <= 2 * (m_param->fpsNum / m_param->fpsDenom))

I believe reading m_analyzeAll here is a small race hazard. Can't we just
base this on the frame's own encode ordinal?

> +            rowCount = m_numRows/2;
> +        else
> +            rowCount = m_refLagRows;
> +    }
> +
>      // this row of CTUs has been encoded
> +    if (row == rowCount)
> +    {
> +        int64_t bits = 0;
> +        for(uint32_t col = 0; col < rowCount * numCols; col++)
> +        {
> +            TComDataCU* cu = m_frame->getCU(col);
> +            bits += cu->m_totalBits;
> +        }
> +
> +        m_rce.rowTotalBits = bits;
> +        m_top->m_rateControl->rateControlUpdateStats(&m_rce);
> +    }
>  
>      // trigger row-wise loop filters
>      if (row >= m_filterRowDelay)
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/frameencoder.h	Thu Jul 17 16:58:22 2014 +0530
> @@ -137,6 +137,8 @@
>      FrameStats               m_frameStats;          // stats of current frame for multipass encodes
>      volatile bool            m_bAllRowsStop;
>      volatile int             m_vbvResetTriggerRow;
> +    Frame*                   m_frame;
> +    uint64_t                 m_accessUnitBits;
>  
>  protected:
>  
> @@ -155,7 +157,6 @@
>      NALList                  m_nalList;
>      ThreadLocalData          m_tld;
>  
> -    Frame*                   m_frame;
>  
>      int                      m_filterRowDelay;
>      int                      m_filterRowDelayCus;
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/ratecontrol.cpp	Thu Jul 17 16:58:22 2014 +0530
> @@ -296,10 +296,13 @@
>  
>      // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
>      m_residualFrames = 0;
> +    m_partialResidualFrames = 0;
>      m_residualCost = 0;
> +    m_partialResidualCost = 0;
>      m_rateFactorMaxIncrement = 0;
>      m_rateFactorMaxDecrement = 0;
>      m_fps = m_param->fpsNum / m_param->fpsDenom;
> +    m_startEndOrder.set(0);
>      if (m_param->rc.rateControlMode == X265_RC_CRF)
>      {
>          m_param->rc.qp = (int)m_param->rc.rfConstant;
> @@ -666,6 +669,7 @@
>      m_totalBits = 0;
>      m_framesDone = 0;
>      m_residualCost = 0;
> +    m_partialResidualCost = 0;
>  
>      /* 720p videos seem to be a good cutoff for cplxrSum */
>      double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1;
> @@ -979,6 +983,19 @@
>  
>  void RateControl::rateControlStart(Frame* pic, Lookahead *l, RateControlEntry* rce, Encoder* enc)
>  {
> +    int orderValue = m_startEndOrder.get();
> +    int startOrdinal = rce->encodeOrder * 2;
> +
> +    while (orderValue != startOrdinal && pic)
> +       orderValue = m_startEndOrder.waitForChange(orderValue);
> +
> +    if (!pic)
> +    {
> +        // faked rateControlStart calls
> +        m_startEndOrder.incr();
> +        return;
> +    }
> +
>      m_curSlice = pic->getSlice();
>      m_sliceType = m_curSlice->getSliceType();
>      rce->sliceType = m_sliceType;
> @@ -991,6 +1008,8 @@
>      rce->bLastMiniGopBFrame = pic->m_lowres.bLastMiniGopBFrame;
>      rce->bufferRate = m_bufferRate;
>      rce->poc = m_curSlice->getPOC();
> +    rce->rowCplxrSum = 0.0;
> +    rce->rowTotalBits = 0;
>      if (m_isVbv)
>      {
>          if (rce->rowPreds[0][0].count == 0)
> @@ -1044,6 +1063,8 @@
>          m_qp = Clip3(MIN_QP, MAX_MAX_QP, m_qp);
>          rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = m_qp;
>      }
> +    // Do not increment m_startEndOrder here. Make rateControlEnd of previous thread
> +    // to wait until rateControlUpdateStats of this frame is called
>      m_framesDone++;
>      /* set the final QP to slice structure */
>      m_curSlice->setSliceQp(m_qp);
> @@ -1278,7 +1299,7 @@
>                  /* use framesDone instead of POC as poc count is not serial with bframes enabled */
>                  double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
>                  wantedBits = timeDone * m_bitrate;
> -                if (wantedBits > 0 && m_totalBits > 0 && !m_residualFrames)
> +                if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
>                  {
>                      abrBuffer *= X265_MAX(1, sqrt(timeDone));
>                      overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
> @@ -1300,7 +1321,7 @@
>                  double lqmin = 0, lqmax = 0;
>                  lqmin = m_lastQScaleFor[m_sliceType] / m_lstep;
>                  lqmax = m_lastQScaleFor[m_sliceType] * m_lstep;
> -                if (!m_residualFrames)
> +                if (!m_partialResidualFrames)
>                  {
>                      if (overflow > 1.1 && m_framesDone > 3)
>                          lqmax *= m_lstep;
> @@ -1342,16 +1363,17 @@
>      if (rce->sliceType == I_SLICE)
>      {
>          /* previous I still had a residual; roll it into the new loan */
> -        if (m_residualFrames)
> -            rce->rowTotalBits += m_residualCost * m_residualFrames;
> +        if (m_partialResidualFrames)
> +            rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames;
>  
> -        m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
> -        m_residualCost = (int)((rce->rowTotalBits * s_amortizeFraction) / m_residualFrames);
> -        rce->rowTotalBits -= m_residualCost * m_residualFrames;
> +        m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
> +        m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames);
> +        rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames;
>      }
> -    else if (m_residualFrames)
> +    else if (m_partialResidualFrames)
>      {
> -         rce->rowTotalBits += m_residualCost;
> +         rce->rowTotalBits += m_partialResidualCost;
> +         m_partialResidualFrames--;
>      }
>  
>      if (rce->sliceType != B_SLICE)
> @@ -1361,6 +1383,12 @@
>  
>      m_cplxrSum += rce->rowCplxrSum;
>      m_totalBits += rce->rowTotalBits;
> +
> +    /* delay incrementing m_startEndOrder until here to sync with rateControlStart() */

"do not allow the next frame to enter rateControlStart() until this
frame has updated its mid-frame statistics"

> +    m_startEndOrder.incr();
> +
> +    if (rce->encodeOrder < m_param->frameNumThreads - 1)
> +        m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
>  }
>  
>  void RateControl::checkAndResetABR(RateControlEntry* rce, bool isFrameDone)
> @@ -1820,6 +1848,11 @@
>  /* After encoding one frame, update rate control state */
>  int RateControl::rateControlEnd(Frame* pic, int64_t bits, RateControlEntry* rce, FrameStats* stats)
>  {
> +    int orderValue = m_startEndOrder.get();
> +    int endOrdinal = (rce->encodeOrder + m_param->frameNumThreads) * 2 - 1;
> +    while (orderValue != endOrdinal)
> +        orderValue = m_startEndOrder.waitForChange(orderValue);
> +
>      int64_t actualBits = bits;
>      if (m_isAbr)
>      {
> @@ -1919,17 +1952,19 @@
>                  }
>              }
>              if (rce->sliceType != B_SLICE)
> +            {
>                  /* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low
>                   * to improve short term compensation for next frame. */
> -                m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq;
> +                m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq) - (rce->rowCplxrSum);
> +            }
>              else
>              {
>                  /* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
>                   * Not perfectly accurate with B-refs, but good enough. */
> -                m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor));
> +                m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor))) - (rce->rowCplxrSum);
>              }
>              m_wantedBitsWindow += m_frameDuration * m_bitrate;
> -            m_totalBits += bits;
> +            m_totalBits += bits - rce->rowTotalBits;
>          }
>      }
>  
> @@ -1973,6 +2008,8 @@
>              rce->hrdTiming->dpbOutputTime = (double)rce->picTimingSEI->m_picDpbOutputDelay * time->numUnitsInTick / time->timeScale + rce->hrdTiming->cpbRemovalTime;
>          }
>      }
> +    // Allow rateControlStart of next frame only when rateControlEnd of previous frame is over
> +    m_startEndOrder.incr();
>      rce->isActive = false;
>      return 0;
>  
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/ratecontrol.h
> --- a/source/encoder/ratecontrol.h	Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/ratecontrol.h	Thu Jul 17 16:58:22 2014 +0530
> @@ -147,6 +147,14 @@
>      int64_t  m_totalBits;        /* total bits used for already encoded frames */
>      int      m_framesDone;       /* # of frames passed through RateCotrol already */
>      double   m_fps;
> +
> +    /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
> +     * sync the calls to these functions.

I suspect the text below would be better replaced by a simple example for
-F2:

rceStart  10
rceUpdate 10
rceEnd    9
rceStart  11
rceUpdate 11
rceEnd    10
rceStart  12
rceUpdate 12
rceEnd    11

> By waiting on this variable these functions proceed in the order as same as
> +     * encode order i.e wait until the variable matches the encode order. rateControlStart of next frame will
> +     * not happen until previous frame updates. And rateControlEnd of previous frame will not happen until the
> +     * current frame updates */
> +    ThreadSafeInteger m_startEndOrder;
> +
>      /* hrd stuff */
>      SEIBufferingPeriod m_bufPeriodSEI;
>      double   m_nominalRemovalTime;
> @@ -185,7 +193,9 @@
>      static const char  *s_defaultStatFileName;
>  
>      int m_residualFrames;
> +    int m_partialResidualFrames;
>      int m_residualCost;
> +    int m_partialResidualCost;
>  
>      double getQScale(RateControlEntry *rce, double rateFactor);
>      double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list