[x265] [PATCH 3 of 3 RFC] rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr
Steve Borho
steve at borho.org
Fri Jul 11 22:59:18 CEST 2014
On Fri, Jul 11, 2014 at 6:24 AM, <santhoshini at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Santhoshini Sekar <santhoshini at multicorewareinc.com>
> # Date 1405077594 -19800
> # Fri Jul 11 16:49:54 2014 +0530
> # Node ID 070c3f30547aca9af4f8a708b6ae4a108510aad5
> # Parent 7acd78cdabfee453ba3b44b034eb2c87e587c7e6
> rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr
More explanation of why and how, here in the commit message, would be helpful.
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/encoder.cpp Fri Jul 11 16:49:54 2014 +0530
> @@ -311,20 +311,6 @@
> else
> m_lookahead->flush();
>
> - if (m_param->rc.rateControlMode == X265_RC_ABR)
> - {
> - // delay frame parallelism for non-VBV ABR
> - if (m_pocLast == 0 && !m_param->rc.vbvBufferSize && !m_param->rc.vbvMaxBitrate)
> - m_param->frameNumThreads = 1;
> - else if (m_param->frameNumThreads != m_totalFrameThreads)
> - {
> - // re-enable frame parallelism after the first few P frames are encoded
> - uint32_t frameCnt = (uint32_t)((0.5 * m_param->fpsNum / m_param->fpsDenom) / (m_param->bframes + 1));
> - if (m_analyzeP.m_numPics > frameCnt)
> - m_param->frameNumThreads = m_totalFrameThreads;
> - }
> - }
> -
\o/
When this is all said and done, m_totalFrameThreads should be removed
> FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
> m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
> int ret = 0;
> @@ -393,26 +379,11 @@
> if (bChroma)
> m_numChromaWPBiFrames++;
> }
> -
> - uint64_t bytes = 0;
> - for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> + if (m_aborted == true)
> {
> - int type = m_nalList.m_nal[i].type;
> -
> - // exclude SEI
> - if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> - {
> - bytes += m_nalList.m_nal[i].sizeBytes;
> - // and exclude start code prefix
> - bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> - }
> - }
> - if (m_rateControl->rateControlEnd(out, bytes << 3, &curEncoder->m_rce, &curEncoder->m_frameStats) < 0)
> - {
> - m_aborted = true;
> return -1;
> }
> - finishFrameStats(out, curEncoder, bytes << 3);
> + finishFrameStats(out, curEncoder, curEncoder->m_accessUnitBits);
> // Allow this frame to be recycled if no frame encoders are using it for reference
> if (!pic_out)
> {
> @@ -465,13 +436,16 @@
> // determine references, setup RPS, etc
> m_dpb->prepareEncode(fenc);
>
> - // set slice QP
> - m_rateControl->rateControlStart(fenc, m_lookahead, &curEncoder->m_rce, this);
>
> // Allow FrameEncoder::compressFrame() to start in a worker thread
> curEncoder->m_enable.trigger();
> }
> -
> + else if (!fenc && m_encodedFrameNum > 0)
> + {
> + RateControlEntry rce;
> + rce.encodeOrder = m_encodedFrameNum++;
> + m_rateControl->rateControlStart(NULL, m_lookahead, &rce, this);
> + }
Has this been tested with very short encodes? I worry we'll introduce
new deadlocks.
> return ret;
> }
>
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/encoder.h
> --- a/source/encoder/encoder.h Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/encoder.h Fri Jul 11 16:49:54 2014 +0530
> @@ -71,7 +71,6 @@
> {
> private:
>
> - bool m_aborted; // fatal error detected
> int m_pocLast; ///< time index (POC)
> int m_encodedFrameNum;
> int m_outputCount;
> @@ -82,7 +81,6 @@
> int64_t m_prevReorderedPts[2];
>
> ThreadPool* m_threadPool;
> - Lookahead* m_lookahead;
> FrameEncoder* m_frameEncoder;
> DPB* m_dpb;
>
> @@ -91,14 +89,6 @@
> int m_curEncoder;
>
>
> - /* Collect statistics globally */
> - EncStats m_analyzeAll;
> - EncStats m_analyzeI;
> - EncStats m_analyzeP;
> - EncStats m_analyzeB;
> - FILE* m_csvfpt;
> - int64_t m_encodeStartTime;
> -
> // quality control
> TComScalingList m_scalingList; ///< quantization matrix information
>
> @@ -119,6 +109,17 @@
> Level::Tier m_levelTier;
> Level::Name m_level;
>
> + /* Collect statistics globally */
> + EncStats m_analyzeAll;
> + EncStats m_analyzeI;
> + EncStats m_analyzeP;
> + EncStats m_analyzeB;
> + FILE* m_csvfpt;
> + int64_t m_encodeStartTime;
> +
> + Lookahead* m_lookahead;
> +
> + bool m_aborted; // fatal error detected
> bool m_nonPackedConstraintFlag;
> bool m_frameOnlyConstraintFlag;
>
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/frameencoder.cpp Fri Jul 11 16:49:54 2014 +0530
> @@ -429,26 +429,6 @@
> m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
> }
>
> - int qp = slice->getSliceQp();
> -
> - int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> - int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> -
> - double lambda = x265_lambda2_tab[qp];
> - /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> - double chromaLambda = x265_lambda2_tab[qpCb];
> -
> - // NOTE: set SAO lambda every Frame
> - m_frameFilter.m_sao.lumaLambda = lambda;
> - m_frameFilter.m_sao.chromaLambda = chromaLambda;
> -
> - // Clip qps back to 0-51 range before encoding
> - qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> - slice->setSliceQp(qp);
> - m_frame->m_avgQpAq = qp;
> - slice->setSliceQpDelta(0);
> - slice->setSliceQpDeltaCb(0);
> - slice->setSliceQpDeltaCr(0);
>
> switch (slice->getSliceType())
> {
> @@ -601,6 +581,24 @@
> }
> }
>
> + uint64_t bytes = 0;
> + for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> + {
> + int type = m_nalList.m_nal[i].type;
> +
> + // exclude SEI
> + if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> + {
> + bytes += m_nalList.m_nal[i].sizeBytes;
> + // and exclude start code prefix
> + bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> + }
> + }
> + m_accessUnitBits = bytes << 3;
> + if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
> + {
> + m_top->m_aborted = true;
> + }
> noiseReductionUpdate();
>
> m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
> @@ -699,7 +697,28 @@
> PPAScopeEvent(FrameEncoder_compressRows);
> TComSlice* slice = m_frame->getSlice();
>
> - // reset entropy coders
> + //set slice QP
Nit: add a space between // and the comment text.
> + m_top->m_rateControl->rateControlStart(m_frame, m_top->m_lookahead, &m_rce, m_top);
> + int qp = slice->getSliceQp();
> +
> + int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> + int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> + double lambda = x265_lambda2_tab[qp];
> + /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> + double chromaLambda = x265_lambda2_tab[qpCb];
> +
> + // NOTE: set SAO lambda every Frame
> + m_frameFilter.m_sao.lumaLambda = lambda;
> + m_frameFilter.m_sao.chromaLambda = chromaLambda;
> +
> + // Clip qps back to 0-51 range before encoding
> + qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> + slice->setSliceQp(qp);
> + m_frame->m_avgQpAq = qp;
> + slice->setSliceQpDelta(0);
> + slice->setSliceQpDeltaCb(0);
> + slice->setSliceQpDeltaCr(0);
> + //reset entropy coders
> m_sbacCoder.resetEntropy(slice);
> for (int i = 0; i < this->m_numRows; i++)
> {
> @@ -1026,7 +1045,26 @@
> }
> }
>
> + int rowCount;
> +
this bit of skullduggery below deserves a nice long comment
> + if (m_top->m_analyzeAll.m_numPics <= 2 * (m_param->fpsNum / m_param->fpsDenom))
> + rowCount = m_numRows/2 ;
> + else
> + rowCount = m_refLagRows;
> +
> // this row of CTUs has been encoded
> + if (row == rowCount)
> + {
> + int64_t bits = 0;
> + for(uint32_t col = 0; col < rowCount * numCols; col++)
> + {
> + TComDataCU* cu = m_frame->getCU(col);
> + bits += cu->m_totalBits;
> + }
> +
> + m_rce.rowTotalBits = bits;
> + m_top->m_rateControl->rateControlUpdateStats(&m_rce);
> + }
Is it helpful to call rateControlUpdateStats() for non-referenced
frames? Should we do this after 1 row, or sooner, for B frames?
> // trigger row-wise loop filters
> if (row >= m_filterRowDelay)
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/frameencoder.h Fri Jul 11 16:49:54 2014 +0530
> @@ -137,6 +137,8 @@
> FrameStats m_frameStats; // stats of current frame for multipass encodes
> volatile bool m_bAllRowsStop;
> volatile int m_vbvResetTriggerRow;
> + Frame* m_frame;
> + uint64_t m_accessUnitBits;
>
> protected:
>
> @@ -155,7 +157,6 @@
> NALList m_nalList;
> ThreadLocalData m_tld;
>
> - Frame* m_frame;
>
> int m_filterRowDelay;
> int m_filterRowDelayCus;
> diff -r 7acd78cdabfe -r 070c3f30547a source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp Fri Jul 11 15:15:32 2014 +0530
> +++ b/source/encoder/ratecontrol.cpp Fri Jul 11 16:49:54 2014 +0530
> @@ -263,7 +263,6 @@
> int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> m_ncu = lowresCuWidth * lowresCuHeight;
> -
the battle against rate control line feeds rages on
> if (m_param->rc.cuTree)
> m_qCompress = 1;
> else
> @@ -541,74 +540,88 @@
>
> void RateControl::rateControlStart(Frame* pic, Lookahead *l, RateControlEntry* rce, Encoder* enc)
> {
> - m_curSlice = pic->getSlice();
> - m_sliceType = m_curSlice->getSliceType();
> - rce->sliceType = m_sliceType;
> - rce->isActive = true;
> - if (m_sliceType == B_SLICE)
> - rce->bframes = m_bframes;
> - else
> - m_bframes = pic->m_lowres.leadingBframes;
> + int orderValue = m_startEndOrder.get();
> + int startOrdinal = rce->encodeOrder * 2;
>
> - rce->bLastMiniGopBFrame = pic->m_lowres.bLastMiniGopBFrame;
> - rce->bufferRate = m_bufferRate;
> - rce->poc = m_curSlice->getPOC();
> - if (m_isVbv)
> - {
> - if (rce->rowPreds[0][0].count == 0)
> + while (orderValue != startOrdinal && pic)
> + orderValue = m_startEndOrder.waitForChange(orderValue);
> +
> + ScopedLock scope(m_lock);
again, the lock is likely redundant now
> + if (pic)
Rather than changing the indentation of dozens of lines here, I'd prefer
an early-out like:
if (!pic) { m_startEndOrder.incr(); return; }
> {
> - for (int i = 0; i < 3; i++)
> + m_curSlice = pic->getSlice();
> + m_sliceType = m_curSlice->getSliceType();
> + rce->sliceType = m_sliceType;
> +
> + rce->isActive = true;
> + if (m_sliceType == B_SLICE)
> + rce->bframes = m_bframes;
> + else
> + m_bframes = pic->m_lowres.leadingBframes;
> + rce->bLastMiniGopBFrame = pic->m_lowres.bLastMiniGopBFrame;
> + rce->bufferRate = m_bufferRate;
> + rce->poc = m_curSlice->getPOC();
> + rce->rowCplxrSum = 0.0;
> + rce->rowTotalBits = 0;
> + if (m_isVbv)
> + {
> + if (rce->rowPreds[0][0].count == 0)
> {
> - for (int j = 0; j < 2; j++)
> + for (int i = 0; i < 3; i++)
> {
> - rce->rowPreds[i][j].coeff = 0.25;
> - rce->rowPreds[i][j].count = 1.0;
> - rce->rowPreds[i][j].decay = 0.5;
> - rce->rowPreds[i][j].offset = 0.0;
> + for (int j = 0; j < 2; j++)
> + {
> + rce->rowPreds[i][j].coeff = 0.25;
> + rce->rowPreds[i][j].count = 1.0;
> + rce->rowPreds[i][j].decay = 0.5;
> + rce->rowPreds[i][j].offset = 0.0;
> + }
> }
> }
> + rce->rowPred[0] = &rce->rowPreds[m_sliceType][0];
> + rce->rowPred[1] = &rce->rowPreds[m_sliceType][1];
> + updateVbvPlan(enc);
> + rce->bufferFill = m_bufferFill;
> }
> - rce->rowPred[0] = &rce->rowPreds[m_sliceType][0];
> - rce->rowPred[1] = &rce->rowPreds[m_sliceType][1];
> - updateVbvPlan(enc);
> - rce->bufferFill = m_bufferFill;
> - }
> - if (m_isAbr) //ABR,CRF
> - {
> - m_currentSatd = l->getEstimatedPictureCost(pic) >> (X265_DEPTH - 8);
> - /* Update rce for use in rate control VBV later */
> - rce->lastSatd = m_currentSatd;
> - double q = x265_qScale2qp(rateEstimateQscale(pic, rce));
> - q = Clip3((double)MIN_QP, (double)MAX_MAX_QP, q);
> - m_qp = int(q + 0.5);
> - rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = q;
> - /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
> - rce->qRceq = m_lastRceq;
> - accumPQpUpdate();
> - }
> - else //CQP
> - {
> - if (m_sliceType == B_SLICE && m_curSlice->isReferenced())
> - m_qp = (m_qpConstant[B_SLICE] + m_qpConstant[P_SLICE]) / 2;
> - else
> - m_qp = m_qpConstant[m_sliceType];
> - pic->m_avgQpAq = pic->m_avgQpRc = m_qp;
> - }
> - if (m_sliceType != B_SLICE)
> - {
> - m_lastNonBPictType = m_sliceType;
> - m_leadingNoBSatd = m_currentSatd;
> - }
> - rce->leadingNoBSatd = m_leadingNoBSatd;
> - if (pic->m_forceqp)
> - {
> - m_qp = int32_t(pic->m_forceqp + 0.5) - 1;
> - m_qp = Clip3(MIN_QP, MAX_MAX_QP, m_qp);
> - rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = m_qp;
> - }
> - m_framesDone++;
> - /* set the final QP to slice structure */
> - m_curSlice->setSliceQp(m_qp);
> + if (m_isAbr) //ABR,CRF
> + {
> + m_currentSatd = l->getEstimatedPictureCost(pic) >> (X265_DEPTH - 8);
> + /* Update rce for use in rate control VBV later */
> + rce->lastSatd = m_currentSatd;
> + double q = x265_qScale2qp(rateEstimateQscale(pic, rce));
> + q = Clip3((double)MIN_QP, (double)MAX_MAX_QP, q);
> + m_qp = int(q + 0.5);
> + rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = q;
> + /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
> + rce->qRceq = m_lastRceq;
> + accumPQpUpdate();
> + }
> + else //CQP
> + {
> + if (m_sliceType == B_SLICE && m_curSlice->isReferenced())
> + m_qp = (m_qpConstant[B_SLICE] + m_qpConstant[P_SLICE]) / 2;
> + else
> + m_qp = m_qpConstant[m_sliceType];
> + pic->m_avgQpAq = pic->m_avgQpRc = m_qp;
> + }
> + if (m_sliceType != B_SLICE)
> + {
> + m_lastNonBPictType = m_sliceType;
> + m_leadingNoBSatd = m_currentSatd;
> + }
> + rce->leadingNoBSatd = m_leadingNoBSatd;
> + if (pic->m_forceqp)
> + {
> + m_qp = int32_t(pic->m_forceqp + 0.5) - 1;
> + m_qp = Clip3(MIN_QP, MAX_MAX_QP, m_qp);
> + rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = m_qp;
> + }
> + m_framesDone++;
> + /* set the final QP to slice structure */
> + m_curSlice->setSliceQp(m_qp);
> + }
Fix the white-space, and add an explanation for why m_startEndOrder is not incremented here.
> + else
> + m_startEndOrder.incr();
> }
>
> void RateControl::accumPQpUpdate()
> @@ -1268,6 +1281,12 @@
> /* After encoding one frame, update rate control state */
> int RateControl::rateControlEnd(Frame* pic, int64_t bits, RateControlEntry* rce, FrameStats* stats)
> {
> + int orderValue = m_startEndOrder.get();
> + int endOrdinal = (rce->encodeOrder + m_param->frameNumThreads) * 2 - 1;
> + while (orderValue != endOrdinal)
> + orderValue = m_startEndOrder.waitForChange(orderValue);
> + ScopedLock scope(m_lock);
> +
> int64_t actualBits = bits;
> if (m_isAbr)
> {
> @@ -1366,17 +1385,19 @@
> }
>
> if (rce->sliceType != B_SLICE)
> + {
> /* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low
> * to improve short term compensation for next frame. */
> - m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq;
> + m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq) - (rce->rowCplxrSum);
> + }
> else
> {
> /* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
> * Not perfectly accurate with B-refs, but good enough. */
> - m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor));
> + m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor))) - (rce->rowCplxrSum);
> }
> m_wantedBitsWindow += m_frameDuration * m_bitrate;
> - m_totalBits += bits;
> + m_totalBits += bits - rce->rowTotalBits;
> }
> }
>
> @@ -1425,6 +1446,7 @@
> rce->hrdTiming->dpbOutputTime = (double)rce->picTimingSEI->m_picDpbOutputDelay * time->getNumUnitsInTick() / time->getTimeScale() + rce->hrdTiming->cpbRemovalTime;
> }
> }
and this needs a comment
> + m_startEndOrder.incr();
> rce->isActive = false;
> return 0;
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list