[x265] [PATCH RFC] rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr
Steve Borho
steve at borho.org
Thu Jul 17 19:26:21 CEST 2014
On 07/17, santhoshini at multicorewareinc.com wrote:
> # HG changeset patch
> # User Santhoshini Sekar <santhoshini at multicorewareinc.com>
> # Date 1405596502 -19800
> # Thu Jul 17 16:58:22 2014 +0530
> # Node ID 92db1b9ee818ff75cea743cb4c4fedd0ab93a9a6
> # Parent d850cbf81e0f4831d8dcf89db83561969e456205
> rc: update ratecontrol stats in every frame, avoid frame parallelism lag in abr
>
> RateControl statistics are updated for every frame when refLagRows number of
> rows are completed in processRowEncoder. With this updated data rateControl
> predicts more accurate QP
I'm afraid this doesn't apply cleanly on the current tip.
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/analysis.cpp Thu Jul 17 16:58:22 2014 +0530
> @@ -65,7 +65,7 @@
>
> m_rdCost.setPsyRdScale(m_param->psyRd);
> m_bEnableRDOQ = top->m_bEnableRDOQ;
> - m_bFrameParallel = top->m_totalFrameThreads > 1;
> + m_bFrameParallel = m_param->frameNumThreads > 1;
> m_numLayers = top->m_quadtreeTULog2MaxSize - top->m_quadtreeTULog2MinSize + 1;
>
> return initSearch();
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/encoder.cpp Thu Jul 17 16:58:22 2014 +0530
> @@ -150,7 +150,7 @@
>
> if (m_frameEncoder)
> {
> - for (int i = 0; i < m_totalFrameThreads; i++)
> + for (int i = 0; i < m_param->frameNumThreads; i++)
> {
> // Ensure frame encoder is idle before destroying it
> m_frameEncoder[i].getEncodedPicture(m_nalList);
> @@ -320,20 +320,6 @@
> else
> m_lookahead->flush();
>
> - if (m_param->rc.rateControlMode == X265_RC_ABR)
> - {
> - // delay frame parallelism for non-VBV ABR
> - if (m_pocLast == 0 && !m_param->rc.vbvBufferSize && !m_param->rc.vbvMaxBitrate)
> - m_param->frameNumThreads = 1;
> - else if (m_param->frameNumThreads != m_totalFrameThreads)
> - {
> - // re-enable frame parallelism after the first few P frames are encoded
> - uint32_t frameCnt = (uint32_t)((0.5 * m_param->fpsNum / m_param->fpsDenom) / (m_param->bframes + 1));
> - if (m_analyzeP.m_numPics > frameCnt)
> - m_param->frameNumThreads = m_totalFrameThreads;
> - }
> - }
> -
> FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
> m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
> int ret = 0;
> @@ -402,26 +388,11 @@
> if (bChroma)
> m_numChromaWPBiFrames++;
> }
> -
> - uint64_t bytes = 0;
> - for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> + if (m_aborted == true)
> {
> - int type = m_nalList.m_nal[i].type;
> -
> - // exclude SEI
> - if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> - {
> - bytes += m_nalList.m_nal[i].sizeBytes;
> - // and exclude start code prefix
> - bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> - }
> - }
> - if (m_rateControl->rateControlEnd(out, bytes << 3, &curEncoder->m_rce, &curEncoder->m_frameStats) < 0)
> - {
> - m_aborted = true;
> return -1;
> }
> - finishFrameStats(out, curEncoder, bytes << 3);
> + finishFrameStats(out, curEncoder, curEncoder->m_accessUnitBits);
> // Allow this frame to be recycled if no frame encoders are using it for reference
> if (!pic_out)
> {
> @@ -474,12 +445,17 @@
> // determine references, setup RPS, etc
> m_dpb->prepareEncode(fenc);
>
> - // set slice QP
> - m_rateControl->rateControlStart(fenc, m_lookahead, &curEncoder->m_rce, this);
>
> // Allow FrameEncoder::compressFrame() to start in a worker thread
> curEncoder->m_enable.trigger();
> }
> + else if (!fenc && m_encodedFrameNum > 0)
> + {
> + // faked rateControlStart calls to avoid rateControlEnd of last frameNumThreads parallel frames from waiting
> + RateControlEntry rce;
> + rce.encodeOrder = m_encodedFrameNum++;
> + m_rateControl->rateControlStart(NULL, m_lookahead, &rce, this);
> + }
>
> return ret;
> }
> @@ -1229,7 +1205,6 @@
> {
> x265_log(p, X265_LOG_INFO, "Warning: picture-based SAO used with frame parallelism\n");
> }
> - m_totalFrameThreads = m_param->frameNumThreads;
>
> if (p->keyframeMax < 0)
> {
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/encoder.h
> --- a/source/encoder/encoder.h Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/encoder.h Thu Jul 17 16:58:22 2014 +0530
> @@ -71,7 +71,6 @@
> {
> private:
>
> - bool m_aborted; // fatal error detected
> int m_pocLast; ///< time index (POC)
> int m_encodedFrameNum;
> int m_outputCount;
> @@ -82,7 +81,6 @@
> int64_t m_prevReorderedPts[2];
>
> ThreadPool* m_threadPool;
> - Lookahead* m_lookahead;
> FrameEncoder* m_frameEncoder;
> DPB* m_dpb;
>
> @@ -90,15 +88,6 @@
>
> int m_curEncoder;
>
> -
> - /* Collect statistics globally */
> - EncStats m_analyzeAll;
> - EncStats m_analyzeI;
> - EncStats m_analyzeP;
> - EncStats m_analyzeB;
> - FILE* m_csvfpt;
> - int64_t m_encodeStartTime;
> -
> // quality control
> TComScalingList m_scalingList; ///< quantization matrix information
>
> @@ -141,6 +130,18 @@
> //====== Tool list ========
> int m_lastBPSEI;
>
> + /* Collect statistics globally */
> + EncStats m_analyzeAll;
> + EncStats m_analyzeI;
> + EncStats m_analyzeP;
> + EncStats m_analyzeB;
> + FILE* m_csvfpt;
> + int64_t m_encodeStartTime;
> +
> + Lookahead* m_lookahead;
> +
> + bool m_aborted; // fatal error detected
> +
> uint32_t m_log2ParallelMergeLevelMinus2; ///< Parallel merge estimation region
>
> int m_useScalingListId; ///< Using quantization matrix i.e. 0=off, 1=default.
> @@ -165,8 +166,6 @@
> Window m_conformanceWindow;
> Window m_defaultDisplayWindow;
>
> - int m_totalFrameThreads;
> -
> uint32_t m_numDelayedPic;
>
> Encoder();
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/frameencoder.cpp Thu Jul 17 16:58:22 2014 +0530
> @@ -445,26 +445,6 @@
> m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
> }
>
> - int qp = slice->getSliceQp();
> -
> - int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> - int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> -
> - double lambda = x265_lambda2_tab[qp];
> - /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> - double chromaLambda = x265_lambda2_tab[qpCb];
> -
> - // NOTE: set SAO lambda every Frame
> - m_frameFilter.m_sao.lumaLambda = lambda;
> - m_frameFilter.m_sao.chromaLambda = chromaLambda;
> -
> - // Clip qps back to 0-51 range before encoding
> - qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> - slice->setSliceQp(qp);
> - m_frame->m_avgQpAq = qp;
> - slice->setSliceQpDelta(0);
> - slice->setSliceQpDeltaCb(0);
> - slice->setSliceQpDeltaCr(0);
>
> switch (slice->getSliceType())
> {
> @@ -622,6 +602,23 @@
> }
> }
>
> + uint64_t bytes = 0;
> + for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
> + {
> + int type = m_nalList.m_nal[i].type;
> +
> + // exclude SEI
> + if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
> + {
> + bytes += m_nalList.m_nal[i].sizeBytes;
> + // and exclude start code prefix
> + bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
> + }
> + }
> + m_accessUnitBits = bytes << 3;
> + if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
> + m_top->m_aborted = true;
> +
> noiseReductionUpdate();
>
> m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
> @@ -720,6 +717,27 @@
> PPAScopeEvent(FrameEncoder_compressRows);
> TComSlice* slice = m_frame->getSlice();
>
> + // set slice QP
> + m_top->m_rateControl->rateControlStart(m_frame, m_top->m_lookahead, &m_rce, m_top);
> + int qp = slice->getSliceQp();
> +
> + int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
> + int qpCb = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
> + double lambda = x265_lambda2_tab[qp];
> + /* Assuming qpCb and qpCr are the same, since SAO takes only a single chroma lambda. TODO: Check why */
> + double chromaLambda = x265_lambda2_tab[qpCb];
> +
> + // NOTE: set SAO lambda every Frame
> + m_frameFilter.m_sao.lumaLambda = lambda;
> + m_frameFilter.m_sao.chromaLambda = chromaLambda;
> +
> + // Clip qps back to 0-51 range before encoding
> + qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
> + slice->setSliceQp(qp);
> + m_frame->m_avgQpAq = qp;
> + slice->setSliceQpDelta(0);
> + slice->setSliceQpDeltaCb(0);
> + slice->setSliceQpDeltaCr(0);
> // reset entropy coders
> m_sbacCoder.resetEntropy(slice);
> for (int i = 0; i < this->m_numRows; i++)
> @@ -1047,7 +1065,34 @@
> }
> }
>
> + /* when a frame is half way through, update bits and complexity in rate control
> + * for it to be available for the next frame's QScale calculation. This makes it
> + * more accurate with updated value */
> + int rowCount = 0;
> +
> + /* for the first two seconds update when the frame is half done and for rest
> + * of the sequence update when refLagRows are completed */
> + if (m_param->rc.rateControlMode == X265_RC_ABR)
> + {
> + if (m_top->m_analyzeAll.m_numPics <= 2 * (m_param->fpsNum / m_param->fpsDenom))
I believe reading m_analyzeAll here is a small race hazard. Can't we just
base this on the frame's own encode ordinal?
> + rowCount = m_numRows/2;
> + else
> + rowCount = m_refLagRows;
> + }
> +
> // this row of CTUs has been encoded
> + if (row == rowCount)
> + {
> + int64_t bits = 0;
> + for(uint32_t col = 0; col < rowCount * numCols; col++)
> + {
> + TComDataCU* cu = m_frame->getCU(col);
> + bits += cu->m_totalBits;
> + }
> +
> + m_rce.rowTotalBits = bits;
> + m_top->m_rateControl->rateControlUpdateStats(&m_rce);
> + }
>
> // trigger row-wise loop filters
> if (row >= m_filterRowDelay)
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/frameencoder.h Thu Jul 17 16:58:22 2014 +0530
> @@ -137,6 +137,8 @@
> FrameStats m_frameStats; // stats of current frame for multipass encodes
> volatile bool m_bAllRowsStop;
> volatile int m_vbvResetTriggerRow;
> + Frame* m_frame;
> + uint64_t m_accessUnitBits;
>
> protected:
>
> @@ -155,7 +157,6 @@
> NALList m_nalList;
> ThreadLocalData m_tld;
>
> - Frame* m_frame;
>
> int m_filterRowDelay;
> int m_filterRowDelayCus;
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/ratecontrol.cpp Thu Jul 17 16:58:22 2014 +0530
> @@ -296,10 +296,13 @@
>
> // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
> m_residualFrames = 0;
> + m_partialResidualFrames = 0;
> m_residualCost = 0;
> + m_partialResidualCost = 0;
> m_rateFactorMaxIncrement = 0;
> m_rateFactorMaxDecrement = 0;
> m_fps = m_param->fpsNum / m_param->fpsDenom;
> + m_startEndOrder.set(0);
> if (m_param->rc.rateControlMode == X265_RC_CRF)
> {
> m_param->rc.qp = (int)m_param->rc.rfConstant;
> @@ -666,6 +669,7 @@
> m_totalBits = 0;
> m_framesDone = 0;
> m_residualCost = 0;
> + m_partialResidualCost = 0;
>
> /* 720p videos seem to be a good cutoff for cplxrSum */
> double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1;
> @@ -979,6 +983,19 @@
>
> void RateControl::rateControlStart(Frame* pic, Lookahead *l, RateControlEntry* rce, Encoder* enc)
> {
> + int orderValue = m_startEndOrder.get();
> + int startOrdinal = rce->encodeOrder * 2;
> +
> + while (orderValue != startOrdinal && pic)
> + orderValue = m_startEndOrder.waitForChange(orderValue);
> +
> + if (!pic)
> + {
> + // faked rateControlStart calls
> + m_startEndOrder.incr();
> + return;
> + }
> +
> m_curSlice = pic->getSlice();
> m_sliceType = m_curSlice->getSliceType();
> rce->sliceType = m_sliceType;
> @@ -991,6 +1008,8 @@
> rce->bLastMiniGopBFrame = pic->m_lowres.bLastMiniGopBFrame;
> rce->bufferRate = m_bufferRate;
> rce->poc = m_curSlice->getPOC();
> + rce->rowCplxrSum = 0.0;
> + rce->rowTotalBits = 0;
> if (m_isVbv)
> {
> if (rce->rowPreds[0][0].count == 0)
> @@ -1044,6 +1063,8 @@
> m_qp = Clip3(MIN_QP, MAX_MAX_QP, m_qp);
> rce->qpaRc = pic->m_avgQpRc = pic->m_avgQpAq = m_qp;
> }
> + // Do not increment m_startEndOrder here. Make rateControlEnd of previous thread
> + // to wait until rateControlUpdateStats of this frame is called
> m_framesDone++;
> /* set the final QP to slice structure */
> m_curSlice->setSliceQp(m_qp);
> @@ -1278,7 +1299,7 @@
> /* use framesDone instead of POC as poc count is not serial with bframes enabled */
> double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
> wantedBits = timeDone * m_bitrate;
> - if (wantedBits > 0 && m_totalBits > 0 && !m_residualFrames)
> + if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames)
> {
> abrBuffer *= X265_MAX(1, sqrt(timeDone));
> overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer);
> @@ -1300,7 +1321,7 @@
> double lqmin = 0, lqmax = 0;
> lqmin = m_lastQScaleFor[m_sliceType] / m_lstep;
> lqmax = m_lastQScaleFor[m_sliceType] * m_lstep;
> - if (!m_residualFrames)
> + if (!m_partialResidualFrames)
> {
> if (overflow > 1.1 && m_framesDone > 3)
> lqmax *= m_lstep;
> @@ -1342,16 +1363,17 @@
> if (rce->sliceType == I_SLICE)
> {
> /* previous I still had a residual; roll it into the new loan */
> - if (m_residualFrames)
> - rce->rowTotalBits += m_residualCost * m_residualFrames;
> + if (m_partialResidualFrames)
> + rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames;
>
> - m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
> - m_residualCost = (int)((rce->rowTotalBits * s_amortizeFraction) / m_residualFrames);
> - rce->rowTotalBits -= m_residualCost * m_residualFrames;
> + m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax);
> + m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames);
> + rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames;
> }
> - else if (m_residualFrames)
> + else if (m_partialResidualFrames)
> {
> - rce->rowTotalBits += m_residualCost;
> + rce->rowTotalBits += m_partialResidualCost;
> + m_partialResidualFrames--;
> }
>
> if (rce->sliceType != B_SLICE)
> @@ -1361,6 +1383,12 @@
>
> m_cplxrSum += rce->rowCplxrSum;
> m_totalBits += rce->rowTotalBits;
> +
> + /* delay incrementing m_startEndOrder until here to sync with rateControlStart() */
"do not allow the next frame to enter rateControlStart() until this
frame has updated its mid-frame statistics"
> + m_startEndOrder.incr();
> +
> + if (rce->encodeOrder < m_param->frameNumThreads - 1)
> + m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
> }
>
> void RateControl::checkAndResetABR(RateControlEntry* rce, bool isFrameDone)
> @@ -1820,6 +1848,11 @@
> /* After encoding one frame, update rate control state */
> int RateControl::rateControlEnd(Frame* pic, int64_t bits, RateControlEntry* rce, FrameStats* stats)
> {
> + int orderValue = m_startEndOrder.get();
> + int endOrdinal = (rce->encodeOrder + m_param->frameNumThreads) * 2 - 1;
> + while (orderValue != endOrdinal)
> + orderValue = m_startEndOrder.waitForChange(orderValue);
> +
> int64_t actualBits = bits;
> if (m_isAbr)
> {
> @@ -1919,17 +1952,19 @@
> }
> }
> if (rce->sliceType != B_SLICE)
> + {
> /* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low
> * to improve short term compensation for next frame. */
> - m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq;
> + m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq) - (rce->rowCplxrSum);
> + }
> else
> {
> /* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
> * Not perfectly accurate with B-refs, but good enough. */
> - m_cplxrSum += bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor));
> + m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor))) - (rce->rowCplxrSum);
> }
> m_wantedBitsWindow += m_frameDuration * m_bitrate;
> - m_totalBits += bits;
> + m_totalBits += bits - rce->rowTotalBits;
> }
> }
>
> @@ -1973,6 +2008,8 @@
> rce->hrdTiming->dpbOutputTime = (double)rce->picTimingSEI->m_picDpbOutputDelay * time->numUnitsInTick / time->timeScale + rce->hrdTiming->cpbRemovalTime;
> }
> }
> + // Allow rateControlStart of next frame only when rateControlEnd of previous frame is over
> + m_startEndOrder.incr();
> rce->isActive = false;
> return 0;
>
> diff -r d850cbf81e0f -r 92db1b9ee818 source/encoder/ratecontrol.h
> --- a/source/encoder/ratecontrol.h Tue Jul 15 22:47:54 2014 -0500
> +++ b/source/encoder/ratecontrol.h Thu Jul 17 16:58:22 2014 +0530
> @@ -147,6 +147,14 @@
> int64_t m_totalBits; /* total bits used for already encoded frames */
> int m_framesDone; /* # of frames passed through RateCotrol already */
> double m_fps;
> +
> + /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
> + * sync the calls to these functions.
I suspect the text below would be better replaced by a simple example for
-F2:
rceStart 10
rceUpdate 10
rceEnd 9
rceStart 11
rceUpdate 11
rceEnd 10
rceStart 12
rceUpdate 12
rceEnd 11
> By waiting on this variable these functions proceed in the order as same as
> + * encode order i.e wait until the variable matches the encode order. rateControlStart of next frame will
> + * not happen until previous frame updates. And rateControlEnd of previous frame will not happen until the
> + * current frame updates */
> + ThreadSafeInteger m_startEndOrder;
> +
> /* hrd stuff */
> SEIBufferingPeriod m_bufPeriodSEI;
> double m_nominalRemovalTime;
> @@ -185,7 +193,9 @@
> static const char *s_defaultStatFileName;
>
> int m_residualFrames;
> + int m_partialResidualFrames;
> int m_residualCost;
> + int m_partialResidualCost;
>
> double getQScale(RateControlEntry *rce, double rateFactor);
> double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list