[x265] [PATCH] stats: count of each CU partition per frame
Steve Borho
steve at borho.org
Wed Jul 1 17:55:48 CEST 2015
On 07/01, Divya Manivannan wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1435741578 -19800
> # Wed Jul 01 14:36:18 2015 +0530
> # Node ID e4ac2778e85bb86923842f3dea5a4a4f8e88b057
> # Parent 2f345c1c0d8e2351e5aaae5f3e0e017b5810f32e
> stats: count of each CU partition per frame
>
> diff -r 2f345c1c0d8e -r e4ac2778e85b doc/reST/api.rst
> --- a/doc/reST/api.rst Tue Jun 30 13:08:15 2015 -0500
> +++ b/doc/reST/api.rst Wed Jul 01 14:36:18 2015 +0530
> @@ -338,10 +338,6 @@
> Cleanup
> =======
>
> - /* x265_encoder_log:
> - * This function is now deprecated */
> - void x265_encoder_log(x265_encoder *encoder, int argc, char **argv);
> -
> Finally, the encoder must be closed in order to free all of its
> resources. An encoder that has been flushed cannot be restarted and
> reused. Once **x265_encoder_close()** has been called, the encoder
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/common/framedata.h
> --- a/source/common/framedata.h Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/common/framedata.h Wed Jul 01 14:36:18 2015 +0530
> @@ -34,6 +34,9 @@
> class PicYuv;
> class JobProvider;
>
> +#define INTER_MODES 4
> +#define INTRA_MODES 3
How are there three intra modes and only four inter modes? At the least,
these deserve comments.
> /* Current frame stats for 2 pass */
> struct FrameStats
> {
> @@ -49,6 +52,24 @@
> double percent8x8Intra;
> double percent8x8Inter;
> double percent8x8Skip;
> + double percentIntraNxN;
> + double percentSkipCu[4];
> + double percentInterCu[4];
> + double percentIntraDistribution[4][INTRA_MODES];
> + double percentInterDistribution[4][2];
> +
> + uint64_t cntIntraNxN;
> + uint64_t totalCu;
> + uint64_t cntSkipCu[4];
> + uint64_t cntInter[4];
> + uint64_t cntIntra[4];
> + uint64_t cuInterDistribution[4][INTER_MODES];
> + uint64_t cuIntraDistribution[4][INTRA_MODES];
all these 4's should be max-depth macros
> + FrameStats()
> + {
> + memset(this, 0, sizeof(FrameStats));
> + }
> };
>
> /* Per-frame data that is used during encodes and referenced while the picture
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/encoder/encoder.cpp Wed Jul 01 14:36:18 2015 +0530
> @@ -1163,6 +1163,19 @@
> else
> frameStats->avgWPP = 1;
> frameStats->countRowBlocks = curEncoder->m_countRowBlocks;
> +
> + frameStats->cuStats.percentIntraNxN = curFrame->m_encData->m_frameStats.percentIntraNxN;
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + frameStats->cuStats.percentInterCu[depth] = curFrame->m_encData->m_frameStats.percentInterCu[depth];
> + frameStats->cuStats.percentSkipCu[depth] = curFrame->m_encData->m_frameStats.percentSkipCu[depth];
> + for (int n = 0; n < INTRA_MODES; n++)
> + {
> + if (n < 2)
> + frameStats->cuStats.percentInterDistribution[depth][n] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][n];
All this is to avoid two for-loops? I find this unnecessarily confusing.
And why use a hard-coded 2 here when you have INTER_MODES?
Having a conditional inside a for-loop is slower than having two
for-loops, particularly when they are this trivial; the compiler will
generally unroll short loops of fixed size.
> + frameStats->cuStats.percentIntraDistribution[depth][n] = curFrame->m_encData->m_frameStats.percentIntraDistribution[depth][n];
> + }
> + }
> }
> }
>
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/encoder/frameencoder.cpp Wed Jul 01 14:36:18 2015 +0530
> @@ -583,6 +583,36 @@
> m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
> m_frame->m_encData->m_frameStats.percent8x8Skip = (double)totalSkip / totalCuCount;
> }
> + for (uint32_t i = 0; i < m_numRows; i++)
> + {
> + m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN;
> + m_frame->m_encData->m_frameStats.totalCu += m_rows[i].rowStats.totalCu;
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + m_frame->m_encData->m_frameStats.cntInter[depth] += m_rows[i].rowStats.cntInter[depth];
> + m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
> + for (int m = 0; m < INTER_MODES; m++)
> + {
> + if (m < INTRA_MODES)
> + m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][m] += m_rows[i].rowStats.cuIntraDistribution[depth][m];
> + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
> + }
this indexing is also confusing.
> + }
> + }
> + m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + m_frame->m_encData->m_frameStats.percentSkipCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> + m_frame->m_encData->m_frameStats.percentInterCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntInter[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> + uint64_t cuInterCnt = 0;
> + for (int n = 0; n < INTRA_MODES; n++)
> + {
> + m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> + cuInterCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][n + 1];
Again, the indexing is making my head spin:
m_frameStats.cuInterDistribution[depth][n + 1], where n is the intra
mode index?
> + }
> + m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> + m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
> + }
>
> m_bs.resetBits();
> m_entropyCoder.load(m_initSliceContext);
> @@ -838,13 +868,6 @@
> const uint32_t lineStartCUAddr = row * numCols;
> bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
>
> - /* These store the count of inter, intra and skip cus within quad tree structure of each CTU */
> - uint32_t qTreeInterCnt[NUM_CU_DEPTH];
> - uint32_t qTreeIntraCnt[NUM_CU_DEPTH];
> - uint32_t qTreeSkipCnt[NUM_CU_DEPTH];
> - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> - qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
> -
> while (curRow.completed < numCols)
> {
> ProfileScopeEvent(encodeCTU);
> @@ -916,28 +939,43 @@
> // Completed CU processing
> curRow.completed++;
>
> - if (m_param->rc.bStatWrite)
> - curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, qTreeInterCnt, qTreeIntraCnt, qTreeSkipCnt);
> - else if (m_param->rc.aqMode)
> + FrameStats frameLog;
> + collectCTUStatistics(*ctu, &frameLog);
if collectCTUStatistics() is going to be called unconditionally, why not
have it return the total QP like it used to, so we don't need to call
calcCTUQP() anymore? That function was only necessary because
collectCTUStatistics() could have been skipped.
> + if (m_param->rc.aqMode)
> curEncData.m_rowStat[row].sumQpAq += calcCTUQP(*ctu);
>
> // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
> if (m_param->rc.bStatWrite)
> {
> - curRow.rowStats.mvBits += best.mvBits;
> + curRow.rowStats.mvBits += best.mvBits;
> curRow.rowStats.coeffBits += best.coeffBits;
> - curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
> + curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
>
> for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> {
> /* 1 << shift == number of 8x8 blocks at current depth */
> int shift = 2 * (g_maxCUDepth - depth);
> - curRow.rowStats.intra8x8Cnt += qTreeIntraCnt[depth] << shift;
> - curRow.rowStats.inter8x8Cnt += qTreeInterCnt[depth] << shift;
> - curRow.rowStats.skip8x8Cnt += qTreeSkipCnt[depth] << shift;
>
> - // clear the row cu data from thread local object
> - qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
> + if (depth != 3)
a hard-coded depth here is not right, what if the CTU size is 32 or 16?
You have to check that this is the max-depth.
> + curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] << shift);
> + else
> + curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
> +
> + curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
> + curRow.rowStats.skip8x8Cnt += (int)(frameLog.cntSkipCu[depth] << shift);
> + }
> + }
> + curRow.rowStats.cntIntraNxN += frameLog.cntIntraNxN;
> + curRow.rowStats.totalCu += frameLog.totalCu;
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + curRow.rowStats.cntInter[depth] += frameLog.cntInter[depth];
> + curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth];
> + for (int m = 0; m < INTER_MODES; m++)
> + {
> + if (m < INTRA_MODES)
> + curRow.rowStats.cuIntraDistribution[depth][m] += frameLog.cuIntraDistribution[depth][m];
> + curRow.rowStats.cuInterDistribution[depth][m] += frameLog.cuInterDistribution[depth][m];
> }
> }
>
> @@ -1115,11 +1153,8 @@
> }
>
> /* collect statistics about CU coding decisions, return total QP */
> -int FrameEncoder::collectCTUStatistics(const CUData& ctu, uint32_t* qtreeInterCnt, uint32_t* qtreeIntraCnt, uint32_t* qtreeSkipCnt)
> +void FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
> {
> - StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType];
> - int totQP = 0;
> -
> if (ctu.m_slice->m_sliceType == I_SLICE)
> {
> uint32_t depth = 0;
> @@ -1129,14 +1164,11 @@
>
> log->totalCu++;
> log->cntIntra[depth]++;
> - qtreeIntraCnt[depth]++;
> - totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
>
> if (ctu.m_predMode[absPartIdx] == MODE_NONE)
> {
> log->totalCu--;
> log->cntIntra[depth]--;
> - qtreeIntraCnt[depth]--;
> }
> else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
> {
> @@ -1159,24 +1191,14 @@
> depth = ctu.m_cuDepth[absPartIdx];
>
> log->totalCu++;
> - log->cntTotalCu[depth]++;
> - totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
>
> if (ctu.m_predMode[absPartIdx] == MODE_NONE)
> - {
> log->totalCu--;
> - log->cntTotalCu[depth]--;
> - }
> else if (ctu.isSkipped(absPartIdx))
> - {
> - log->totalCu--;
> log->cntSkipCu[depth]++;
> - qtreeSkipCnt[depth]++;
> - }
> else if (ctu.isInter(absPartIdx))
> {
> log->cntInter[depth]++;
> - qtreeInterCnt[depth]++;
>
> if (ctu.m_partSize[absPartIdx] < AMP_ID)
> log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
> @@ -1186,7 +1208,6 @@
> else if (ctu.isIntra(absPartIdx))
> {
> log->cntIntra[depth]++;
> - qtreeIntraCnt[depth]++;
>
> if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
> {
> @@ -1202,8 +1223,6 @@
> }
> }
> }
> -
> - return totQP;
> }
>
> /* iterate over coded CUs and determine total QP */
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/encoder/frameencoder.h Wed Jul 01 14:36:18 2015 +0530
> @@ -49,8 +49,6 @@
>
> #define ANGULAR_MODE_ID 2
> #define AMP_ID 3
> -#define INTER_MODES 4
> -#define INTRA_MODES 3
>
> struct StatisticLog
> {
> @@ -156,7 +154,6 @@
> MD5Context m_state[3];
> uint32_t m_crc[3];
> uint32_t m_checksum[3];
> - StatisticLog m_sliceTypeLog[3]; // per-slice type CU statistics
>
> volatile int m_activeWorkerCount; // count of workers currently encoding or filtering CTUs
> volatile int m_totalActiveWorkerCount; // sum of m_activeWorkerCount sampled at end of each CTU
> @@ -220,7 +217,7 @@
> void encodeSlice();
>
> void threadMain();
> - int collectCTUStatistics(const CUData& ctu, uint32_t* qtreeInterCnt, uint32_t* qtreeIntraCnt, uint32_t* qtreeSkipCnt);
> + void collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
> int calcCTUQP(const CUData& ctu);
> void noiseReductionUpdate();
>
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/x265.cpp
> --- a/source/x265.cpp Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/x265.cpp Wed Jul 01 14:36:18 2015 +0530
> @@ -171,7 +171,41 @@
> fprintf(csvfpt, "RateFactor, ");
> fprintf(csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB), List 0, List 1");
> /* detailed performance statistics */
> - fprintf(csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks\n");
> + fprintf(csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
> + if (csvLogLevel >= 2)
> + {
> + uint32_t size = param->maxCUSize;
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + fprintf(csvfpt, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
> + size /= 2;
> + }
> + fprintf(csvfpt, ", 4x4");
> + size = param->maxCUSize;
> + if (param->bEnableRectInter)
> + {
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + fprintf(csvfpt, ", Inter %dx%d, Inter %dx%d (Rect/Amp)", size, size, size, size);
> + size /= 2;
> + }
> + }
> + else
> + {
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + fprintf(csvfpt, ", Inter %dx%d", size, size);
> + size /= 2;
> + }
> + }
> + size = param->maxCUSize;
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + {
> + fprintf(csvfpt, ", Skip %dx%d", size, size);
> + size /= 2;
> + }
> + }
> + fprintf(csvfpt, "\n");
> }
> else
> fputs(summaryCSVHeader, csvfpt);
> @@ -312,6 +346,24 @@
> }
> fprintf(csvfpt, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
> fprintf(csvfpt, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
> + if (csvLogLevel >= 2)
> + {
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + fprintf(csvfpt, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
> + fprintf(csvfpt, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
> + if (param->bEnableRectInter)
> + {
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + fprintf(csvfpt, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
> + }
> + else
> + {
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + fprintf(csvfpt, ", %5.2lf%%", frameStats->cuStats.percentInterCu[depth]);
> + }
> + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> + fprintf(csvfpt, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
> + }
> fprintf(csvfpt, "\n");
> fflush(stderr);
> }
> @@ -703,17 +755,6 @@
> if (cliopt.reconPlayCmd)
> reconPlay = new ReconPlay(cliopt.reconPlayCmd, *param);
>
> - if (cliopt.csvfn)
> - {
> - if (cliopt.parseCSVFile())
> - {
> - cliopt.destroy();
> - if (cliopt.api)
> - cliopt.api->param_free(cliopt.param);
> - exit(5);
> - }
> - }
> -
> /* note: we could try to acquire a different libx265 API here based on
> * the profile found during option parsing, but it must be done before
> * opening an encoder */
> @@ -731,6 +772,17 @@
> /* get the encoder parameters post-initialization */
> api->encoder_parameters(encoder, param);
>
> + if (cliopt.csvfn)
> + {
> + if (cliopt.parseCSVFile())
> + {
> + cliopt.destroy();
> + if (cliopt.api)
> + cliopt.api->param_free(cliopt.param);
> + exit(5);
> + }
> + }
> +
> /* Control-C handler */
> if (signal(SIGINT, sigint_handler) == SIG_ERR)
> x265_log(param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s\n", strerror(errno));
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/x265.h
> --- a/source/x265.h Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/x265.h Wed Jul 01 14:36:18 2015 +0530
> @@ -100,6 +100,16 @@
> uint32_t numPartitions;
> } x265_analysis_data;
>
> +/* cu statistics */
> +typedef struct x265_cu_stats
> +{
> + double percentIntraNxN;
> + double percentInterCu[4];
> + double percentSkipCu[4];
> + double percentInterDistribution[4][2];
> + double percentIntraDistribution[4][3];
> +} x265_cu_stats;
Here you must document what these arrays contain. Just guessing at it,
it looks like it is missing substantial data. How do we differentiate
between merge, 2Nx2N, rect, and amp? How do we differentiate between
merge/no-cbf and inter/no-cbf?
I'm guessing that percentIntraCu[0] can be derived as 1 -
percentInterCu[0] - percentSkipCu[0].
percentIntraNxN probably wants to be last.
> /* Frame level statistics */
> typedef struct x265_frame_stats
> {
> @@ -124,6 +134,7 @@
> int list0POC[16];
> int list1POC[16];
> char sliceType;
> + x265_cu_stats cuStats;
> } x265_frame_stats;
this is a binary change to the public API, needs a build bump
> /* Used to pass pictures into the encoder, and to get picture data back out of
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list