[x265] [PATCH] stats: count of each CU partition per frame

Wed Jul 1 17:55:48 CEST 2015

On 07/01, Divya Manivannan wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1435741578 -19800
> #      Wed Jul 01 14:36:18 2015 +0530
> # Node ID e4ac2778e85bb86923842f3dea5a4a4f8e88b057
> # Parent  2f345c1c0d8e2351e5aaae5f3e0e017b5810f32e
> stats: count of each CU partition per frame
> 
> diff -r 2f345c1c0d8e -r e4ac2778e85b doc/reST/api.rst
> --- a/doc/reST/api.rst	Tue Jun 30 13:08:15 2015 -0500
> +++ b/doc/reST/api.rst	Wed Jul 01 14:36:18 2015 +0530
> @@ -338,10 +338,6 @@
>  Cleanup
>  =======
>  
> -	/* x265_encoder_log:
> -	 *       This function is now deprecated */
> -	void x265_encoder_log(x265_encoder *encoder, int argc, char **argv);
> -
>  Finally, the encoder must be closed in order to free all of its
>  resources. An encoder that has been flushed cannot be restarted and
>  reused. Once **x265_encoder_close()** has been called, the encoder
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/common/framedata.h
> --- a/source/common/framedata.h	Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/common/framedata.h	Wed Jul 01 14:36:18 2015 +0530
> @@ -34,6 +34,9 @@
>  class PicYuv;
>  class JobProvider;
>  
> +#define INTER_MODES 4
> +#define INTRA_MODES 3

How are there three intra modes and only 4 inter modes? At the least,
these deserve comments.

>  /* Current frame stats for 2 pass */
>  struct FrameStats
>  {
> @@ -49,6 +52,24 @@
>      double      percent8x8Intra;
>      double      percent8x8Inter;
>      double      percent8x8Skip;
> +    double      percentIntraNxN;
> +    double      percentSkipCu[4];
> +    double      percentInterCu[4];
> +    double      percentIntraDistribution[4][INTRA_MODES];
> +    double      percentInterDistribution[4][2];
> +
> +    uint64_t    cntIntraNxN;
> +    uint64_t    totalCu;
> +    uint64_t    cntSkipCu[4];
> +    uint64_t    cntInter[4];
> +    uint64_t    cntIntra[4];
> +    uint64_t    cuInterDistribution[4][INTER_MODES];
> +    uint64_t    cuIntraDistribution[4][INTRA_MODES];

all these 4's should be max-depth macros

> +    FrameStats()
> +    {
> +        memset(this, 0, sizeof(FrameStats));
> +    }
>  };
>  
>  /* Per-frame data that is used during encodes and referenced while the picture
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp	Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/encoder/encoder.cpp	Wed Jul 01 14:36:18 2015 +0530
> @@ -1163,6 +1163,19 @@
>          else
>              frameStats->avgWPP = 1;
>          frameStats->countRowBlocks = curEncoder->m_countRowBlocks;
> +
> +        frameStats->cuStats.percentIntraNxN = curFrame->m_encData->m_frameStats.percentIntraNxN;
> +        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +        {
> +            frameStats->cuStats.percentInterCu[depth] = curFrame->m_encData->m_frameStats.percentInterCu[depth];
> +            frameStats->cuStats.percentSkipCu[depth]  = curFrame->m_encData->m_frameStats.percentSkipCu[depth];
> +            for (int n = 0; n < INTRA_MODES; n++)
> +            {
> +                if (n < 2)
> +                    frameStats->cuStats.percentInterDistribution[depth][n] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][n];

All this is to avoid two for-loops? I find this unnecessarily confusing.
And why use a hard-coded 2 here when you have INTER_MODES?

Having a conditional inside a for-loop is slower than having two
for-loops, particularly when they are this trivial; the compiler will
generally unroll short loops of fixed size.

> +                frameStats->cuStats.percentIntraDistribution[depth][n] = curFrame->m_encData->m_frameStats.percentIntraDistribution[depth][n];
> +            }
> +        }
>      }
>  }
>  
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp	Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/encoder/frameencoder.cpp	Wed Jul 01 14:36:18 2015 +0530
> @@ -583,6 +583,36 @@
>          m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
>          m_frame->m_encData->m_frameStats.percent8x8Skip  = (double)totalSkip / totalCuCount;
>      }
> +    for (uint32_t i = 0; i < m_numRows; i++)
> +    {
> +        m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN;
> +        m_frame->m_encData->m_frameStats.totalCu     += m_rows[i].rowStats.totalCu;
> +        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +        {
> +            m_frame->m_encData->m_frameStats.cntInter[depth]  += m_rows[i].rowStats.cntInter[depth];
> +            m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
> +            for (int m = 0; m < INTER_MODES; m++)
> +            {
> +                if (m < INTRA_MODES)
> +                    m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][m] += m_rows[i].rowStats.cuIntraDistribution[depth][m];
> +                m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
> +            }

this indexing is also confusing.

> +        }
> +    }
> +    m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
> +    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +    {
> +        m_frame->m_encData->m_frameStats.percentSkipCu[depth]  = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> +        m_frame->m_encData->m_frameStats.percentInterCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntInter[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> +        uint64_t cuInterCnt = 0;
> +        for (int n = 0; n < INTRA_MODES; n++)
> +        {
> +            m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> +            cuInterCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][n + 1];

Again, the indexing is making my head spin:
m_frameStats.cuInterDistribution[depth][n + 1], where n iterates over the
intra modes?

> +        }
> +        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
> +        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
> +    }
>  
>      m_bs.resetBits();
>      m_entropyCoder.load(m_initSliceContext);
> @@ -838,13 +868,6 @@
>      const uint32_t lineStartCUAddr = row * numCols;
>      bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
>  
> -    /* These store the count of inter, intra and skip cus within quad tree structure of each CTU */
> -    uint32_t qTreeInterCnt[NUM_CU_DEPTH];
> -    uint32_t qTreeIntraCnt[NUM_CU_DEPTH];
> -    uint32_t qTreeSkipCnt[NUM_CU_DEPTH];
> -    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> -        qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
> -
>      while (curRow.completed < numCols)
>      {
>          ProfileScopeEvent(encodeCTU);
> @@ -916,28 +939,43 @@
>          // Completed CU processing
>          curRow.completed++;
>  
> -        if (m_param->rc.bStatWrite)
> -            curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, qTreeInterCnt, qTreeIntraCnt, qTreeSkipCnt);
> -        else if (m_param->rc.aqMode)
> +        FrameStats frameLog;
> +        collectCTUStatistics(*ctu, &frameLog);

if collectCTUStatistics() is going to be called unconditionally, why not
have it return the total QP like it used to, so we don't need to call
calcCTUQP() anymore? That function was only necessary because
collectCTUStatistics() could have been skipped.

> +        if (m_param->rc.aqMode)
>              curEncData.m_rowStat[row].sumQpAq += calcCTUQP(*ctu);
>  
>          // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
>          if (m_param->rc.bStatWrite)
>          {
> -            curRow.rowStats.mvBits += best.mvBits;
> +            curRow.rowStats.mvBits    += best.mvBits;
>              curRow.rowStats.coeffBits += best.coeffBits;
> -            curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
> +            curRow.rowStats.miscBits  += best.totalBits - (best.mvBits + best.coeffBits);
>  
>              for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
>              {
>                  /* 1 << shift == number of 8x8 blocks at current depth */
>                  int shift = 2 * (g_maxCUDepth - depth);
> -                curRow.rowStats.intra8x8Cnt += qTreeIntraCnt[depth] << shift;
> -                curRow.rowStats.inter8x8Cnt += qTreeInterCnt[depth] << shift;
> -                curRow.rowStats.skip8x8Cnt  += qTreeSkipCnt[depth] << shift;
>  
> -                // clear the row cu data from thread local object
> -                qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
> +                if (depth != 3)

A hard-coded depth here is not right; what if the CTU size is 32 or 16?
You have to check that this is the max-depth.

> +                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] << shift);
> +                else
> +                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
> +
> +                curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
> +                curRow.rowStats.skip8x8Cnt  += (int)(frameLog.cntSkipCu[depth] << shift);
> +            }
> +        }
> +        curRow.rowStats.cntIntraNxN += frameLog.cntIntraNxN;
> +        curRow.rowStats.totalCu     += frameLog.totalCu;
> +        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +        {
> +            curRow.rowStats.cntInter[depth]  += frameLog.cntInter[depth];
> +            curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth];
> +            for (int m = 0; m < INTER_MODES; m++)
> +            {
> +                if (m < INTRA_MODES)
> +                    curRow.rowStats.cuIntraDistribution[depth][m] += frameLog.cuIntraDistribution[depth][m];
> +                curRow.rowStats.cuInterDistribution[depth][m] += frameLog.cuInterDistribution[depth][m];
>              }
>          }
>  
> @@ -1115,11 +1153,8 @@
>  }
>  
>  /* collect statistics about CU coding decisions, return total QP */
> -int FrameEncoder::collectCTUStatistics(const CUData& ctu, uint32_t* qtreeInterCnt, uint32_t* qtreeIntraCnt, uint32_t* qtreeSkipCnt)
> +void FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
>  {
> -    StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType];
> -    int totQP = 0;
> -
>      if (ctu.m_slice->m_sliceType == I_SLICE)
>      {
>          uint32_t depth = 0;
> @@ -1129,14 +1164,11 @@
>  
>              log->totalCu++;
>              log->cntIntra[depth]++;
> -            qtreeIntraCnt[depth]++;
> -            totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
>  
>              if (ctu.m_predMode[absPartIdx] == MODE_NONE)
>              {
>                  log->totalCu--;
>                  log->cntIntra[depth]--;
> -                qtreeIntraCnt[depth]--;
>              }
>              else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
>              {
> @@ -1159,24 +1191,14 @@
>              depth = ctu.m_cuDepth[absPartIdx];
>  
>              log->totalCu++;
> -            log->cntTotalCu[depth]++;
> -            totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
>  
>              if (ctu.m_predMode[absPartIdx] == MODE_NONE)
> -            {
>                  log->totalCu--;
> -                log->cntTotalCu[depth]--;
> -            }
>              else if (ctu.isSkipped(absPartIdx))
> -            {
> -                log->totalCu--;
>                  log->cntSkipCu[depth]++;
> -                qtreeSkipCnt[depth]++;
> -            }
>              else if (ctu.isInter(absPartIdx))
>              {
>                  log->cntInter[depth]++;
> -                qtreeInterCnt[depth]++;
>  
>                  if (ctu.m_partSize[absPartIdx] < AMP_ID)
>                      log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
> @@ -1186,7 +1208,6 @@
>              else if (ctu.isIntra(absPartIdx))
>              {
>                  log->cntIntra[depth]++;
> -                qtreeIntraCnt[depth]++;
>  
>                  if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
>                  {
> @@ -1202,8 +1223,6 @@
>              }
>          }
>      }
> -
> -    return totQP;
>  }
>  
>  /* iterate over coded CUs and determine total QP */
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h	Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/encoder/frameencoder.h	Wed Jul 01 14:36:18 2015 +0530
> @@ -49,8 +49,6 @@
>  
>  #define ANGULAR_MODE_ID 2
>  #define AMP_ID 3
> -#define INTER_MODES 4
> -#define INTRA_MODES 3
>  
>  struct StatisticLog
>  {
> @@ -156,7 +154,6 @@
>      MD5Context               m_state[3];
>      uint32_t                 m_crc[3];
>      uint32_t                 m_checksum[3];
> -    StatisticLog             m_sliceTypeLog[3];     // per-slice type CU statistics
>  
>      volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
>      volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
> @@ -220,7 +217,7 @@
>      void encodeSlice();
>  
>      void threadMain();
> -    int  collectCTUStatistics(const CUData& ctu, uint32_t* qtreeInterCnt, uint32_t* qtreeIntraCnt, uint32_t* qtreeSkipCnt);
> +    void  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
>      int  calcCTUQP(const CUData& ctu);
>      void noiseReductionUpdate();
>  
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/x265.cpp
> --- a/source/x265.cpp	Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/x265.cpp	Wed Jul 01 14:36:18 2015 +0530
> @@ -171,7 +171,41 @@
>                      fprintf(csvfpt, "RateFactor, ");
>                  fprintf(csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
>                  /* detailed performance statistics */
> -                fprintf(csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks\n");
> +                fprintf(csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
> +                if (csvLogLevel >= 2)
> +                {
> +                    uint32_t size = param->maxCUSize;
> +                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                    {
> +                        fprintf(csvfpt, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
> +                        size /= 2;
> +                    }
> +                    fprintf(csvfpt, ", 4x4");
> +                    size = param->maxCUSize;
> +                    if (param->bEnableRectInter)
> +                    {
> +                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                        {
> +                            fprintf(csvfpt, ", Inter %dx%d, Inter %dx%d (Rect/Amp)", size, size, size, size);
> +                            size /= 2;
> +                        }
> +                    }
> +                    else
> +                    {
> +                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                        {
> +                            fprintf(csvfpt, ", Inter %dx%d", size, size);
> +                            size /= 2;
> +                        }
> +                    }
> +                    size = param->maxCUSize;
> +                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                    {
> +                        fprintf(csvfpt, ", Skip %dx%d", size, size);
> +                        size /= 2;
> +                    }
> +                }
> +                fprintf(csvfpt, "\n");
>              }
>              else
>                  fputs(summaryCSVHeader, csvfpt);
> @@ -312,6 +346,24 @@
>          }
>          fprintf(csvfpt, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
>          fprintf(csvfpt, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
> +        if (csvLogLevel >= 2)
> +        {
> +            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                fprintf(csvfpt, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
> +            fprintf(csvfpt, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
> +            if (param->bEnableRectInter)
> +            {
> +                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                    fprintf(csvfpt, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
> +            }
> +            else
> +            {
> +                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                    fprintf(csvfpt, ", %5.2lf%%", frameStats->cuStats.percentInterCu[depth]);
> +            }
> +            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
> +                fprintf(csvfpt, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
> +        }
>          fprintf(csvfpt, "\n");
>          fflush(stderr);
>      }
> @@ -703,17 +755,6 @@
>      if (cliopt.reconPlayCmd)
>          reconPlay = new ReconPlay(cliopt.reconPlayCmd, *param);
>  
> -    if (cliopt.csvfn)
> -    {
> -        if (cliopt.parseCSVFile())
> -        {
> -            cliopt.destroy();
> -            if (cliopt.api)
> -                cliopt.api->param_free(cliopt.param);
> -            exit(5);
> -        }
> -    }
> -
>      /* note: we could try to acquire a different libx265 API here based on
>       * the profile found during option parsing, but it must be done before
>       * opening an encoder */
> @@ -731,6 +772,17 @@
>      /* get the encoder parameters post-initialization */
>      api->encoder_parameters(encoder, param);
>  
> +    if (cliopt.csvfn)
> +    {
> +        if (cliopt.parseCSVFile())
> +        {
> +            cliopt.destroy();
> +            if (cliopt.api)
> +                cliopt.api->param_free(cliopt.param);
> +            exit(5);
> +        }
> +    }
> +
>      /* Control-C handler */
>      if (signal(SIGINT, sigint_handler) == SIG_ERR)
>          x265_log(param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s\n", strerror(errno));
> diff -r 2f345c1c0d8e -r e4ac2778e85b source/x265.h
> --- a/source/x265.h	Tue Jun 30 13:08:15 2015 -0500
> +++ b/source/x265.h	Wed Jul 01 14:36:18 2015 +0530
> @@ -100,6 +100,16 @@
>      uint32_t         numPartitions;
>  } x265_analysis_data;
>  
> +/* cu statistics */
> +typedef struct x265_cu_stats
> +{
> +    double      percentIntraNxN;
> +    double      percentInterCu[4];
> +    double      percentSkipCu[4];
> +    double      percentInterDistribution[4][2];
> +    double      percentIntraDistribution[4][3];
> +} x265_cu_stats;

Here you must document what these arrays contain. Just guessing at it,
it looks like it is missing substantial data. How do we differentiate
between merge, 2Nx2N, rect, and amp? How do we differentiate between
merge/no-cbf and inter/no-cbf?

I'm guessing that percentIntraCu[0] can be derived as 1 -
percentInterCu[0] - percentSkipCu[0].

percentIntraNxN probably wants to be last.

>  /* Frame level statistics */
>  typedef struct x265_frame_stats
>  {
> @@ -124,6 +134,7 @@
>      int              list0POC[16];
>      int              list1POC[16];
>      char             sliceType;
> +    x265_cu_stats    cuStats;
>  } x265_frame_stats;

this is a binary change to the public API, needs a build bump

>  /* Used to pass pictures into the encoder, and to get picture data back out of
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho