[x265] [PATCH] search: add compile-time optional detailed CU stats

Steve Borho steve at borho.org
Sun Feb 1 03:01:07 CET 2015


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1422741420 21600
#      Sat Jan 31 15:57:00 2015 -0600
# Node ID 7e594f585296cbcbc30dc78dbe257322355f6bc7
# Parent  0e685522e4d474b490b25a3449205e08afc7df5d
search: add compile-time optional detailed CU stats

By keeping accumulators per worker thread per frame encoder, we can avoid the
use of atomics and simply accumulate the results at the end of each frame. The
calls to x265_mdate() and accumulation work is still measurable and so all of
this work is only performed if the flag is enabled in search.h. It is not made
into a cmake option since only codec hackers will be interested in this data.

The logging output looks like:

x265 [info]: CU: 11200 64X64 CTUs in 11.665 seconds, 960.135 CTUs per second
x265 [info]: CU: %35.76 time spend Motion Estimation in 112522 calls
x265 [info]: CU: %08.78 time spend Intra Analysis in 83544 calls
x265 [info]: CU: %27.04 time spend Inter RDO in 21.674 calls per CTU
x265 [info]: CU: %22.15 time spend Intra RDO in 25.792 calls per CTU
x265 [info]: CU: 112522 PMode Masters blocked an average of 22.154 ns
x265 [info]: CU: 181155 PMode Tasks took average of 0.036 ms

diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/analysis.cpp	Sat Jan 31 15:57:00 2015 -0600
@@ -143,6 +143,8 @@
         }
     }
 
+    ProfileCUScope(ctu, totalCTUTime, totalCTUs);
+
     uint32_t zOrder = 0;
     if (m_slice->m_sliceType == I_SLICE)
     {
@@ -328,8 +330,11 @@
         int id = m_numAcquiredJobs++;
         m_pmodeLock.release();
 
-        ProfileScopeEvent(pmode);
-        parallelModeAnalysis(threadId, id);
+        {
+            ProfileScopeEvent(pmode);
+            ProfileCUScope(m_modeDepth[m_curGeom->depth].pred[PRED_2Nx2N].cu, pmodeTime, countPModeTasks);
+            parallelModeAnalysis(threadId, id);
+        }
 
         m_pmodeLock.acquire();
         if (++m_numCompletedJobs == m_totalNumJobs)
@@ -346,8 +351,11 @@
         int id = m_numAcquiredME++;
         m_meLock.release();
 
-        ProfileScopeEvent(pme);
-        parallelME(threadId, id);
+        {
+            ProfileScopeEvent(pme);
+            ProfileCUScope(m_curInterMode->cu, pmeTime, countPMETasks);
+            parallelME(threadId, id);
+        }
 
         m_meLock.acquire();
         if (++m_numCompletedME == m_totalNumME)
@@ -576,7 +584,10 @@
         {
             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-            m_modeCompletionEvent.wait();
+            {
+                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
+                m_modeCompletionEvent.wait();
+            }
 
             /* select best inter mode based on sa8d cost */
             Mode *bestInter = &md.pred[PRED_2Nx2N];
@@ -654,7 +665,10 @@
         else
         {
             checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-            m_modeCompletionEvent.wait();
+            {
+                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
+                m_modeCompletionEvent.wait();
+            }
 
             checkBestMode(md.pred[PRED_2Nx2N], depth);
             checkBestMode(md.pred[PRED_BIDIR], depth);
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/encoder.cpp	Sat Jan 31 15:57:00 2015 -0600
@@ -36,6 +36,7 @@
 #include "ratecontrol.h"
 #include "dpb.h"
 #include "nal.h"
+#include "nal.h"
 
 #include "x265.h"
 
@@ -810,6 +811,51 @@
         x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits);
     }
 
+
+#if DETAILED_CU_STATS
+    /* Summarize stats from all frame encoders */
+    CUStats cuStats;
+    for (int i = 0; i < m_param->frameNumThreads; i++)
+        cuStats.accumulate(m_frameEncoder[i].m_cuStats);
+
+#define ELAPSED_SEC(val)  ((double)(val) / 1000000)
+#define ELAPSED_MSEC(val) ((double)(val) / 1000)
+
+    x265_log(m_param, X265_LOG_INFO, "CU: " X265_LL " %dX%d CTUs in %.3lf seconds, %.3lf CTUs per second\n",
+            cuStats.totalCTUs, g_maxCUSize, g_maxCUSize,
+            ELAPSED_SEC(cuStats.totalCTUTime),
+            cuStats.totalCTUs / ELAPSED_SEC(cuStats.totalCTUTime));
+    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Motion Estimation in " X265_LL " calls\n",
+            100.0 * cuStats.motionEstimationElapsedTime / cuStats.totalCTUTime,
+            cuStats.countMotionEstimate);
+    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Intra Analysis in " X265_LL " calls\n",
+            100.0 * cuStats.intraAnalysisElapsedTime / cuStats.totalCTUTime,
+            cuStats.countIntraAnalysis);
+    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Inter RDO in %.3lf calls per CTU\n",
+            100.0 * cuStats.interRDOElapsedTime / cuStats.totalCTUTime,
+            (double)cuStats.countInterRDO / cuStats.totalCTUs);
+    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Intra RDO in %.3lf calls per CTU\n",
+            100.0 * cuStats.intraRDOElapsedTime / cuStats.totalCTUTime,
+            (double)cuStats.countIntraRDO / cuStats.totalCTUs);
+    if (m_param->bDistributeModeAnalysis && cuStats.countPModeMasters)
+    {
+        x265_log(m_param, X265_LOG_INFO, "CU: %d PMode Masters blocked an average of %.3lf ns\n",
+                cuStats.countPModeMasters, cuStats.pmodeBlockTime / cuStats.countPModeMasters);
+        x265_log(m_param, X265_LOG_INFO, "CU: %d PMode Tasks took average of %.3lfms\n",
+                cuStats.countPModeTasks, ELAPSED_MSEC(cuStats.pmodeTime) / cuStats.countPModeTasks);
+    }
+    if (m_param->bDistributeMotionEstimation && cuStats.countPMEMasters)
+    {
+        x265_log(m_param, X265_LOG_INFO, "CU: %d PME Masters blocked an average of %.3lf ns\n",
+                cuStats.countPMEMasters, cuStats.pmeBlockTime / cuStats.countPMEMasters);
+        x265_log(m_param, X265_LOG_INFO, "CU: %d PME Tasks took average of %.3lfms\n",
+                cuStats.countPMETasks, ELAPSED_MSEC(cuStats.pmeTime) / cuStats.countPMETasks);
+    }
+
+#undef ELAPSED_SEC
+#undef ELAPSED_MSEC
+#endif
+
     if (!m_param->bLogCuStats)
         return;
 
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/frameencoder.cpp	Sat Jan 31 15:57:00 2015 -0600
@@ -524,6 +524,13 @@
         }
     }
 
+#if DETAILED_CU_STATS
+    /* Accumulate CU statistics from each worker thread, we could report
+     * per-frame stats here, but currently we do not. */
+    for (int i = 0; i < m_top->m_numThreadLocalData; i++)
+        m_cuStats.accumulate(m_top->m_threadLocalData[i].analysis.m_stats[m_frameEncoderID]);
+#endif
+
     m_endFrameTime = x265_mdate();
 }
 
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/frameencoder.h	Sat Jan 31 15:57:00 2015 -0600
@@ -177,6 +177,9 @@
     int64_t                  m_slicetypeWaitTime;        // total elapsed time waiting for decided frame
     int64_t                  m_totalWorkerElapsedTime;   // total elapsed time spent by worker threads processing CTUs
     int64_t                  m_totalNoWorkerTime;        // total elapsed time without any active worker threads
+#if DETAILED_CU_STATS
+    CUStats                  m_cuStats;
+#endif
 
     Encoder*                 m_top;
     x265_param*              m_param;
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/search.cpp	Sat Jan 31 15:57:00 2015 -0600
@@ -1141,6 +1141,8 @@
     uint32_t tuDepthRange[2];
     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
+    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
+
     intraMode.initCosts();
     intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
     intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
@@ -1177,6 +1179,8 @@
  * be generated later. It records the best mode in the cu */
 void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
 {
+    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
+
     CUData& cu = intraMode.cu;
     uint32_t depth = cuGeom.depth;
 
@@ -1346,6 +1350,8 @@
 
 void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
 {
+    ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
     CUData& cu = intraMode.cu;
     Yuv* reconYuv = &intraMode.reconYuv;
 
@@ -1542,6 +1548,9 @@
             {
                 if (candCostList[i] == MAX_INT64)
                     break;
+
+                ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
                 m_entropyCoder.load(m_rqt[depth].cur);
                 cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
 
@@ -1554,6 +1563,8 @@
             }
         }
 
+        ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
         /* remeasure best mode, allowing TU splits */
         cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
         m_entropyCoder.load(m_rqt[depth].cur);
@@ -1656,6 +1667,8 @@
 
 uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
 {
+    ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
     CUData& cu = intraMode.cu;
     Yuv& reconYuv = intraMode.reconYuv;
 
@@ -1908,6 +1921,8 @@
  * returns true if predYuv was filled with a motion compensated prediction */
 bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
 {
+    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
+
     CUData& cu = interMode.cu;
     Yuv* predYuv = &interMode.predYuv;
 
@@ -2086,6 +2101,7 @@
                 m_meCompletionEvent.trigger();
             m_meLock.release();
 
+            ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
             m_meCompletionEvent.wait();
         }
         else
@@ -2481,6 +2497,8 @@
  * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
 void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
 {
+    ProfileCUScope(interMode.cu, interRDOElapsedTime, countInterRDO);
+
     CUData& cu = interMode.cu;
     Yuv* reconYuv = &interMode.reconYuv;
     Yuv* predYuv = &interMode.predYuv;
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/search.h
--- a/source/encoder/search.h	Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/search.h	Sat Jan 31 15:57:00 2015 -0600
@@ -28,6 +28,7 @@
 #include "predict.h"
 #include "quant.h"
 #include "bitcost.h"
+#include "framedata.h"
 #include "yuv.h"
 #include "threadpool.h"
 
@@ -35,6 +36,17 @@
 #include "entropy.h"
 #include "motion.h"
 
+#define DETAILED_CU_STATS 1
+#if DETAILED_CU_STATS
+#define ProfileCUScopeNamed(name, cu, acc, count) \
+    m_stats[cu.m_encData->m_frameEncoderID].count++; \
+    ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc)
+#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
+#else
+#define ProfileCUScopeNamed(name, cu, acc, count)
+#define ProfileCUScope(cu, acc, count)
+#endif
+
 namespace x265 {
 // private namespace
 
@@ -122,6 +134,68 @@
     }
 };
 
+#if DETAILED_CU_STATS
+/* This structure is intended for performance debugging and we make no attempt
+ * to handle dynamic range overflows. Care should be taken to avoid long encodes
+ * if you care about the accuracy of these elapsed times and counters. This
+ * profiling is orthoganal to PPA/VTune and can be enabled indepedently from
+ * either of them */
+struct CUStats
+{
+    int64_t  intraRDOElapsedTime;
+    int64_t  interRDOElapsedTime;
+    int64_t  intraAnalysisElapsedTime;    /* in RD > 4, includes RDO cost */
+    int64_t  motionEstimationElapsedTime;
+    int64_t  pmeTime;
+    int64_t  pmeBlockTime;
+    int64_t  pmodeTime;
+    int64_t  pmodeBlockTime;
+    int64_t  totalCTUTime;
+
+    uint64_t countIntraRDO;
+    uint64_t countInterRDO;
+    uint64_t countIntraAnalysis;
+    uint64_t countMotionEstimate;
+    uint64_t countPMETasks;
+    uint64_t countPMEMasters;
+    uint64_t countPModeTasks;
+    uint64_t countPModeMasters;
+    uint64_t totalCTUs;
+
+    CUStats() { clear(); }
+
+    void clear()
+    {
+        memset(this, 0, sizeof(*this));
+    }
+
+    void accumulate(CUStats& other)
+    {
+        intraRDOElapsedTime += other.intraRDOElapsedTime;
+        interRDOElapsedTime += other.interRDOElapsedTime;
+        intraAnalysisElapsedTime += other.intraAnalysisElapsedTime;
+        motionEstimationElapsedTime += other.motionEstimationElapsedTime;
+        pmeTime += other.pmeTime;
+        pmeBlockTime += other.pmeBlockTime;
+        pmodeTime += other.pmodeTime;
+        pmodeBlockTime += other.pmodeBlockTime;
+        totalCTUTime += other.totalCTUTime;
+
+        countIntraRDO += other.countIntraRDO;
+        countInterRDO += other.countInterRDO;
+        countIntraAnalysis += other.countIntraAnalysis;
+        countMotionEstimate += other.countMotionEstimate;
+        countPMETasks += other.countPMETasks;
+        countPMEMasters += other.countPMEMasters;
+        countPModeTasks += other.countPModeTasks;
+        countPModeMasters += other.countPModeMasters;
+        totalCTUs += other.totalCTUs;
+
+        other.clear();
+    }
+}; 
+#endif
+
 inline int getTUBits(int idx, int numIdx)
 {
     return idx + (idx < numIdx - 1);
@@ -151,6 +225,11 @@
     uint32_t        m_numLayers;
     uint32_t        m_refLagPixels;
 
+#if DETAILED_CU_STATS
+    /* Accumulate CU statistics seperately for each frame encoder */
+    CUStats         m_stats[X265_MAX_FRAME_THREADS];
+#endif
+
     Search();
     ~Search();
 


More information about the x265-devel mailing list