[x265] [PATCH] search: add compile-time optional detailed CU stats
Steve Borho
steve at borho.org
Sun Feb 1 03:01:07 CET 2015
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1422741420 21600
# Sat Jan 31 15:57:00 2015 -0600
# Node ID 7e594f585296cbcbc30dc78dbe257322355f6bc7
# Parent 0e685522e4d474b490b25a3449205e08afc7df5d
search: add compile-time optional detailed CU stats
By keeping accumulators per worker thread per frame encoder, we can avoid the
use of atomics and simply accumulate the results at the end of each frame. The
calls to x265_mdate() and accumulation work is still measurable and so all of
this work is only performed if the flag is enabled in search.h. It is not made
into a cmake option since only codec hackers will be interested in this data.
The logging output looks like:
x265 [info]: CU: 11200 64X64 CTUs in 11.665 seconds, 960.135 CTUs per second
x265 [info]: CU: %35.76 time spend Motion Estimation in 112522 calls
x265 [info]: CU: %08.78 time spend Intra Analysis in 83544 calls
x265 [info]: CU: %27.04 time spend Inter RDO in 21.674 calls per CTU
x265 [info]: CU: %22.15 time spend Intra RDO in 25.792 calls per CTU
x265 [info]: CU: 112522 PMode Masters blocked an average of 22.154 ns
x265 [info]: CU: 181155 PMode Tasks took average of 0.036 ms
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/analysis.cpp Sat Jan 31 15:57:00 2015 -0600
@@ -143,6 +143,8 @@
}
}
+ ProfileCUScope(ctu, totalCTUTime, totalCTUs);
+
uint32_t zOrder = 0;
if (m_slice->m_sliceType == I_SLICE)
{
@@ -328,8 +330,11 @@
int id = m_numAcquiredJobs++;
m_pmodeLock.release();
- ProfileScopeEvent(pmode);
- parallelModeAnalysis(threadId, id);
+ {
+ ProfileScopeEvent(pmode);
+ ProfileCUScope(m_modeDepth[m_curGeom->depth].pred[PRED_2Nx2N].cu, pmodeTime, countPModeTasks);
+ parallelModeAnalysis(threadId, id);
+ }
m_pmodeLock.acquire();
if (++m_numCompletedJobs == m_totalNumJobs)
@@ -346,8 +351,11 @@
int id = m_numAcquiredME++;
m_meLock.release();
- ProfileScopeEvent(pme);
- parallelME(threadId, id);
+ {
+ ProfileScopeEvent(pme);
+ ProfileCUScope(m_curInterMode->cu, pmeTime, countPMETasks);
+ parallelME(threadId, id);
+ }
m_meLock.acquire();
if (++m_numCompletedME == m_totalNumME)
@@ -576,7 +584,10 @@
{
checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
- m_modeCompletionEvent.wait();
+ {
+ ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
+ m_modeCompletionEvent.wait();
+ }
/* select best inter mode based on sa8d cost */
Mode *bestInter = &md.pred[PRED_2Nx2N];
@@ -654,7 +665,10 @@
else
{
checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
- m_modeCompletionEvent.wait();
+ {
+ ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
+ m_modeCompletionEvent.wait();
+ }
checkBestMode(md.pred[PRED_2Nx2N], depth);
checkBestMode(md.pred[PRED_BIDIR], depth);
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/encoder.cpp Sat Jan 31 15:57:00 2015 -0600
@@ -36,6 +36,7 @@
#include "ratecontrol.h"
#include "dpb.h"
#include "nal.h"
+#include "nal.h"
#include "x265.h"
@@ -810,6 +811,51 @@
x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits);
}
+
+#if DETAILED_CU_STATS
+ /* Summarize stats from all frame encoders */
+ CUStats cuStats;
+ for (int i = 0; i < m_param->frameNumThreads; i++)
+ cuStats.accumulate(m_frameEncoder[i].m_cuStats);
+
+#define ELAPSED_SEC(val) ((double)(val) / 1000000)
+#define ELAPSED_MSEC(val) ((double)(val) / 1000)
+
+ x265_log(m_param, X265_LOG_INFO, "CU: " X265_LL " %dX%d CTUs in %.3lf seconds, %.3lf CTUs per second\n",
+ cuStats.totalCTUs, g_maxCUSize, g_maxCUSize,
+ ELAPSED_SEC(cuStats.totalCTUTime),
+ cuStats.totalCTUs / ELAPSED_SEC(cuStats.totalCTUTime));
+ x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Motion Estimation in " X265_LL " calls\n",
+ 100.0 * cuStats.motionEstimationElapsedTime / cuStats.totalCTUTime,
+ cuStats.countMotionEstimate);
+ x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Intra Analysis in " X265_LL " calls\n",
+ 100.0 * cuStats.intraAnalysisElapsedTime / cuStats.totalCTUTime,
+ cuStats.countIntraAnalysis);
+ x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Inter RDO in %.3lf calls per CTU\n",
+ 100.0 * cuStats.interRDOElapsedTime / cuStats.totalCTUTime,
+ (double)cuStats.countInterRDO / cuStats.totalCTUs);
+ x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spend Intra RDO in %.3lf calls per CTU\n",
+ 100.0 * cuStats.intraRDOElapsedTime / cuStats.totalCTUTime,
+ (double)cuStats.countIntraRDO / cuStats.totalCTUs);
+ if (m_param->bDistributeModeAnalysis && cuStats.countPModeMasters)
+ {
+ x265_log(m_param, X265_LOG_INFO, "CU: %d PMode Masters blocked an average of %.3lf ns\n",
+ cuStats.countPModeMasters, cuStats.pmodeBlockTime / cuStats.countPModeMasters);
+ x265_log(m_param, X265_LOG_INFO, "CU: %d PMode Tasks took average of %.3lfms\n",
+ cuStats.countPModeTasks, ELAPSED_MSEC(cuStats.pmodeTime) / cuStats.countPModeTasks);
+ }
+ if (m_param->bDistributeMotionEstimation && cuStats.countPMEMasters)
+ {
+ x265_log(m_param, X265_LOG_INFO, "CU: %d PME Masters blocked an average of %.3lf ns\n",
+ cuStats.countPMEMasters, cuStats.pmeBlockTime / cuStats.countPMEMasters);
+ x265_log(m_param, X265_LOG_INFO, "CU: %d PME Tasks took average of %.3lfms\n",
+ cuStats.countPMETasks, ELAPSED_MSEC(cuStats.pmeTime) / cuStats.countPMETasks);
+ }
+
+#undef ELAPSED_SEC
+#undef ELAPSED_MSEC
+#endif
+
if (!m_param->bLogCuStats)
return;
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/frameencoder.cpp Sat Jan 31 15:57:00 2015 -0600
@@ -524,6 +524,13 @@
}
}
+#if DETAILED_CU_STATS
+ /* Accumulate CU statistics from each worker thread, we could report
+ * per-frame stats here, but currently we do not. */
+ for (int i = 0; i < m_top->m_numThreadLocalData; i++)
+ m_cuStats.accumulate(m_top->m_threadLocalData[i].analysis.m_stats[m_frameEncoderID]);
+#endif
+
m_endFrameTime = x265_mdate();
}
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/frameencoder.h Sat Jan 31 15:57:00 2015 -0600
@@ -177,6 +177,9 @@
int64_t m_slicetypeWaitTime; // total elapsed time waiting for decided frame
int64_t m_totalWorkerElapsedTime; // total elapsed time spent by worker threads processing CTUs
int64_t m_totalNoWorkerTime; // total elapsed time without any active worker threads
+#if DETAILED_CU_STATS
+ CUStats m_cuStats;
+#endif
Encoder* m_top;
x265_param* m_param;
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/search.cpp
--- a/source/encoder/search.cpp Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/search.cpp Sat Jan 31 15:57:00 2015 -0600
@@ -1141,6 +1141,8 @@
uint32_t tuDepthRange[2];
cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+ ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
+
intraMode.initCosts();
intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
@@ -1177,6 +1179,8 @@
* be generated later. It records the best mode in the cu */
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
+ ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
+
CUData& cu = intraMode.cu;
uint32_t depth = cuGeom.depth;
@@ -1346,6 +1350,8 @@
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
+ ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
@@ -1542,6 +1548,9 @@
{
if (candCostList[i] == MAX_INT64)
break;
+
+ ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
m_entropyCoder.load(m_rqt[depth].cur);
cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
@@ -1554,6 +1563,8 @@
}
}
+ ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
/* remeasure best mode, allowing TU splits */
cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
@@ -1656,6 +1667,8 @@
uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
{
+ ProfileCUScope(intraMode.cu, intraRDOElapsedTime, countIntraRDO);
+
CUData& cu = intraMode.cu;
Yuv& reconYuv = intraMode.reconYuv;
@@ -1908,6 +1921,8 @@
* returns true if predYuv was filled with a motion compensated prediction */
bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
{
+ ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
+
CUData& cu = interMode.cu;
Yuv* predYuv = &interMode.predYuv;
@@ -2086,6 +2101,7 @@
m_meCompletionEvent.trigger();
m_meLock.release();
+ ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
m_meCompletionEvent.wait();
}
else
@@ -2481,6 +2497,8 @@
* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
{
+ ProfileCUScope(interMode.cu, interRDOElapsedTime, countInterRDO);
+
CUData& cu = interMode.cu;
Yuv* reconYuv = &interMode.reconYuv;
Yuv* predYuv = &interMode.predYuv;
diff -r 0e685522e4d4 -r 7e594f585296 source/encoder/search.h
--- a/source/encoder/search.h Sat Jan 31 16:03:57 2015 -0600
+++ b/source/encoder/search.h Sat Jan 31 15:57:00 2015 -0600
@@ -28,6 +28,7 @@
#include "predict.h"
#include "quant.h"
#include "bitcost.h"
+#include "framedata.h"
#include "yuv.h"
#include "threadpool.h"
@@ -35,6 +36,17 @@
#include "entropy.h"
#include "motion.h"
+#define DETAILED_CU_STATS 1
+#if DETAILED_CU_STATS
+#define ProfileCUScopeNamed(name, cu, acc, count) \
+ m_stats[cu.m_encData->m_frameEncoderID].count++; \
+ ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc)
+#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
+#else
+#define ProfileCUScopeNamed(name, cu, acc, count)
+#define ProfileCUScope(cu, acc, count)
+#endif
+
namespace x265 {
// private namespace
@@ -122,6 +134,68 @@
}
};
+#if DETAILED_CU_STATS
+/* This structure is intended for performance debugging and we make no attempt
+ * to handle dynamic range overflows. Care should be taken to avoid long encodes
+ * if you care about the accuracy of these elapsed times and counters. This
+ * profiling is orthoganal to PPA/VTune and can be enabled indepedently from
+ * either of them */
+struct CUStats
+{
+ int64_t intraRDOElapsedTime;
+ int64_t interRDOElapsedTime;
+ int64_t intraAnalysisElapsedTime; /* in RD > 4, includes RDO cost */
+ int64_t motionEstimationElapsedTime;
+ int64_t pmeTime;
+ int64_t pmeBlockTime;
+ int64_t pmodeTime;
+ int64_t pmodeBlockTime;
+ int64_t totalCTUTime;
+
+ uint64_t countIntraRDO;
+ uint64_t countInterRDO;
+ uint64_t countIntraAnalysis;
+ uint64_t countMotionEstimate;
+ uint64_t countPMETasks;
+ uint64_t countPMEMasters;
+ uint64_t countPModeTasks;
+ uint64_t countPModeMasters;
+ uint64_t totalCTUs;
+
+ CUStats() { clear(); }
+
+ void clear()
+ {
+ memset(this, 0, sizeof(*this));
+ }
+
+ void accumulate(CUStats& other)
+ {
+ intraRDOElapsedTime += other.intraRDOElapsedTime;
+ interRDOElapsedTime += other.interRDOElapsedTime;
+ intraAnalysisElapsedTime += other.intraAnalysisElapsedTime;
+ motionEstimationElapsedTime += other.motionEstimationElapsedTime;
+ pmeTime += other.pmeTime;
+ pmeBlockTime += other.pmeBlockTime;
+ pmodeTime += other.pmodeTime;
+ pmodeBlockTime += other.pmodeBlockTime;
+ totalCTUTime += other.totalCTUTime;
+
+ countIntraRDO += other.countIntraRDO;
+ countInterRDO += other.countInterRDO;
+ countIntraAnalysis += other.countIntraAnalysis;
+ countMotionEstimate += other.countMotionEstimate;
+ countPMETasks += other.countPMETasks;
+ countPMEMasters += other.countPMEMasters;
+ countPModeTasks += other.countPModeTasks;
+ countPModeMasters += other.countPModeMasters;
+ totalCTUs += other.totalCTUs;
+
+ other.clear();
+ }
+};
+#endif
+
inline int getTUBits(int idx, int numIdx)
{
return idx + (idx < numIdx - 1);
@@ -151,6 +225,11 @@
uint32_t m_numLayers;
uint32_t m_refLagPixels;
+#if DETAILED_CU_STATS
+ /* Accumulate CU statistics seperately for each frame encoder */
+ CUStats m_stats[X265_MAX_FRAME_THREADS];
+#endif
+
Search();
~Search();
More information about the x265-devel
mailing list