[x265-commits] [x265] slicetype: use access macros for alloc and free

Steve Borho steve at borho.org
Fri Feb 6 19:21:00 CET 2015


details:   http://hg.videolan.org/x265/rev/a69806a7ab7d
branches:  
changeset: 9320:a69806a7ab7d
user:      Steve Borho <steve at borho.org>
date:      Fri Feb 06 11:26:04 2015 -0600
description:
slicetype: use access macros for alloc and free
Subject: [x265] stats: include time spent in slicetype decisions

details:   http://hg.videolan.org/x265/rev/5389e6d11567
branches:  
changeset: 9321:5389e6d11567
user:      Steve Borho <steve at borho.org>
date:      Fri Feb 06 11:25:38 2015 -0600
description:
stats: include time spent in slicetype decisions

The lookahead may decide not to use worker threads. When this happens we
often see more than 100% total utilization, which means the API thread is
ejecting a worker thread from the CPU for a time in order to run slicetypeDecide.

diffstat:

 source/encoder/encoder.cpp   |  11 ++++++++++-
 source/encoder/slicetype.cpp |  33 +++++++++++++++++++++++++++++----
 source/encoder/slicetype.h   |  17 ++++++++++++++++-
 3 files changed, 55 insertions(+), 6 deletions(-)

diffs (168 lines):

diff -r e51b19ab2319 -r 5389e6d11567 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Fri Feb 06 10:29:58 2015 -0600
+++ b/source/encoder/encoder.cpp	Fri Feb 06 11:25:38 2015 -0600
@@ -822,7 +822,11 @@ void Encoder::printSummary()
 #define ELAPSED_SEC(val)  ((double)(val) / 1000000)
 #define ELAPSED_MSEC(val) ((double)(val) / 1000)
 
-    int64_t totalWorkerTime = cuStats.totalCTUTime + cuStats.loopFilterElapsedTime + cuStats.pmodeTime + cuStats.pmeTime;
+    int64_t lookaheadWorkerTime = m_lookahead->m_slicetypeDecideElapsedTime;
+    if (m_lookahead->usingWorkerThreads())
+        lookaheadWorkerTime += m_lookahead->m_est.m_processRowElapsedTime;
+
+    int64_t totalWorkerTime = cuStats.totalCTUTime + cuStats.loopFilterElapsedTime + cuStats.pmodeTime + cuStats.pmeTime + lookaheadWorkerTime;
     int64_t elapsedEncodeTime = x265_mdate() - m_encodeStartTime;
 
     int64_t interRDOTotalTime = 0, intraRDOTotalTime = 0;
@@ -878,6 +882,11 @@ void Encoder::printSummary()
                  ELAPSED_MSEC(cuStats.pmodeTime) / cuStats.countPModeTasks);
     }
 
+    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in slicetypeDecide (avg %.3lfms) and lookahead row cost (avg %.3lfns)\n",
+             100.0 * lookaheadWorkerTime / totalWorkerTime,
+             ELAPSED_MSEC(m_lookahead->m_slicetypeDecideElapsedTime) / m_lookahead->m_countSlicetypeDecide,
+             (double)m_lookahead->m_est.m_processRowElapsedTime / m_lookahead->m_est.m_countProcessRow);
+
     x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in other tasks\n",
              100.0 * unaccounted / totalWorkerTime);
 
diff -r e51b19ab2319 -r 5389e6d11567 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Feb 06 10:29:58 2015 -0600
+++ b/source/encoder/slicetype.cpp	Fri Feb 06 11:25:38 2015 -0600
@@ -67,6 +67,12 @@ Lookahead::Lookahead(x265_param *param, 
     m_bFilled = false;
     m_bFlushed = false;
     m_bFlush = false;
+
+#if DETAILED_CU_STATS
+    m_slicetypeDecideElapsedTime = 0;
+    m_countSlicetypeDecide = 0;
+#endif
+
     m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
@@ -334,6 +340,10 @@ void Lookahead::getEstimatedPictureCost(
 void Lookahead::slicetypeDecide()
 {
     ProfileScopeEvent(slicetypeDecideEV);
+#if DETAILED_CU_STATS
+    ScopedElapsedTime filterPerfScope(m_slicetypeDecideElapsedTime);
+    m_countSlicetypeDecide++;
+#endif
 
     Lowres *frames[X265_LOOKAHEAD_MAX];
     Frame *list[X265_LOOKAHEAD_MAX];
@@ -1265,9 +1275,7 @@ CostEstimate::CostEstimate(ThreadPool *p
 CostEstimate::~CostEstimate()
 {
     for (int i = 0; i < 4; i++)
-    {
-        x265_free(m_wbuffer[i]);
-    }
+        X265_FREE(m_wbuffer[i]);
 
     delete[] m_rows;
 }
@@ -1278,6 +1286,11 @@ void CostEstimate::init(x265_param *_par
     m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
 
+#if DETAILED_CU_STATS
+    m_processRowElapsedTime = 0;
+    m_countProcessRow = 0;
+#endif
+
     m_rows = new EstimateRow[m_heightInCU];
     for (int i = 0; i < m_heightInCU; i++)
     {
@@ -1300,7 +1313,7 @@ void CostEstimate::init(x265_param *_par
         /* allocate weighted lowres buffers */
         for (int i = 0; i < 4; i++)
         {
-            m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines));
+            m_wbuffer[i] = X265_MALLOC(pixel, curFrame->m_lowres.lumaStride * m_paddedLines);
             m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
         }
 
@@ -1348,6 +1361,10 @@ int64_t CostEstimate::estimateFrameCost(
             if (!fenc->bIntraCalculated)
                 fenc->rowSatds[0][0][i] = 0;
             fenc->rowSatds[b - p0][p1 - b][i] = 0;
+#if DETAILED_CU_STATS
+            m_rows[i].m_processRowElapsedTime = 0;
+            m_rows[i].m_countProcessRow = 0;
+#endif
         }
 
         m_bFrameCompleted = false;
@@ -1374,6 +1391,10 @@ int64_t CostEstimate::estimateFrameCost(
         // Accumulate cost from each row
         for (int row = 0; row < m_heightInCU; row++)
         {
+#if DETAILED_CU_STATS
+            m_processRowElapsedTime += m_rows[row].m_processRowElapsedTime;
+            m_countProcessRow += m_rows[row].m_countProcessRow;
+#endif
             score += m_rows[row].m_costEst;
             fenc->costEst[0][0] += m_rows[row].m_costIntra;
             if (m_param->rc.aqMode)
@@ -1524,6 +1545,10 @@ void CostEstimate::weightsAnalyse(Lowres
 void CostEstimate::processRow(int row, int /*threadId*/)
 {
     ProfileScopeEvent(costEstimateRow);
+#if DETAILED_CU_STATS
+    ScopedElapsedTime filterPerfScope(m_processRowElapsedTime);
+    m_countProcessRow++;
+#endif
 
     int realrow = m_heightInCU - 1 - row;
     Lowres **frames = m_curframes;
diff -r e51b19ab2319 -r 5389e6d11567 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Fri Feb 06 10:29:58 2015 -0600
+++ b/source/encoder/slicetype.h	Fri Feb 06 11:25:38 2015 -0600
@@ -69,6 +69,11 @@ public:
     int                 m_widthInCU;
     int                 m_heightInCU;
 
+#if DETAILED_CU_STATS
+    int64_t             m_processRowElapsedTime;
+    uint64_t            m_countProcessRow;
+#endif
+
     EstimateRow()
     {
         m_me.setQP(X265_LOOKAHEAD_QP);
@@ -107,6 +112,11 @@ public:
     volatile bool    m_bFrameCompleted;
     int              m_curb, m_curp0, m_curp1;
 
+#if DETAILED_CU_STATS
+    int64_t          m_processRowElapsedTime;
+    uint64_t         m_countProcessRow;
+#endif
+
     void     processRow(int row, int threadId);
     int64_t  estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty);
 
@@ -138,6 +148,12 @@ public:
     int              m_lastKeyframe;
     int              m_histogram[X265_BFRAME_MAX + 1];
 
+#if DETAILED_CU_STATS
+    int64_t          m_slicetypeDecideElapsedTime;
+    uint64_t         m_countSlicetypeDecide;
+    bool             usingWorkerThreads() const { return !!m_pool; }
+#endif
+
     void addPicture(Frame*, int sliceType);
     void flush();
     void stop();
@@ -147,7 +163,6 @@ public:
 
 protected:
 
-
     Lock  m_inputQueueLock;
     Lock  m_outputQueueLock;
     Event m_outputAvailable;


More information about the x265-commits mailing list