[x265-commits] [x265] cli: improve CLI help for --pools

Steve Borho steve at borho.org
Tue Feb 24 04:12:36 CET 2015


details:   http://hg.videolan.org/x265/rev/0f19509127e2
branches:  
changeset: 9399:0f19509127e2
user:      Steve Borho <steve at borho.org>
date:      Mon Feb 23 18:08:49 2015 -0600
description:
cli: improve CLI help for --pools
Subject: [x265] slicetype: fine tuning for slicetypeDecide batch work

details:   http://hg.videolan.org/x265/rev/910e091d045a
branches:  
changeset: 9400:910e091d045a
user:      Steve Borho <steve at borho.org>
date:      Mon Feb 23 20:07:11 2015 -0600
description:
slicetype: fine tuning for slicetypeDecide batch work

At the start of the encode we have a full lookahead queue that all needs
analysis and slicetype planning and none of the frame encoders can start until
this work is finished, so we dedicate the entire thread pool towards this
effort.

But after this burst of activity, the lookahead is competing with the frame
encoders for worker thread attention, so it generally doesn't want to do any
batch work that might be unnecessary. So based on the thread pool size we
throttle down the work distribution in slicetypeDecide.

diffstat:

 source/encoder/slicetype.cpp |  27 ++++++++++++++++++++++++---
 source/encoder/slicetype.h   |   2 ++
 source/x265cli.h             |   3 ++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diffs (85 lines):

diff -r e69c49eb738e -r 910e091d045a source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Feb 23 18:03:30 2015 -0600
+++ b/source/encoder/slicetype.cpp	Mon Feb 23 20:07:11 2015 -0600
@@ -486,7 +486,23 @@ Lookahead::Lookahead(x265_param *param, 
     m_fullQueueSize = m_param->lookaheadDepth;
     m_bAdaptiveQuant = m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred;
 
-    if (m_pool && m_pool->m_numWorkers > 2)
+    /* If we have a thread pool and are using --b-adapt 2, it is generally
+     * preferable to perform all motion searches for each lowres frame in large
+     * batches; this will create one job per --bframe per lowres frame, and
+     * these jobs are performed by workers bonded to the thread running
+     * slicetypeDecide() */
+    m_bBatchMotionSearch = m_pool && m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS;
+
+    /* It is also beneficial to pre-calculate all possible frame cost estimates
+     * using worker threads bonded to the worker thread running
+     * slicetypeDecide(). This creates bframes * bframes jobs which take less
+     * time than the motion search batches but there are many of them. This may
+     * do much unnecessary work, some frame cost estimates are not needed, so if
+     * the thread pool is small we disable this feature after the initial burst
+     * of work */
+    m_bBatchFrameCosts = m_bBatchMotionSearch;
+
+    if (m_bBatchMotionSearch && m_pool->m_numWorkers > 12)
     {
         m_numRowsPerSlice = m_heightInCU / (m_pool->m_numWorkers - 1);   // default to numWorkers - 1 slices
         m_numRowsPerSlice = X265_MAX(m_numRowsPerSlice, 10);             // at least 10 rows per slice
@@ -1170,7 +1186,7 @@ void Lookahead::slicetypeAnalyse(Lowres 
         return;
     }
 
-    if (m_pool && m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS && numFrames > 1 && frames[1]->frameNum < m_param->lookaheadDepth)
+    if (m_bBatchMotionSearch)
     {
         /* pre-calculate all motion searches, using many worker threads */
         CostEstimateGroup estGroup(*this, frames);
@@ -1182,9 +1198,11 @@ void Lookahead::slicetypeAnalyse(Lowres 
                     estGroup.add(b - i, b + i < numFrames ? b + i : b, b);
             }
         }
+        /* auto-disable after the first batch if pool is small */
+        m_bBatchMotionSearch &= m_pool->m_numWorkers >= 4;
         estGroup.finishBatch();
 
-        if (m_pool->m_numWorkers >= 16)
+        if (m_bBatchFrameCosts)
         {
             /* pre-calculate all frame cost estimates, using many worker threads */
             for (int b = 2; b < numFrames; b++)
@@ -1201,6 +1219,9 @@ void Lookahead::slicetypeAnalyse(Lowres 
                     }
                 }
             }
+
+            /* auto-disable after the first batch if the pool is not large */
+            m_bBatchFrameCosts &= m_pool->m_numWorkers > 12;
             estGroup.finishBatch();
         }
     }
diff -r e69c49eb738e -r 910e091d045a source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Mon Feb 23 18:03:30 2015 -0600
+++ b/source/encoder/slicetype.h	Mon Feb 23 20:07:11 2015 -0600
@@ -112,6 +112,8 @@ public:
     bool          m_sliceTypeBusy;
     bool          m_bAdaptiveQuant;
     bool          m_outputSignalRequired;
+    bool          m_bBatchMotionSearch;
+    bool          m_bBatchFrameCosts;
     Lock          m_preLookaheadLock;
     Event         m_outputSignal;
 
diff -r e69c49eb738e -r 910e091d045a source/x265cli.h
--- a/source/x265cli.h	Mon Feb 23 18:03:30 2015 -0600
+++ b/source/x265cli.h	Mon Feb 23 20:07:11 2015 -0600
@@ -254,7 +254,8 @@ static void showHelp(x265_param *param)
     H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
     H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
     H0("\nThreading, performance:\n");
-    H0("   --threads <integer>           Number of threads for thread pool (0: detect CPU core count, default)\n");
+    H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
+    H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
     H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
     H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
     H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));


More information about the x265-commits mailing list