[x265] [PATCH] Set up separate threadpool for lookahead

Aarthi Priya Thirumalai aarthi at multicorewareinc.com
Thu Dec 15 10:02:28 CET 2016


# HG changeset patch
# User Aarthi Priya Thirumalai <aarthi at multicorewareinc.com>
# Date 1481274360 -19800
#      Fri Dec 09 14:36:00 2016 +0530
# Node ID e8152da7aa0e03a27f01a2a95aedf914248fe577
# Parent  78e1e1354a25d287f17359ed489833e46bf177f1
Set up separate threadpool for lookahead

The user can allocate a specific number of threads for lookahead by specifying --lookahead-threads <val>,
which creates a separate lookahead pool for these threads. This will improve performance when lookahead
becomes a bottleneck. The number of lookahead threads should ideally be no more than half the total number
of available worker threads.

diff -r 78e1e1354a25 -r e8152da7aa0e doc/reST/cli.rst
--- a/doc/reST/cli.rst	Fri Dec 09 10:45:12 2016 +0530
+++ b/doc/reST/cli.rst	Fri Dec 09 14:36:00 2016 +0530
@@ -1227,8 +1227,18 @@
     Default: 8 for ultrafast, superfast, faster, fast, medium
              4 for slow, slower
              disabled for veryslow, slower
+			 
+.. option:: --lookahead-threads <integer>
 
+    Use multiple worker threads dedicated to doing only lookahead instead of sharing
+    the worker threads with the frame encoders. A dedicated lookahead threadpool is created with the
+    specified number of worker threads. This can range from 0 up to half the
+    hardware threads available for encoding. Using too many threads for lookahead can starve
+    the frame encoders of resources and can harm performance. Default is 0 (disabled): lookahead
+    shares worker threads with the frame encoders.
 
+    **Values:** 0 - disabled (default). Max - half of the available hardware threads.
+	
 .. option:: --b-adapt <integer>
 
 	Set the level of effort in determining B frame placement.
diff -r 78e1e1354a25 -r e8152da7aa0e source/CMakeLists.txt
--- a/source/CMakeLists.txt	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/CMakeLists.txt	Fri Dec 09 14:36:00 2016 +0530
@@ -28,9 +28,8 @@
 option(NATIVE_BUILD "Target the build CPU" OFF)
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
-
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 102)
+set(X265_BUILD 103)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r 78e1e1354a25 -r e8152da7aa0e source/common/param.cpp
--- a/source/common/param.cpp	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/common/param.cpp	Fri Dec 09 14:36:00 2016 +0530
@@ -149,8 +149,8 @@
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
     param->lookaheadSlices = 8;
+    param->lookaheadThreads = 0;
     param->scenecutBias = 5.0;
-
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
     param->bEnableStrongIntraSmoothing = 1;
@@ -919,7 +919,7 @@
         OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
         OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
         OPT("scenecut-bias") p->scenecutBias = atof(value);
-
+        OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1412,6 +1412,7 @@
     TOOLOPT(param->bEnableFastIntra, "fast-intra");
     TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing");
     TOOLVAL(param->lookaheadSlices, "lslices=%d");
+    TOOLVAL(param->lookaheadThreads, "lthreads=%d")
     if (param->maxSlices > 1)
         TOOLVAL(param->maxSlices, "slices=%d");
     if (param->bEnableLoopFilter)
diff -r 78e1e1354a25 -r e8152da7aa0e source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/common/threadpool.cpp	Fri Dec 09 14:36:00 2016 +0530
@@ -244,8 +244,7 @@
 
     return bondCount;
 }
-
-ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools)
+ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved)
 {
     enum { MAX_NODE_NUM = 127 };
     int cpusPerNode[MAX_NODE_NUM + 1];
@@ -397,17 +396,32 @@
         x265_log(p, X265_LOG_DEBUG, "Reducing number of thread pools for frame thread count\n");
         numPools = X265_MAX(p->frameNumThreads / 2, 1);
     }
-
+    if (isThreadsReserved)
+        numPools = 1;
     ThreadPool *pools = new ThreadPool[numPools];
     if (pools)
     {
-        int maxProviders = (p->frameNumThreads + numPools - 1) / numPools + 1; /* +1 is Lookahead, always assigned to threadpool 0 */
+        int maxProviders = (p->frameNumThreads + numPools - 1) / numPools + !isThreadsReserved; /* +1 is Lookahead, always assigned to threadpool 0 */
         int node = 0;
         for (int i = 0; i < numPools; i++)
         {
             while (!threadsPerPool[node])
                 node++;
             int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
+            int origNumThreads = numThreads;
+            if (p->lookaheadThreads > numThreads / 2)
+            {
+                p->lookaheadThreads = numThreads / 2;
+                x265_log(p, X265_LOG_DEBUG, "Setting lookahead threads to a maximum of half the total number of threads\n");
+            }
+            if (isThreadsReserved)
+            {
+                numThreads = p->lookaheadThreads;
+                maxProviders = 1;
+            }
+
+            else
+                numThreads -= p->lookaheadThreads;
             if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
             {
                 X265_FREE(pools);
@@ -425,7 +439,7 @@
             }
             else
                 x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
-            threadsPerPool[node] -= numThreads;
+            threadsPerPool[node] -= origNumThreads;
         }
     }
     else
diff -r 78e1e1354a25 -r e8152da7aa0e source/common/threadpool.h
--- a/source/common/threadpool.h	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/common/threadpool.h	Fri Dec 09 14:36:00 2016 +0530
@@ -102,9 +102,7 @@
     void setThreadNodeAffinity(void *numaMask);
     int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
     int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
-
-    static ThreadPool* allocThreadPools(x265_param* p, int& numPools);
-
+    static ThreadPool* allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved);
     static int  getCpuCount();
     static int  getNumaNodeCount();
 };
diff -r 78e1e1354a25 -r e8152da7aa0e source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/encoder/encoder.cpp	Fri Dec 09 14:36:00 2016 +0530
@@ -128,11 +128,9 @@
         else
             p->frameNumThreads = 1;
     }
-
     m_numPools = 0;
     if (allowPools)
-        m_threadPool = ThreadPool::allocThreadPools(p, m_numPools);
-
+        m_threadPool = ThreadPool::allocThreadPools(p, m_numPools, 0);
     if (!m_numPools)
     {
         // issue warnings if any of these features were requested
@@ -201,17 +199,26 @@
         m_scalingList.setDefaultScalingList();
     else if (m_scalingList.parseScalingList(m_param->scalingLists))
         m_aborted = true;
-
-    m_lookahead = new Lookahead(m_param, m_threadPool);
-    if (m_numPools)
+    int pools = m_numPools;
+    ThreadPool* lookAheadThreadPool = 0;
+    if (m_param->lookaheadThreads > 0)
     {
-        m_lookahead->m_jpId = m_threadPool[0].m_numProviders++;
-        m_threadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead;
+        lookAheadThreadPool = ThreadPool::allocThreadPools(p, pools, 1);
     }
-
+    else
+        lookAheadThreadPool = m_threadPool;
+    m_lookahead = new Lookahead(m_param, lookAheadThreadPool);
+    if (pools)
+    {
+        m_lookahead->m_jpId = lookAheadThreadPool[0].m_numProviders++;
+        lookAheadThreadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead;
+    }
+    if (m_param->lookaheadThreads > 0)
+        for (int i = 0; i < pools; i++)
+            lookAheadThreadPool[i].start();
+    m_lookahead->m_numPools = pools;
     m_dpb = new DPB(m_param);
     m_rateControl = new RateControl(*m_param);
-
     initVPS(&m_vps);
     initSPS(&m_sps);
     initPPS(&m_pps);
diff -r 78e1e1354a25 -r e8152da7aa0e source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/encoder/slicetype.cpp	Fri Dec 09 14:36:00 2016 +0530
@@ -656,8 +656,12 @@
         if (wait)
             m_outputSignal.wait();
     }
+    if (m_pool && m_param->lookaheadThreads > 0)
+    {
+        for (int i = 0; i < m_numPools; i++)
+            m_pool[i].stopWorkers();
+    }
 }
-
 void Lookahead::destroy()
 {
     // these two queues will be empty unless the encode was aborted
@@ -676,10 +680,10 @@
     }
 
     X265_FREE(m_scratch);
-
     delete [] m_tld;
+    if (m_param->lookaheadThreads > 0)
+        delete [] m_pool;
 }
-
 /* The synchronization of slicetypeDecide is managed here.  The findJob() method
  * polls the occupancy of the input queue. If the queue is
  * full, it will run slicetypeDecide() and output a mini-gop of frames to the
diff -r 78e1e1354a25 -r e8152da7aa0e source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/encoder/slicetype.h	Fri Dec 09 14:36:00 2016 +0530
@@ -129,8 +129,8 @@
     bool          m_bBatchFrameCosts;
     bool          m_filled;
     bool          m_isSceneTransition;
+    int           m_numPools;
     Lookahead(x265_param *param, ThreadPool *pool);
-
 #if DETAILED_CU_STATS
     int64_t       m_slicetypeDecideElapsedTime;
     int64_t       m_preLookaheadElapsedTime;
diff -r 78e1e1354a25 -r e8152da7aa0e source/x265.h
--- a/source/x265.h	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/x265.h	Fri Dec 09 14:36:00 2016 +0530
@@ -1335,6 +1335,13 @@
     * intra cost of a frame used in scenecut detection. Default 5. */
     double     scenecutBias;
 
+    /* Use multiple worker threads dedicated to doing only lookahead instead of sharing
+    * the worker threads with the frame encoders. A dedicated lookahead threadpool is created with the
+    * specified number of worker threads. This can range from 0 up to half the
+    * hardware threads available for encoding. Using too many threads for lookahead can starve
+    * the frame encoders of resources and can harm performance. Default is 0 - disabled. */
+    int       lookaheadThreads;
+
 } x265_param;
 
 /* x265_param_alloc:
diff -r 78e1e1354a25 -r e8152da7aa0e source/x265cli.h
--- a/source/x265cli.h	Fri Dec 09 10:45:12 2016 +0530
+++ b/source/x265cli.h	Fri Dec 09 14:36:00 2016 +0530
@@ -125,6 +125,7 @@
     { "intra-refresh",        no_argument, NULL, 0 },
     { "rc-lookahead",   required_argument, NULL, 0 },
     { "lookahead-slices", required_argument, NULL, 0 },
+    { "lookahead-threads", required_argument, NULL, 0 },
     { "bframes",        required_argument, NULL, 'b' },
     { "bframe-bias",    required_argument, NULL, 0 },
     { "b-adapt",        required_argument, NULL, 0 },
@@ -372,6 +373,7 @@
     H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
+    H0("   --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
     H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
     H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);


More information about the x265-devel mailing list