[x265] [PATCH] Set up separate threadpool for lookahead
Aarthi Priya Thirumalai
aarthi at multicorewareinc.com
Thu Dec 15 10:02:28 CET 2016
# HG changeset patch
# User Aarthi Priya Thirumalai <aarthi at multicorewareinc.com>
# Date 1481274360 -19800
# Fri Dec 09 14:36:00 2016 +0530
# Node ID e8152da7aa0e03a27f01a2a95aedf914248fe577
# Parent 78e1e1354a25d287f17359ed489833e46bf177f1
Set up separate threadpool for lookahead
The user can allocate a specific number of threads for lookahead by specifying --lookahead-threads <val>,
creating a separate lookahead pool for these threads. This will improve performance when lookahead
becomes a bottleneck. The number of lookahead threads should ideally be less than half the total number
of available worker threads.
diff -r 78e1e1354a25 -r e8152da7aa0e doc/reST/cli.rst
--- a/doc/reST/cli.rst Fri Dec 09 10:45:12 2016 +0530
+++ b/doc/reST/cli.rst Fri Dec 09 14:36:00 2016 +0530
@@ -1227,8 +1227,18 @@
Default: 8 for ultrafast, superfast, faster, fast, medium
4 for slow, slower
disabled for veryslow, slower
+
+.. option:: --lookahead-threads <integer>
+ Use multiple worker threads dedicated to doing only lookahead instead of sharing
+ the worker threads with frame encoders. A dedicated lookahead threadpool is created with the
+ specified number of worker threads. This can range from 0 up to half the
+ hardware threads available for encoding. Using too many threads for lookahead can starve
+ the frame encoders of resources and can harm performance. Default is 0 - disabled; lookahead
+ shares worker threads with the frame encoders.
+ **Values:** 0 - disabled (default). Max - half of the available hardware threads.
+
.. option:: --b-adapt <integer>
Set the level of effort in determining B frame placement.
diff -r 78e1e1354a25 -r e8152da7aa0e source/CMakeLists.txt
--- a/source/CMakeLists.txt Fri Dec 09 10:45:12 2016 +0530
+++ b/source/CMakeLists.txt Fri Dec 09 14:36:00 2016 +0530
@@ -28,9 +28,8 @@
option(NATIVE_BUILD "Target the build CPU" OFF)
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
-
# X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 102)
+set(X265_BUILD 103)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r 78e1e1354a25 -r e8152da7aa0e source/common/param.cpp
--- a/source/common/param.cpp Fri Dec 09 10:45:12 2016 +0530
+++ b/source/common/param.cpp Fri Dec 09 14:36:00 2016 +0530
@@ -149,8 +149,8 @@
param->bBPyramid = 1;
param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
param->lookaheadSlices = 8;
+ param->lookaheadThreads = 0;
param->scenecutBias = 5.0;
-
/* Intra Coding Tools */
param->bEnableConstrainedIntra = 0;
param->bEnableStrongIntraSmoothing = 1;
@@ -919,7 +919,7 @@
OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
OPT("scenecut-bias") p->scenecutBias = atof(value);
-
+ OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
else
return X265_PARAM_BAD_NAME;
}
@@ -1412,6 +1412,7 @@
TOOLOPT(param->bEnableFastIntra, "fast-intra");
TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing");
TOOLVAL(param->lookaheadSlices, "lslices=%d");
+ TOOLVAL(param->lookaheadThreads, "lthreads=%d")
if (param->maxSlices > 1)
TOOLVAL(param->maxSlices, "slices=%d");
if (param->bEnableLoopFilter)
diff -r 78e1e1354a25 -r e8152da7aa0e source/common/threadpool.cpp
--- a/source/common/threadpool.cpp Fri Dec 09 10:45:12 2016 +0530
+++ b/source/common/threadpool.cpp Fri Dec 09 14:36:00 2016 +0530
@@ -244,8 +244,7 @@
return bondCount;
}
-
-ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools)
+ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved)
{
enum { MAX_NODE_NUM = 127 };
int cpusPerNode[MAX_NODE_NUM + 1];
@@ -397,17 +396,32 @@
x265_log(p, X265_LOG_DEBUG, "Reducing number of thread pools for frame thread count\n");
numPools = X265_MAX(p->frameNumThreads / 2, 1);
}
-
+ if (isThreadsReserved)
+ numPools = 1;
ThreadPool *pools = new ThreadPool[numPools];
if (pools)
{
- int maxProviders = (p->frameNumThreads + numPools - 1) / numPools + 1; /* +1 is Lookahead, always assigned to threadpool 0 */
+ int maxProviders = (p->frameNumThreads + numPools - 1) / numPools + !isThreadsReserved; /* +1 is Lookahead, always assigned to threadpool 0 */
int node = 0;
for (int i = 0; i < numPools; i++)
{
while (!threadsPerPool[node])
node++;
int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
+ int origNumThreads = numThreads;
+ if (p->lookaheadThreads > numThreads / 2)
+ {
+ p->lookaheadThreads = numThreads / 2;
+ x265_log(p, X265_LOG_DEBUG, "Setting lookahead threads to a maximum of half the total number of threads\n");
+ }
+ if (isThreadsReserved)
+ {
+ numThreads = p->lookaheadThreads;
+ maxProviders = 1;
+ }
+
+ else
+ numThreads -= p->lookaheadThreads;
if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
{
X265_FREE(pools);
@@ -425,7 +439,7 @@
}
else
x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
- threadsPerPool[node] -= numThreads;
+ threadsPerPool[node] -= origNumThreads;
}
}
else
diff -r 78e1e1354a25 -r e8152da7aa0e source/common/threadpool.h
--- a/source/common/threadpool.h Fri Dec 09 10:45:12 2016 +0530
+++ b/source/common/threadpool.h Fri Dec 09 14:36:00 2016 +0530
@@ -102,9 +102,7 @@
void setThreadNodeAffinity(void *numaMask);
int tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
int tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
-
- static ThreadPool* allocThreadPools(x265_param* p, int& numPools);
-
+ static ThreadPool* allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved);
static int getCpuCount();
static int getNumaNodeCount();
};
diff -r 78e1e1354a25 -r e8152da7aa0e source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Fri Dec 09 10:45:12 2016 +0530
+++ b/source/encoder/encoder.cpp Fri Dec 09 14:36:00 2016 +0530
@@ -128,11 +128,9 @@
else
p->frameNumThreads = 1;
}
-
m_numPools = 0;
if (allowPools)
- m_threadPool = ThreadPool::allocThreadPools(p, m_numPools);
-
+ m_threadPool = ThreadPool::allocThreadPools(p, m_numPools, 0);
if (!m_numPools)
{
// issue warnings if any of these features were requested
@@ -201,17 +199,26 @@
m_scalingList.setDefaultScalingList();
else if (m_scalingList.parseScalingList(m_param->scalingLists))
m_aborted = true;
-
- m_lookahead = new Lookahead(m_param, m_threadPool);
- if (m_numPools)
+ int pools = m_numPools;
+ ThreadPool* lookAheadThreadPool = 0;
+ if (m_param->lookaheadThreads > 0)
{
- m_lookahead->m_jpId = m_threadPool[0].m_numProviders++;
- m_threadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead;
+ lookAheadThreadPool = ThreadPool::allocThreadPools(p, pools, 1);
}
-
+ else
+ lookAheadThreadPool = m_threadPool;
+ m_lookahead = new Lookahead(m_param, lookAheadThreadPool);
+ if (pools)
+ {
+ m_lookahead->m_jpId = lookAheadThreadPool[0].m_numProviders++;
+ lookAheadThreadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead;
+ }
+ if (m_param->lookaheadThreads > 0)
+ for (int i = 0; i < pools; i++)
+ lookAheadThreadPool[i].start();
+ m_lookahead->m_numPools = pools;
m_dpb = new DPB(m_param);
m_rateControl = new RateControl(*m_param);
-
initVPS(&m_vps);
initSPS(&m_sps);
initPPS(&m_pps);
diff -r 78e1e1354a25 -r e8152da7aa0e source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Fri Dec 09 10:45:12 2016 +0530
+++ b/source/encoder/slicetype.cpp Fri Dec 09 14:36:00 2016 +0530
@@ -656,8 +656,12 @@
if (wait)
m_outputSignal.wait();
}
+ if (m_pool && m_param->lookaheadThreads > 0)
+ {
+ for (int i = 0; i < m_numPools; i++)
+ m_pool[i].stopWorkers();
+ }
}
-
void Lookahead::destroy()
{
// these two queues will be empty unless the encode was aborted
@@ -676,10 +680,10 @@
}
X265_FREE(m_scratch);
-
delete [] m_tld;
+ if (m_param->lookaheadThreads > 0)
+ delete [] m_pool;
}
-
/* The synchronization of slicetypeDecide is managed here. The findJob() method
* polls the occupancy of the input queue. If the queue is
* full, it will run slicetypeDecide() and output a mini-gop of frames to the
diff -r 78e1e1354a25 -r e8152da7aa0e source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Fri Dec 09 10:45:12 2016 +0530
+++ b/source/encoder/slicetype.h Fri Dec 09 14:36:00 2016 +0530
@@ -129,8 +129,8 @@
bool m_bBatchFrameCosts;
bool m_filled;
bool m_isSceneTransition;
+ int m_numPools;
Lookahead(x265_param *param, ThreadPool *pool);
-
#if DETAILED_CU_STATS
int64_t m_slicetypeDecideElapsedTime;
int64_t m_preLookaheadElapsedTime;
diff -r 78e1e1354a25 -r e8152da7aa0e source/x265.h
--- a/source/x265.h Fri Dec 09 10:45:12 2016 +0530
+++ b/source/x265.h Fri Dec 09 14:36:00 2016 +0530
@@ -1335,6 +1335,13 @@
* intra cost of a frame used in scenecut detection. Default 5. */
double scenecutBias;
+ /* Use multiple worker threads dedicated to doing only lookahead instead of sharing
+ * the worker threads with frame encoders. A dedicated lookahead threadpool is created with the
+ * specified number of worker threads. This can range from 0 up to half the
+ * hardware threads available for encoding. Using too many threads for lookahead can starve
+ * the frame encoders of resources and can harm performance. Default is 0 - disabled. */
+ int lookaheadThreads;
+
} x265_param;
/* x265_param_alloc:
diff -r 78e1e1354a25 -r e8152da7aa0e source/x265cli.h
--- a/source/x265cli.h Fri Dec 09 10:45:12 2016 +0530
+++ b/source/x265cli.h Fri Dec 09 14:36:00 2016 +0530
@@ -125,6 +125,7 @@
{ "intra-refresh", no_argument, NULL, 0 },
{ "rc-lookahead", required_argument, NULL, 0 },
{ "lookahead-slices", required_argument, NULL, 0 },
+ { "lookahead-threads", required_argument, NULL, 0 },
{ "bframes", required_argument, NULL, 'b' },
{ "bframe-bias", required_argument, NULL, 0 },
{ "b-adapt", required_argument, NULL, 0 },
@@ -372,6 +373,7 @@
H0(" --intra-refresh Use Periodic Intra Refresh instead of IDR frames\n");
H0(" --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
H1(" --lookahead-slices <0..16> Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
+ H0(" --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
H0(" --bframes <integer> Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
H1(" --bframe-bias <integer> Bias towards B frame decisions. Default %d\n", param->bFrameBias);
H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
More information about the x265-devel
mailing list