[x265] [PATCH] slicetype: use a worker thread for slicetypeDecide when it may help
Steve Borho
steve at borho.org
Tue Apr 15 04:55:58 CEST 2014
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1397353033 18000
# Sat Apr 12 20:37:13 2014 -0500
# Node ID a852dbf87a239ee20ac270148b48b2b5eb65feda
# Parent 08d64a70594ed31cd80046bd4a7e9fa52119be47
slicetype: use a worker thread for slicetypeDecide when it may help
If slicetype/scenecut analysis is enabled and the user has a thread pool of at
least 4 threads, use a worker thread to run slicetypeDecide.
Improves performance in presets that were bottlenecked by b-adapt 2 style
lookahead complexity.
diff -r 08d64a70594e -r a852dbf87a23 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Mon Apr 14 13:18:18 2014 -0500
+++ b/source/encoder/encoder.cpp Sat Apr 12 20:37:13 2014 -0500
@@ -439,12 +439,11 @@
ret = 1;
}
- if (!m_lookahead->outputQueue.empty())
+ // pop a single frame from decided list, then provide to frame encoder
+ // curEncoder is guaranteed to be idle at this point
+ TComPic* fenc = m_lookahead->getDecidedPicture();
+ if (fenc)
{
- // pop a single frame from decided list, then provide to frame encoder
- // curEncoder is guaranteed to be idle at this point
- TComPic *fenc = m_lookahead->outputQueue.popFront();
-
m_encodedFrameNum++;
if (m_bframeDelay)
{
diff -r 08d64a70594e -r a852dbf87a23 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Mon Apr 14 13:18:18 2014 -0500
+++ b/source/encoder/slicetype.cpp Sat Apr 12 20:37:13 2014 -0500
@@ -57,11 +57,14 @@
}
Lookahead::Lookahead(Encoder *_cfg, ThreadPool* pool)
- : est(pool)
+ : JobProvider(pool)
+ , est(pool)
{
param = _cfg->param;
lastKeyframe = -param->keyframeMax;
lastNonB = NULL;
+ bFilling = true;
+ bFlushed = false;
widthInCU = ((param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
heightInCU = ((param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
scratch = (int*)x265_malloc(widthInCU * sizeof(int));
@@ -70,10 +73,23 @@
Lookahead::~Lookahead() { }
-void Lookahead::init() { }
+void Lookahead::init() /* decides whether slice decisions may be handed to a pool worker thread */
+{
+ if (m_pool && m_pool->getThreadCount() >= 4 && /* need a pool reporting at least 4 threads */
+ ((param->bFrameAdaptive && param->bframes) || /* and at least one analysis feature enabled; same feature test that gates slicetypeAnalyse() in slicetypeDecide() */
+ param->rc.cuTree || param->scenecutThreshold ||
+ (param->lookaheadDepth && param->rc.vbvBufferSize)))
+ m_pool = m_pool; /* allow use of worker thread; NOTE(review): self-assignment is a no-op, the pool pointer is simply kept */
+ else
+ m_pool = NULL; /* disable use of worker thread; later `if (m_pool)` checks then take the synchronous path */
+}
void Lookahead::destroy()
{
+ if (m_pool)
+ // flush will dequeue, if it is necessary
+ JobProvider::flush();
+
// these two queues will be empty unless the encode was aborted
while (!inputQueue.empty())
{
@@ -92,26 +108,103 @@
x265_free(scratch);
}
+/* Called by API thread. Queues pic on inputQueue and, once the queue holds lookaheadDepth pictures, triggers a slice-type decision */
void Lookahead::addPicture(TComPic *pic, int sliceType)
{
TComPicYuv *orig = pic->getPicYuvOrg();
pic->m_lowres.init(orig, pic->getSlice()->getPOC(), sliceType);
+
+ inputQueueLock.acquire(); /* held while inputQueue is modified; released on every path below */
inputQueue.pushBack(*pic);
if (inputQueue.size() >= param->lookaheadDepth)
- slicetypeDecide();
+ {
+ /* when queue fills the first time, run slicetypeDecide synchronously,
+ * since the encoder will always be blocked here */
+ if (m_pool && !bFilling)
+ {
+ inputQueueLock.release(); /* findJob() re-acquires the lock on the worker thread */
+ bReady = 1; /* request flag that findJob() tests and clears */
+ m_pool->pokeIdleThread(); /* presumably wakes a worker so it calls findJob() - confirm against ThreadPool */
+ }
+ else
+ slicetypeDecide(); /* entered with inputQueueLock held; slicetypeDecide() releases it itself */
+
+ if (bFilling && m_pool)
+ JobProvider::enqueue(); /* only on the first fill; presumably registers this provider with the pool - confirm against JobProvider */
+ bFilling = false;
+ }
+ else
+ inputQueueLock.release(); /* queue not yet full: nothing to decide */
}
+/* Called by API thread. Decides any queued pictures on the calling thread and sets bFlushed once inputQueue is empty */
void Lookahead::flush()
{
+ /* flush synchronously */
+ inputQueueLock.acquire(); /* slicetypeDecide() must be entered with this lock held */
if (!inputQueue.empty())
+ {
slicetypeDecide();
+ }
+ else
+ inputQueueLock.release(); /* nothing to decide; on the other branch slicetypeDecide() releases the lock (per the visible hunks) */
+
+ /* just in case the input queue is never allowed to fill */
+ bFilling = false; /* lets getDecidedPicture() proceed past its bFilling early return */
+
+ inputQueueLock.acquire(); /* re-acquire to inspect inputQueue */
+
+ /* bFlushed indicates that an empty output queue actually means all frames
+ * have been decided (no more inputs for the encoder) */
+ if (inputQueue.empty())
+ bFlushed = true; /* ends the wait loop condition in getDecidedPicture() */
+ inputQueueLock.release();
}
-// Called by RateControl to get the estimated SATD cost for a given picture.
-// It assumes dpb->prepareEncode() has already been called for the picture and
-// all the references are established
+/* Called by API thread. If the lookahead queue has not yet been filled the
+ * first time, it immediately returns NULL. Else the function blocks until
+ * outputs are available and then pops the first frame from the output queue. If
+ * flush() has been called and the output queue is empty, NULL is returned. */
+TComPic* Lookahead::getDecidedPicture()
+{
+ outputQueueLock.acquire(); /* the same lock slicetypeDecide() takes before filling outputQueue */
+
+ if (bFilling) /* cleared by addPicture() on the first fill, or by flush() */
+ {
+ outputQueueLock.release();
+ return NULL;
+ }
+
+ while (outputQueue.empty() && !bFlushed)
+ {
+ outputQueueLock.release(); /* drop the lock while blocked so slicetypeDecide() can add frames */
+ outputAvailable.wait(); /* triggered at the end of slicetypeDecide() */
+ outputQueueLock.acquire(); /* re-check the loop condition under the lock */
+ }
+
+ TComPic *fenc = outputQueue.popFront(); /* presumably NULL when the queue is empty after a flush, as the header comment states - confirm against the list implementation */
+ outputQueueLock.release();
+ return fenc;
+}
+
+/* Called by pool worker threads. Runs slicetypeDecide() if addPicture() has set bReady; returns true when a decision was run, false otherwise */
+bool Lookahead::findJob()
+{
+ if (bReady && ATOMIC_CAS(&bReady, 1, 0) == 1) /* plain read first, then swap 1 -> 0; presumably ATOMIC_CAS returns the previous value so only one worker claims the request - confirm the macro. NOTE(review): the constructor lines visible in this patch initialise bFilling and bFlushed but not bReady - confirm it is initialised elsewhere */
+ {
+ inputQueueLock.acquire(); /* slicetypeDecide() is entered with the lock held and releases it itself */
+ slicetypeDecide();
+ return true;
+ }
+ else
+ return false; /* no pending request */
+}
+
+/* Called by rate-control to get the estimated SATD cost for a given picture.
+ * It assumes dpb->prepareEncode() has already been called for the picture and
+ * all the references are established */
int64_t Lookahead::getEstimatedPictureCost(TComPic *pic)
{
Lowres *frames[X265_LOOKAHEAD_MAX];
@@ -212,32 +305,51 @@
return pic->m_lowres.satdCost;
}
+/* called by API thread or worker thread with inputQueueLock acquired */
void Lookahead::slicetypeDecide()
{
+ ScopedLock lock(decideLock);
+
Lowres *frames[X265_LOOKAHEAD_MAX];
TComPic *list[X265_LOOKAHEAD_MAX];
- TComPic *ipic = inputQueue.first();
- bool isKeyFrameAnalyse = (param->rc.cuTree || (param->rc.vbvBufferSize && param->lookaheadDepth));
+ int maxSearch = X265_MIN(param->lookaheadDepth, X265_LOOKAHEAD_MAX);
- if (!est.rows && ipic)
- est.init(param, ipic);
+ memset(frames, 0, sizeof(frames));
+ memset(list, 0, sizeof(list));
- if ((param->bFrameAdaptive && param->bframes) ||
- param->rc.cuTree || param->scenecutThreshold ||
- (param->lookaheadDepth && param->rc.vbvBufferSize))
+ {
+ TComPic *pic = inputQueue.first();
+ int j;
+ for (j = 0; j < param->bframes + 2; j++)
+ {
+ if (!pic) break;
+ list[j] = pic;
+ pic = pic->m_next;
+ }
+
+ pic = inputQueue.first();
+ frames[0] = lastNonB;
+ for (j = 0; j < maxSearch; j++)
+ {
+ if (!pic) break;
+ frames[j + 1] = &pic->m_lowres;
+ pic = pic->m_next;
+ }
+ maxSearch = j;
+ }
+
+ inputQueueLock.release();
+
+ if (!est.rows && list[0])
+ est.init(param, list[0]);
+
+ if (lastNonB &&
+ ((param->bFrameAdaptive && param->bframes) ||
+ param->rc.cuTree || param->scenecutThreshold ||
+ (param->lookaheadDepth && param->rc.vbvBufferSize)))
{
slicetypeAnalyse(frames, false);
}
- else
- frames[0] = lastNonB;
-
- int j;
- for (j = 0; ipic && j < param->bframes + 2; ipic = ipic->m_next)
- {
- list[j++] = ipic;
- }
-
- list[j] = NULL;
int bframes, brefs;
for (bframes = 0, brefs = 0;; bframes++)
@@ -365,6 +477,7 @@
}
}
+ inputQueueLock.acquire();
/* dequeue all frames from inputQueue that are about to be enqueued
* in the output queue. The order is important because TComPic can
* only be in one list at a time */
@@ -374,8 +487,11 @@
TComPic *pic;
pic = inputQueue.popFront();
pts[i] = pic->m_pts;
+ maxSearch--;
}
+ inputQueueLock.release();
+ outputQueueLock.acquire();
/* add non-B to output queue */
int idx = 0;
list[bframes]->m_reorderedPts = pts[idx++];
@@ -405,10 +521,25 @@
}
}
+ bool isKeyFrameAnalyse = (param->rc.cuTree || (param->rc.vbvBufferSize && param->lookaheadDepth));
if (isKeyFrameAnalyse && IS_X265_TYPE_I(lastNonB->sliceType))
{
+ inputQueueLock.acquire();
+ TComPic *pic = inputQueue.first();
+ frames[0] = lastNonB;
+ int j;
+ for (j = 0; j < maxSearch; j++)
+ {
+ frames[j + 1] = &pic->m_lowres;
+ pic = pic->m_next;
+ }
+ frames[j + 1] = NULL;
+ inputQueueLock.release();
slicetypeAnalyse(frames, true);
}
+
+ outputQueueLock.release();
+ outputAvailable.trigger();
}
void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
@@ -472,15 +603,12 @@
int resetStart;
bool bIsVbvLookahead = param->rc.vbvBufferSize && param->lookaheadDepth;
- if (!lastNonB)
- return;
-
- frames[0] = lastNonB;
- TComPic* pic = inputQueue.first();
- for (framecnt = 0; (framecnt < maxSearch) && pic && pic->m_lowres.sliceType == X265_TYPE_AUTO; framecnt++)
+ /* count undecided frames */
+ for (framecnt = 0; framecnt < maxSearch; framecnt++)
{
- frames[framecnt + 1] = &pic->m_lowres;
- pic = pic->m_next;
+ Lowres *fenc = frames[framecnt + 1];
+ if (!fenc || fenc->sliceType != X265_TYPE_AUTO)
+ break;
}
if (!framecnt)
diff -r 08d64a70594e -r a852dbf87a23 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Mon Apr 14 13:18:18 2014 -0500
+++ b/source/encoder/slicetype.h Sat Apr 12 20:37:13 2014 -0500
@@ -116,7 +116,7 @@
uint32_t weightCostLuma(Lowres **frames, int b, int p0, wpScalingParam *w);
};
-struct Lookahead
+struct Lookahead : public JobProvider
{
Lookahead(Encoder *, ThreadPool *pool);
~Lookahead();
@@ -138,11 +138,22 @@
void addPicture(TComPic*, int sliceType);
void flush();
+ TComPic* getDecidedPicture();
int64_t getEstimatedPictureCost(TComPic *pic);
protected:
+ Lock inputQueueLock;
+ Lock outputQueueLock;
+ Lock decideLock;
+ Event outputAvailable;
+ volatile int bReady;
+ volatile bool bFilling;
+ volatile bool bFlushed;
+
+ bool findJob();
+
/* called by addPicture() or flush() to trigger slice decisions */
void slicetypeDecide();
void slicetypeAnalyse(Lowres **frames, bool bKeyframe);
More information about the x265-devel
mailing list