[x265] [PATCH] slicetype: use a worker thread for slicetypeDecide when it may help

Steve Borho steve at borho.org
Tue Apr 15 04:55:58 CEST 2014


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1397353033 18000
#      Sat Apr 12 20:37:13 2014 -0500
# Node ID a852dbf87a239ee20ac270148b48b2b5eb65feda
# Parent  08d64a70594ed31cd80046bd4a7e9fa52119be47
slicetype: use a worker thread for slicetypeDecide when it may help

If slicetype/scenecut analysis is enabled and the user has a thread pool of at
least 4 threads, use a worker thread to run slicetypeDecide.

Improves performance in presets that were bottlenecked by b-adapt 2 style
lookahead complexity.

diff -r 08d64a70594e -r a852dbf87a23 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Mon Apr 14 13:18:18 2014 -0500
+++ b/source/encoder/encoder.cpp	Sat Apr 12 20:37:13 2014 -0500
@@ -439,12 +439,11 @@
         ret = 1;
     }
 
-    if (!m_lookahead->outputQueue.empty())
+    // pop a single frame from decided list, then provide to frame encoder
+    // curEncoder is guaranteed to be idle at this point
+    TComPic* fenc = m_lookahead->getDecidedPicture();
+    if (fenc)
     {
-        // pop a single frame from decided list, then provide to frame encoder
-        // curEncoder is guaranteed to be idle at this point
-        TComPic *fenc = m_lookahead->outputQueue.popFront();
-
         m_encodedFrameNum++;
         if (m_bframeDelay)
         {
diff -r 08d64a70594e -r a852dbf87a23 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Apr 14 13:18:18 2014 -0500
+++ b/source/encoder/slicetype.cpp	Sat Apr 12 20:37:13 2014 -0500
@@ -57,11 +57,14 @@
 }
 
 Lookahead::Lookahead(Encoder *_cfg, ThreadPool* pool)
-    : est(pool)
+    : JobProvider(pool)
+    , est(pool)
 {
     param = _cfg->param;
     lastKeyframe = -param->keyframeMax;
     lastNonB = NULL;
+    bFilling = true;
+    bFlushed = false;
     widthInCU = ((param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     heightInCU = ((param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     scratch = (int*)x265_malloc(widthInCU * sizeof(int));
@@ -70,10 +73,23 @@
 
 Lookahead::~Lookahead() { }
 
-void Lookahead::init() { }
+void Lookahead::init()
+{
+    if (m_pool && m_pool->getThreadCount() >= 4 &&
+        ((param->bFrameAdaptive && param->bframes) ||
+         param->rc.cuTree || param->scenecutThreshold ||
+         (param->lookaheadDepth && param->rc.vbvBufferSize)))
+        m_pool = m_pool; /* allow use of worker thread */
+    else
+        m_pool = NULL;   /* disable use of worker thread */
+}
 
 void Lookahead::destroy()
 {
+    if (m_pool)
+        // flush will dequeue, if it is necessary
+        JobProvider::flush();
+
     // these two queues will be empty unless the encode was aborted
     while (!inputQueue.empty())
     {
@@ -92,26 +108,103 @@
     x265_free(scratch);
 }
 
+/* Called by API thread */
 void Lookahead::addPicture(TComPic *pic, int sliceType)
 {
     TComPicYuv *orig = pic->getPicYuvOrg();
 
     pic->m_lowres.init(orig, pic->getSlice()->getPOC(), sliceType);
+
+    inputQueueLock.acquire();
     inputQueue.pushBack(*pic);
 
     if (inputQueue.size() >= param->lookaheadDepth)
-        slicetypeDecide();
+    {
+        /* when queue fills the first time, run slicetypeDecide synchronously,
+         * since the encoder will always be blocked here; later, poke a pool worker */
+        if (m_pool && !bFilling)
+        {
+            inputQueueLock.release();
+            bReady = 1;
+            m_pool->pokeIdleThread();
+        }
+        else
+            slicetypeDecide();
+
+        if (bFilling && m_pool)
+            JobProvider::enqueue();
+        bFilling = false;
+    }
+    else
+        inputQueueLock.release();
 }
 
+/* Called by API thread */
 void Lookahead::flush()
 {
+    /* flush synchronously */
+    inputQueueLock.acquire();
     if (!inputQueue.empty())
+    {
         slicetypeDecide();
+    }
+    else
+        inputQueueLock.release();
+
+    /* just in case the input queue is never allowed to fill */
+    bFilling = false;
+
+    inputQueueLock.acquire();
+
+    /* bFlushed indicates that an empty output queue actually means all frames
+     * have been decided (no more inputs for the encoder) */
+    if (inputQueue.empty())
+        bFlushed = true;
+    inputQueueLock.release();
 }
 
-// Called by RateControl to get the estimated SATD cost for a given picture.
-// It assumes dpb->prepareEncode() has already been called for the picture and
-// all the references are established
+/* Called by API thread. If the lookahead queue has not yet been filled the
+ * first time, it immediately returns NULL.  Else the function blocks until
+ * outputs are available and then pops the first frame from the output queue. If
+ * flush() has been called and the output queue is empty, NULL is returned. */
+TComPic* Lookahead::getDecidedPicture()
+{
+    outputQueueLock.acquire();
+
+    if (bFilling)
+    {
+        outputQueueLock.release();
+        return NULL;
+    }
+
+    while (outputQueue.empty() && !bFlushed)
+    {
+        outputQueueLock.release();
+        outputAvailable.wait();
+        outputQueueLock.acquire();
+    }
+
+    TComPic *fenc = outputQueue.popFront();
+    outputQueueLock.release();
+    return fenc;
+}
+
+/* Called by pool worker threads */
+bool Lookahead::findJob()
+{
+    if (bReady && ATOMIC_CAS(&bReady, 1, 0) == 1)
+    {
+        inputQueueLock.acquire();
+        slicetypeDecide();
+        return true;
+    }
+    else
+        return false;
+}
+
+/* Called by rate-control to get the estimated SATD cost for a given picture.
+ * It assumes dpb->prepareEncode() has already been called for the picture and
+ * all the references are established */
 int64_t Lookahead::getEstimatedPictureCost(TComPic *pic)
 {
     Lowres *frames[X265_LOOKAHEAD_MAX];
@@ -212,32 +305,51 @@
     return pic->m_lowres.satdCost;
 }
 
+/* called by API thread or worker thread with inputQueueLock acquired; this function releases the lock itself */
 void Lookahead::slicetypeDecide()
 {
+    ScopedLock lock(decideLock);
+
     Lowres *frames[X265_LOOKAHEAD_MAX];
     TComPic *list[X265_LOOKAHEAD_MAX];
-    TComPic *ipic = inputQueue.first();
-    bool isKeyFrameAnalyse = (param->rc.cuTree || (param->rc.vbvBufferSize && param->lookaheadDepth));
+    int maxSearch = X265_MIN(param->lookaheadDepth, X265_LOOKAHEAD_MAX);
 
-    if (!est.rows && ipic)
-        est.init(param, ipic);
+    memset(frames, 0, sizeof(frames));
+    memset(list, 0, sizeof(list));
 
-    if ((param->bFrameAdaptive && param->bframes) ||
-        param->rc.cuTree || param->scenecutThreshold ||
-        (param->lookaheadDepth && param->rc.vbvBufferSize))
+    {
+        TComPic *pic = inputQueue.first();
+        int j;
+        for (j = 0; j < param->bframes + 2; j++)
+        {
+            if (!pic) break;
+            list[j] = pic;
+            pic = pic->m_next;
+        }
+
+        pic = inputQueue.first();
+        frames[0] = lastNonB;
+        for (j = 0; j < maxSearch; j++)
+        {
+            if (!pic) break;
+            frames[j + 1] = &pic->m_lowres;
+            pic = pic->m_next;
+        }
+        maxSearch = j;
+    }
+
+    inputQueueLock.release();
+
+    if (!est.rows && list[0])
+        est.init(param, list[0]);
+
+    if (lastNonB &&
+        ((param->bFrameAdaptive && param->bframes) ||
+         param->rc.cuTree || param->scenecutThreshold ||
+         (param->lookaheadDepth && param->rc.vbvBufferSize)))
     {
         slicetypeAnalyse(frames, false);
     }
-    else
-        frames[0] = lastNonB;
-
-    int j;
-    for (j = 0; ipic && j < param->bframes + 2; ipic = ipic->m_next)
-    {
-        list[j++] = ipic;
-    }
-
-    list[j] = NULL;
 
     int bframes, brefs;
     for (bframes = 0, brefs = 0;; bframes++)
@@ -365,6 +477,7 @@
         }
     }
 
+    inputQueueLock.acquire();
     /* dequeue all frames from inputQueue that are about to be enqueued
      * in the output queue. The order is important because TComPic can
      * only be in one list at a time */
@@ -374,8 +487,11 @@
         TComPic *pic;
         pic = inputQueue.popFront();
         pts[i] = pic->m_pts;
+        maxSearch--;
     }
+    inputQueueLock.release();
 
+    outputQueueLock.acquire();
     /* add non-B to output queue */
     int idx = 0;
     list[bframes]->m_reorderedPts = pts[idx++];
@@ -405,10 +521,25 @@
         }
     }
 
+    bool isKeyFrameAnalyse = (param->rc.cuTree || (param->rc.vbvBufferSize && param->lookaheadDepth));
     if (isKeyFrameAnalyse && IS_X265_TYPE_I(lastNonB->sliceType))
     {
+        inputQueueLock.acquire();
+        TComPic *pic = inputQueue.first();
+        frames[0] = lastNonB;
+        int j;
+        for (j = 0; j < maxSearch; j++)
+        {
+            frames[j + 1] = &pic->m_lowres;
+            pic = pic->m_next;
+        }
+        frames[j + 1] = NULL;
+        inputQueueLock.release();
         slicetypeAnalyse(frames, true);
     }
+
+    outputQueueLock.release();
+    outputAvailable.trigger();
 }
 
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
@@ -472,15 +603,12 @@
     int resetStart;
     bool bIsVbvLookahead = param->rc.vbvBufferSize && param->lookaheadDepth;
 
-    if (!lastNonB)
-        return;
-
-    frames[0] = lastNonB;
-    TComPic* pic = inputQueue.first();
-    for (framecnt = 0; (framecnt < maxSearch) && pic && pic->m_lowres.sliceType == X265_TYPE_AUTO; framecnt++)
+    /* count undecided frames */
+    for (framecnt = 0; framecnt < maxSearch; framecnt++)
     {
-        frames[framecnt + 1] = &pic->m_lowres;
-        pic = pic->m_next;
+        Lowres *fenc = frames[framecnt + 1];
+        if (!fenc || fenc->sliceType != X265_TYPE_AUTO)
+            break;
     }
 
     if (!framecnt)
diff -r 08d64a70594e -r a852dbf87a23 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Mon Apr 14 13:18:18 2014 -0500
+++ b/source/encoder/slicetype.h	Sat Apr 12 20:37:13 2014 -0500
@@ -116,7 +116,7 @@
     uint32_t weightCostLuma(Lowres **frames, int b, int p0, wpScalingParam *w);
 };
 
-struct Lookahead
+struct Lookahead : public JobProvider
 {
     Lookahead(Encoder *, ThreadPool *pool);
     ~Lookahead();
@@ -138,11 +138,22 @@
 
     void addPicture(TComPic*, int sliceType);
     void flush();
+    TComPic* getDecidedPicture();
 
     int64_t getEstimatedPictureCost(TComPic *pic);
 
 protected:
 
+    Lock  inputQueueLock;
+    Lock  outputQueueLock;
+    Lock  decideLock;
+    Event outputAvailable;
+    volatile int  bReady;
+    volatile bool bFilling;
+    volatile bool bFlushed;
+
+    bool findJob();
+
     /* called by addPicture() or flush() to trigger slice decisions */
     void slicetypeDecide();
     void slicetypeAnalyse(Lowres **frames, bool bKeyframe);


More information about the x265-devel mailing list