[x265] [PATCH 2 of 2] Lookahead: implement wavefront parallel processing

deepthidevaki at multicorewareinc.com deepthidevaki at multicorewareinc.com
Fri Oct 18 14:51:18 CEST 2013


# HG changeset patch
# User Deepthi Devaki <deepthidevaki at multicorewareinc.com>
# Date 1382096456 -19800
# Node ID 38778e2c92e83824b48c2e22a696b7cd913b32f1
# Parent  721aa50b54be835593f602fe56e990187ffcee65
Lookahead: implement wavefront parallel processing

diff -r 721aa50b54be -r 38778e2c92e8 source/common/wavefront.h
--- a/source/common/wavefront.h	Fri Oct 18 17:01:32 2013 +0530
+++ b/source/common/wavefront.h	Fri Oct 18 17:10:56 2013 +0530
@@ -47,11 +47,6 @@
 
     int m_numRows;
 
-    // WaveFront's implementation of JobProvider::findJob. Consults
-    // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
-    // or returns false
-    bool findJob();
-
 public:
 
     WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {}
@@ -71,6 +66,11 @@
 
     void clearEnabledRowMask();
 
+    // WaveFront's implementation of JobProvider::findJob. Consults
+    // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
+    // or returns false
+    bool findJob();
+
     // Returns true if a row above curRow is available for processing.  The processRow()
     // method may call this function periodically and voluntarily exit
     bool checkHigherPriorityRow(int curRow);
diff -r 721aa50b54be -r 38778e2c92e8 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Fri Oct 18 17:01:32 2013 +0530
+++ b/source/encoder/encoder.cpp	Fri Oct 18 17:10:56 2013 +0530
@@ -91,7 +91,7 @@
             m_frameEncoder[i].setThreadPool(m_threadPool);
         }
     }
-    m_lookahead = new Lookahead(this);
+    m_lookahead = new Lookahead(this, m_threadPool);
     m_dpb = new DPB(this);
     m_rateControl = new RateControl(this);
 }
@@ -144,6 +144,7 @@
             m_frameEncoder[i].init(this, numRows);
         }
     }
+    m_lookahead->init();
 }
 
 int Encoder::getStreamHeaders(NALUnitEBSP **nalunits)
diff -r 721aa50b54be -r 38778e2c92e8 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Oct 18 17:01:32 2013 +0530
+++ b/source/encoder/slicetype.cpp	Fri Oct 18 17:10:56 2013 +0530
@@ -62,29 +62,43 @@
     dst.y = median(a.y, b.y, c.y);
 }
 
-Lookahead::Lookahead(TEncCfg *_cfg)
+Lookahead::Lookahead(TEncCfg *_cfg, ThreadPool* pool) : WaveFront(pool)
 {
     this->cfg = _cfg;
     numDecided = 0;
     lastKeyframe = -cfg->param.keyframeMax;
     lastNonB = NULL;
-    predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
-    me.setQP(X265_LOOKAHEAD_QP);
-    me.setSearchMethod(X265_HEX_SEARCH);
-    me.setSubpelRefine(1);
-    merange = 16;
     widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+
+    lhrows = new LookaheadRow[heightInCU];
+    for (int i = 0; i < heightInCU; i++)
+    {
+        lhrows[i].widthInCU = widthInCU;
+        lhrows[i].heightInCU = heightInCU;
+        lhrows[i].frames = frames;
+    }
 }
 
 Lookahead::~Lookahead()
 {
 }
 
+void Lookahead::init() // set up the WaveFront base class for heightInCU rows
+{
+    if (!WaveFront::init(heightInCU))
+    {
+        m_pool = NULL; // init returned false: estimateFrameCost() tests m_pool and then runs the rows serially
+    }
+    else
+    {
+        WaveFront::enableAllRows(); // estimateFrameCost() relies on rows already being enabled when it calls enqueueRow(0)
+    }
+}
+
 void Lookahead::destroy()
 {
-    if (predictions)
-        X265_FREE(predictions);
+    delete[] lhrows;
 
     // these two queues will be empty, unless the encode was aborted
     while (!inputQueue.empty())
@@ -165,9 +179,12 @@
 int Lookahead::estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty)
 {
     int score = 0;
-    bool bDoSearch[2];
     Lowres *fenc = frames[b];
 
+    curb = b;
+    curp0 = p0;
+    curp1 = p1;
+
     if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
         score = fenc->costEst[b - p0][p1 - b];
     else
@@ -181,27 +198,46 @@
 
         fenc->costEst[b - p0][p1 - b] = 0;
 
-        /* Lowres lookahead goes backwards because the MVs are used as
-         * predictors in the main encode.  This considerably improves MV
-         * prediction overall. */
         // TODO: use lowres MVs as motion candidates in full-res search
-        me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
-        for (int j = heightInCU - 1; j >= 0; j--)
+
+        for (int i = 0; i < heightInCU; i++)
         {
-            if (!fenc->bIntraCalculated)
-                fenc->rowSatds[0][0][j] = 0;
-            fenc->rowSatds[b - p0][p1 - b][j] = 0;
+            lhrows[i].init();
+            lhrows[i].me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
+        }
 
-            for (int i = widthInCU - 1; i >= 0; i--)
+        rowsCompleted = false;
+
+        if (m_pool)
+        {
+            WaveFront::enqueue();
+            // enableAllRows must be already called
+            enqueueRow(0);
+            while (!rowsCompleted)
             {
-                estimateCUCost(i, j, p0, p1, b, bDoSearch);
+                WaveFront::findJob();
+            }
+
+            WaveFront::dequeue();
+        }
+        else
+        {
+            for (int row = 0; row < heightInCU; row++)
+            {
+                processRow(row);
             }
         }
 
+        // Accumulate cost from each row
+        for (int row = 0; row < heightInCU; row++)
+        {
+            score += lhrows[row].costEst;
+            fenc->costEst[0][0] += lhrows[row].costIntra;
+            fenc->intraMbs[b - p0] += lhrows[row].intraMbs;
+        }
+
         fenc->bIntraCalculated = true;
 
-        score = fenc->costEst[b - p0][p1 - b];
-
         if (b != p1)
             score = (uint64_t)score * 100 / (130 + cfg->param.bFrameBias);
 
@@ -218,7 +254,16 @@
     return score;
 }
 
-void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
+void LookaheadRow::init() // reset this row's accumulators and progress; estimateFrameCost() calls it for every row before costing a frame
+{
+    costEst = 0;       // row total that estimateFrameCost() adds to the frame score
+    costIntra = 0;     // row total that estimateFrameCost() adds to fenc->costEst[0][0]
+    intraMbs = 0;      // row count that estimateFrameCost() adds to fenc->intraMbs[b - p0]
+    active = false;    // set to true by Lookahead::processRow() of the previous row when it enqueues this row
+    completed = 0;     // CUs finished so far; Lookahead::processRow() starts at column widthInCU - 1 - completed
+}
+
+void LookaheadRow::estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
 {
     Lowres *fref0 = frames[p0];
     Lowres *fref1 = frames[p1];
@@ -412,14 +457,14 @@
         // TOOD: i_icost += intra_penalty + lowres_penalty;
         fenc->intraCost[cuXY] = icost;
         fenc->rowSatds[0][0][cuy] += icost;
-        if (bFrameScoreCU) fenc->costEst[0][0] += icost;
+        if (bFrameScoreCU) costIntra += icost;
     }
 
     if (!bBidir)
     {
         if (fenc->intraCost[cuXY] < bcost)
         {
-            if (bFrameScoreCU) fenc->intraMbs[b - p0]++;
+            if (bFrameScoreCU) intraMbs++;
             bcost = fenc->intraCost[cuXY];
             listused = 0;
         }
@@ -429,7 +474,7 @@
     if (p0 != p1)
     {
         fenc->rowSatds[b - p0][p1 - b][cuy] += bcost;
-        if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost;
+        if (bFrameScoreCU) costEst += bcost;
     }
     fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
 }
@@ -547,6 +592,7 @@
             {
                 frames[i + 1] = &list[i]->m_lowres;
             }
+
             if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType))
                 p0 = bframes + 1;
             else // P
@@ -914,7 +960,7 @@
 {
     char paths[2][X265_LOOKAHEAD_MAX + 1];
     int num_paths = X265_MIN(cfg->param.bframes + 1, length);
-    int best_cost = me.COST_MAX;
+    int best_cost = MotionEstimate::COST_MAX;
     int idx = 0;
 
     /* Iterate over all currently possible paths */
@@ -993,3 +1039,45 @@
 
     return cost;
 }
+
+// Costs the CUs of wavefront row 'row' (row 0 is the last CU row of the frame), right to left.
+// Returns early with active = false while row - 1 is under two CUs ahead; re-entry resumes the row.
+void Lookahead::processRow(int row)
+{
+    int realrow = heightInCU - 1 - row;
+    Lowres *fenc = frames[curb];
+
+    // Zero the row totals on first entry only: a resumed row must keep the
+    // costs estimateCUCost() already added for the CUs it finished earlier.
+    bool firstEntry = lhrows[row].completed == 0;
+    if (firstEntry && !fenc->bIntraCalculated)
+        fenc->rowSatds[0][0][realrow] = 0;
+    if (firstEntry)
+        fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0;
+
+    /* Lowres lookahead goes backwards because the MVs are used as predictors
+     * in the main encode.  This considerably improves MV prediction overall. */
+    for (int i = widthInCU - 1 - lhrows[row].completed; i >= 0; i--)
+    {
+        lhrows[row].estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch);
+        lhrows[row].completed++;
+        if (lhrows[row].completed >= 2 && row < heightInCU - 1)
+        {
+            ScopedLock below(lhrows[row + 1].lock);
+            if (lhrows[row + 1].active == false &&
+                lhrows[row + 1].completed + 2 <= lhrows[row].completed)
+            {
+                lhrows[row + 1].active = true;
+                enqueueRow(row + 1);
+            }
+        }
+        ScopedLock self(lhrows[row].lock);
+        if (row > 0 && (int32_t)lhrows[row].completed < widthInCU - 1 && lhrows[row - 1].completed < lhrows[row].completed + 2)
+        {
+            lhrows[row].active = false;
+            return;
+        }
+    }
+    if (row == heightInCU - 1)
+        rowsCompleted = true;
+}
diff -r 721aa50b54be -r 38778e2c92e8 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Fri Oct 18 17:01:32 2013 +0530
+++ b/source/encoder/slicetype.h	Fri Oct 18 17:10:56 2013 +0530
@@ -26,7 +26,7 @@
 
 #include "motion.h"
 #include "piclist.h"
-#include "common.h"
+#include "wavefront.h"
 
 namespace x265 {
 // private namespace
@@ -35,14 +35,46 @@
 class TComPic;
 class TEncCfg;
 
-struct Lookahead
+struct LookaheadRow
 {
-    MotionEstimate   me;
+    Lock                lock;
+    volatile bool       active;
+    volatile uint32_t   completed;      // Number of CUs in this row for which cost estimation is completed
+    pixel*              predictions;    // buffer for 35 intra predictions
+    MotionEstimate      me;
+    int                 costEst;        // Estimated cost for all CUs in a row
+    int                 costIntra;      // Estimated Intra cost for all CUs in a row
+    int                 intraMbs;       // Number of Intra CUs
+
+    Lowres** frames;
+    int widthInCU;
+    int heightInCU;
+    int merange;
+
+    LookaheadRow()
+    {
+        me.setQP(X265_LOOKAHEAD_QP);
+        me.setSearchMethod(X265_HEX_SEARCH);
+        me.setSubpelRefine(1);
+        predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
+        merange = 16;
+    }
+
+    ~LookaheadRow()
+    {
+        X265_FREE(predictions);
+    }
+
+    void init();
+
+    void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
+};
+
+struct Lookahead : public WaveFront
+{
     TEncCfg         *cfg;
-    pixel           *predictions;   // buffer for 35 intra predictions
     Lowres          *frames[X265_LOOKAHEAD_MAX];
     Lowres          *lastNonB;
-    int              merange;
     int              numDecided;
     int              lastKeyframe;
     int              widthInCU;       // width of lowres frame in downscale CUs
@@ -51,9 +83,16 @@
     PicList inputQueue;  // input pictures in order received
     PicList outputQueue; // pictures to be encoded, in encode order
 
-    Lookahead(TEncCfg *);
+    bool bDoSearch[2];
+    int curb, curp0, curp1;
+    bool rowsCompleted;
+
+    LookaheadRow* lhrows;
+
+    Lookahead(TEncCfg *, ThreadPool *);
     ~Lookahead();
 
+    void init();
     void addPicture(TComPic*, int sliceType);
     void flush();
     void destroy();
@@ -61,13 +100,14 @@
     int getEstimatedPictureCost(TComPic *pic);
 
     int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty);
-    void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
 
     void slicetypeAnalyse(bool bKeyframe);
     int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch);
     int scenecutInternal(int p0, int p1, bool bRealScenecut);
     void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
     int slicetypePathCost(char *path, int threshold);
+
+    void processRow(int row);
 };
 }
 


More information about the x265-devel mailing list