[x265] [PATCH 2 of 2] Lookahead: implement wavefront parallel processing
Deepthi Devaki
deepthidevaki at multicorewareinc.com
Fri Oct 18 14:51:18 CEST 2013
# HG changeset patch
# User Deepthi Devaki <deepthidevaki at multicorewareinc.com>
# Date 1382096456 -19800
# Node ID 38778e2c92e83824b48c2e22a696b7cd913b32f1
# Parent 721aa50b54be835593f602fe56e990187ffcee65
Lookahead: implement wavefront parallel processing
diff -r 721aa50b54be -r 38778e2c92e8 source/common/wavefront.h
--- a/source/common/wavefront.h Fri Oct 18 17:01:32 2013 +0530
+++ b/source/common/wavefront.h Fri Oct 18 17:10:56 2013 +0530
@@ -47,11 +47,6 @@
int m_numRows;
- // WaveFront's implementation of JobProvider::findJob. Consults
- // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
- // or returns false
- bool findJob();
-
public:
WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {}
@@ -71,6 +66,11 @@
void clearEnabledRowMask();
+ // WaveFront's implementation of JobProvider::findJob. Consults
+ // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
+ // or returns false
+ bool findJob();
+
// Returns true if a row above curRow is available for processing. The processRow()
// method may call this function periodically and voluntarily exit
bool checkHigherPriorityRow(int curRow);
diff -r 721aa50b54be -r 38778e2c92e8 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Fri Oct 18 17:01:32 2013 +0530
+++ b/source/encoder/encoder.cpp Fri Oct 18 17:10:56 2013 +0530
@@ -91,7 +91,7 @@
m_frameEncoder[i].setThreadPool(m_threadPool);
}
}
- m_lookahead = new Lookahead(this);
+ m_lookahead = new Lookahead(this, m_threadPool);
m_dpb = new DPB(this);
m_rateControl = new RateControl(this);
}
@@ -144,6 +144,7 @@
m_frameEncoder[i].init(this, numRows);
}
}
+ m_lookahead->init();
}
int Encoder::getStreamHeaders(NALUnitEBSP **nalunits)
diff -r 721aa50b54be -r 38778e2c92e8 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Fri Oct 18 17:01:32 2013 +0530
+++ b/source/encoder/slicetype.cpp Fri Oct 18 17:10:56 2013 +0530
@@ -62,29 +62,43 @@
dst.y = median(a.y, b.y, c.y);
}
-Lookahead::Lookahead(TEncCfg *_cfg)
+Lookahead::Lookahead(TEncCfg *_cfg, ThreadPool* pool) : WaveFront(pool)
{
this->cfg = _cfg;
numDecided = 0;
lastKeyframe = -cfg->param.keyframeMax;
lastNonB = NULL;
- predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
- me.setQP(X265_LOOKAHEAD_QP);
- me.setSearchMethod(X265_HEX_SEARCH);
- me.setSubpelRefine(1);
- merange = 16;
widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+
+ lhrows = new LookaheadRow[heightInCU];
+ for (int i = 0; i < heightInCU; i++)
+ {
+ lhrows[i].widthInCU = widthInCU;
+ lhrows[i].heightInCU = heightInCU;
+ lhrows[i].frames = frames;
+ }
}
Lookahead::~Lookahead()
{
}
+void Lookahead::init()
+{
+ if (!WaveFront::init(heightInCU))
+ {
+ m_pool = NULL;
+ }
+ else
+ {
+ WaveFront::enableAllRows();
+ }
+}
+
void Lookahead::destroy()
{
- if (predictions)
- X265_FREE(predictions);
+ delete[] lhrows;
// these two queues will be empty, unless the encode was aborted
while (!inputQueue.empty())
@@ -165,9 +179,12 @@
int Lookahead::estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty)
{
int score = 0;
- bool bDoSearch[2];
Lowres *fenc = frames[b];
+ curb = b;
+ curp0 = p0;
+ curp1 = p1;
+
if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
score = fenc->costEst[b - p0][p1 - b];
else
@@ -181,27 +198,46 @@
fenc->costEst[b - p0][p1 - b] = 0;
- /* Lowres lookahead goes backwards because the MVs are used as
- * predictors in the main encode. This considerably improves MV
- * prediction overall. */
// TODO: use lowres MVs as motion candidates in full-res search
- me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
- for (int j = heightInCU - 1; j >= 0; j--)
+
+ for (int i = 0; i < heightInCU; i++)
{
- if (!fenc->bIntraCalculated)
- fenc->rowSatds[0][0][j] = 0;
- fenc->rowSatds[b - p0][p1 - b][j] = 0;
+ lhrows[i].init();
+ lhrows[i].me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
+ }
- for (int i = widthInCU - 1; i >= 0; i--)
+ rowsCompleted = false;
+
+ if (m_pool)
+ {
+ WaveFront::enqueue();
+ // enableAllRows must be already called
+ enqueueRow(0);
+ while (!rowsCompleted)
{
- estimateCUCost(i, j, p0, p1, b, bDoSearch);
+ WaveFront::findJob();
+ }
+
+ WaveFront::dequeue();
+ }
+ else
+ {
+ for (int row = 0; row < heightInCU; row++)
+ {
+ processRow(row);
}
}
+ // Accumulate cost from each row
+ for (int row = 0; row < heightInCU; row++)
+ {
+ score += lhrows[row].costEst;
+ fenc->costEst[0][0] += lhrows[row].costIntra;
+ fenc->intraMbs[b - p0] += lhrows[row].intraMbs;
+ }
+
fenc->bIntraCalculated = true;
- score = fenc->costEst[b - p0][p1 - b];
-
if (b != p1)
score = (uint64_t)score * 100 / (130 + cfg->param.bFrameBias);
@@ -218,7 +254,16 @@
return score;
}
-void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
+void LookaheadRow::init()
+{
+ costEst = 0;
+ costIntra = 0;
+ intraMbs = 0;
+ active = false;
+ completed = 0;
+}
+
+void LookaheadRow::estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
{
Lowres *fref0 = frames[p0];
Lowres *fref1 = frames[p1];
@@ -412,14 +457,14 @@
// TOOD: i_icost += intra_penalty + lowres_penalty;
fenc->intraCost[cuXY] = icost;
fenc->rowSatds[0][0][cuy] += icost;
- if (bFrameScoreCU) fenc->costEst[0][0] += icost;
+ if (bFrameScoreCU) costIntra += icost;
}
if (!bBidir)
{
if (fenc->intraCost[cuXY] < bcost)
{
- if (bFrameScoreCU) fenc->intraMbs[b - p0]++;
+ if (bFrameScoreCU) intraMbs++;
bcost = fenc->intraCost[cuXY];
listused = 0;
}
@@ -429,7 +474,7 @@
if (p0 != p1)
{
fenc->rowSatds[b - p0][p1 - b][cuy] += bcost;
- if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost;
+ if (bFrameScoreCU) costEst += bcost;
}
fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}
@@ -547,6 +592,7 @@
{
frames[i + 1] = &list[i]->m_lowres;
}
+
if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType))
p0 = bframes + 1;
else // P
@@ -914,7 +960,7 @@
{
char paths[2][X265_LOOKAHEAD_MAX + 1];
int num_paths = X265_MIN(cfg->param.bframes + 1, length);
- int best_cost = me.COST_MAX;
+ int best_cost = MotionEstimate::COST_MAX;
int idx = 0;
/* Iterate over all currently possible paths */
@@ -993,3 +1039,45 @@
return cost;
}
+
+void Lookahead::processRow(int row)
+{
+ int realrow = heightInCU - 1 - row;
+ Lowres *fenc = frames[curb];
+
+ if (!fenc->bIntraCalculated)
+ fenc->rowSatds[0][0][realrow] = 0;
+ fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0;
+
+ /* Lowres lookahead goes backwards because the MVs are used as
+ * predictors in the main encode. This considerably improves MV
+ * prediction overall. */
+ for (int i = widthInCU - 1 - lhrows[row].completed; i >= 0; i--)
+ {
+ lhrows[row].estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch);
+ lhrows[row].completed++;
+
+ if (lhrows[row].completed >= 2 && row < heightInCU - 1)
+ {
+ ScopedLock below(lhrows[row + 1].lock);
+ if (lhrows[row + 1].active == false &&
+ lhrows[row + 1].completed + 2 <= lhrows[row].completed)
+ {
+ lhrows[row + 1].active = true;
+ enqueueRow(row + 1);
+ }
+ }
+
+ ScopedLock self(lhrows[row].lock);
+ if (row > 0 && (int32_t)lhrows[row].completed < widthInCU - 1 && lhrows[row - 1].completed < lhrows[row].completed + 2)
+ {
+ lhrows[row].active = false;
+ return;
+ }
+ }
+
+ if (row == heightInCU - 1)
+ {
+ rowsCompleted = true;
+ }
+}
diff -r 721aa50b54be -r 38778e2c92e8 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Fri Oct 18 17:01:32 2013 +0530
+++ b/source/encoder/slicetype.h Fri Oct 18 17:10:56 2013 +0530
@@ -26,7 +26,7 @@
#include "motion.h"
#include "piclist.h"
-#include "common.h"
+#include "wavefront.h"
namespace x265 {
// private namespace
@@ -35,14 +35,46 @@
class TComPic;
class TEncCfg;
-struct Lookahead
+struct LookaheadRow
{
- MotionEstimate me;
+ Lock lock;
+ volatile bool active;
+ volatile uint32_t completed; // Number of CUs in this row for which cost estimation is completed
+ pixel* predictions; // buffer for 35 intra predictions
+ MotionEstimate me;
+ int costEst; // Estimated cost for all CUs in a row
+ int costIntra; // Estimated Intra cost for all CUs in a row
+ int intraMbs; // Number of Intra CUs
+
+ Lowres** frames;
+ int widthInCU;
+ int heightInCU;
+ int merange;
+
+ LookaheadRow()
+ {
+ me.setQP(X265_LOOKAHEAD_QP);
+ me.setSearchMethod(X265_HEX_SEARCH);
+ me.setSubpelRefine(1);
+ predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
+ merange = 16;
+ }
+
+ ~LookaheadRow()
+ {
+ X265_FREE(predictions);
+ }
+
+ void init();
+
+ void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
+};
+
+struct Lookahead : public WaveFront
+{
TEncCfg *cfg;
- pixel *predictions; // buffer for 35 intra predictions
Lowres *frames[X265_LOOKAHEAD_MAX];
Lowres *lastNonB;
- int merange;
int numDecided;
int lastKeyframe;
int widthInCU; // width of lowres frame in downscale CUs
@@ -51,9 +83,16 @@
PicList inputQueue; // input pictures in order received
PicList outputQueue; // pictures to be encoded, in encode order
- Lookahead(TEncCfg *);
+ bool bDoSearch[2];
+ int curb, curp0, curp1;
+ bool rowsCompleted;
+
+ LookaheadRow* lhrows;
+
+ Lookahead(TEncCfg *, ThreadPool *);
~Lookahead();
+ void init();
void addPicture(TComPic*, int sliceType);
void flush();
void destroy();
@@ -61,13 +100,14 @@
int getEstimatedPictureCost(TComPic *pic);
int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty);
- void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
void slicetypeAnalyse(bool bKeyframe);
int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch);
int scenecutInternal(int p0, int p1, bool bRealScenecut);
void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
int slicetypePathCost(char *path, int threshold);
+
+ void processRow(int row);
};
}
More information about the x265-devel mailing list