[x265] [PATCH REVIEW] Lookahead: Implement wavefront parallel processing
deepthidevaki at multicorewareinc.com
deepthidevaki at multicorewareinc.com
Thu Oct 17 12:39:09 CEST 2013
# HG changeset patch
# User Deepthi Devaki <deepthidevaki at multicorewareinc.com>
# Date 1382006045 -19800
# Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d
# Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092
Lookahead: Implement wavefront parallel processing
diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h
--- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530
+++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530
@@ -47,11 +47,6 @@
int m_numRows;
- // WaveFront's implementation of JobProvider::findJob. Consults
- // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
- // or returns false
- bool findJob();
-
public:
WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {}
@@ -69,6 +64,11 @@
void clearEnabledRowMask();
+ // WaveFront's implementation of JobProvider::findJob. Consults
+ // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
+ // or returns false
+ bool findJob();
+
// Returns true if a row above curRow is available for processing. The processRow()
// method may call this function periodically and voluntarily exit
bool checkHigherPriorityRow(int curRow);
diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Thu Oct 17 14:14:40 2013 +0530
+++ b/source/encoder/encoder.cpp Thu Oct 17 16:04:05 2013 +0530
@@ -92,6 +92,7 @@
}
}
m_lookahead = new Lookahead(this);
+ m_lookahead->setThreadPool(m_threadPool);
m_dpb = new DPB(this);
m_rateControl = new RateControl(this);
}
@@ -144,6 +145,7 @@
m_frameEncoder[i].init(this, numRows);
}
}
+ m_lookahead->init();
}
int Encoder::getStreamHeaders(NALUnitEBSP **nalunits)
diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Thu Oct 17 14:14:40 2013 +0530
+++ b/source/encoder/slicetype.cpp Thu Oct 17 16:04:05 2013 +0530
@@ -62,29 +62,67 @@
dst.y = median(a.y, b.y, c.y);
}
-Lookahead::Lookahead(TEncCfg *_cfg)
+Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL) // no pool yet; encoder.cpp calls setThreadPool() right after construction
{
this->cfg = _cfg;
numDecided = 0;
lastKeyframe = -cfg->param.keyframeMax;
lastNonB = NULL;
- predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
- me.setQP(X265_LOOKAHEAD_QP);
- me.setSearchMethod(X265_HEX_SEARCH);
- me.setSubpelRefine(1);
merange = 16;
widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; // CU columns of the half-width frame, rounded up
heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; // CU rows of the half-height frame, rounded up
+ // Per-row state: every array below has heightInCU entries. NOTE(review): none of these allocations is checked for failure.
+ lock = new Lock[heightInCU]; // one lock per CU row, taken in processRow()
+ rowme = new MotionEstimate[heightInCU]; // one MotionEstimate per row replaces the single shared 'me'
+
+ active = (bool*)X265_MALLOC(bool, heightInCU); // set to true when processRow() enqueues the row
+ completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU); // number of CUs already estimated in each row
+ rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU); // per-row intra prediction buffers
+ rowcostEst = (int*)X265_MALLOC(int, heightInCU); // per-row partial sums, added up in estimateFrameCost()
+ rowcostIntra = (int*)X265_MALLOC(int, heightInCU);
+ rowIntraMbs = (int*)X265_MALLOC(int, heightInCU);
+ for (int i = 0; i < heightInCU; i++)
+ {
+ rowme[i].setQP(X265_LOOKAHEAD_QP); // same ME settings the removed shared 'me' used
+ rowme[i].setSearchMethod(X265_HEX_SEARCH);
+ rowme[i].setSubpelRefine(1);
+ rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); // room for 35 predictions of 8 * 8 pixels each
+ rowcostEst[i] = 0;
+ rowcostIntra[i] = 0;
+ rowIntraMbs[i] = 0;
+ }
}
Lookahead::~Lookahead() // empty: the per-row buffers are released by destroy(), not here
{
}
+// Sets up the WaveFront base for heightInCU rows and clears the per-row
+// progress bookkeeping (completed / active).
+void Lookahead::init()
+{
+    // If the wavefront cannot be initialised, drop the pool pointer;
+    // estimateFrameCost() tests m_pool before taking the parallel path.
+    const bool wavefrontReady = WaveFront::init(heightInCU);
+    if (!wavefrontReady)
+        m_pool = NULL;
+
+    // No row has made progress or been enqueued yet.
+    memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
+    memset((void*)active, 0, heightInCU * sizeof(bool));
+}
+
void Lookahead::destroy()
{
- if (predictions)
- X265_FREE(predictions);
+ delete[] lock;
+ delete[] rowme;
+
+ X265_FREE((void*)active);
+ X265_FREE((void*)completed);
+ for (int i = 0; i < heightInCU; i++)
+ {
+ X265_FREE(rowpredictions[i]);
+ }
+
+ X265_FREE(rowpredictions);
+ X265_FREE(rowcostEst);
+ X265_FREE(rowcostIntra);
+ X265_FREE(rowIntraMbs);
// these two queues will be empty, unless the encode was aborted
while (!inputQueue.empty())
@@ -165,9 +203,12 @@
int Lookahead::estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty)
{
int score = 0;
- bool bDoSearch[2];
Lowres *fenc = frames[b];
+ curb = b;
+ curp0 = p0;
+ curp1 = p1;
+
if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
score = fenc->costEst[b - p0][p1 - b];
else
@@ -185,23 +226,58 @@
* predictors in the main encode. This considerably improves MV
* prediction overall. */
// TODO: use lowres MVs as motion candidates in full-res search
- me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
- for (int j = heightInCU - 1; j >= 0; j--)
+
+ for (int i = 0; i < heightInCU; i++)
{
- if (!fenc->bIntraCalculated)
- fenc->rowSatds[0][0][j] = 0;
- fenc->rowSatds[b - p0][p1 - b][j] = 0;
+ rowme[i].setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
+ }
- for (int i = widthInCU - 1; i >= 0; i--)
+ memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
+ memset((void*)active, 0, heightInCU * sizeof(bool));
+ memset((void*)rowcostEst, 0, heightInCU * sizeof(int));
+ memset((void*)rowcostIntra, 0, heightInCU * sizeof(int));
+ memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int));
+ rowsCompleted = false;
+
+ if (m_pool && cfg->param.bEnableWavefront)
+ {
+ WaveFront::clearEnabledRowMask();
+ WaveFront::enqueue();
+
+ for (int row = 0; row < heightInCU; row++)
{
- estimateCUCost(i, j, p0, p1, b, bDoSearch);
+ enableRow(row);
+ if (row == 0)
+ enqueueRow(0);
+ else
+ m_pool->pokeIdleThread();
+ }
+
+ while (!rowsCompleted)
+ {
+ WaveFront::findJob();
+ }
+
+ WaveFront::dequeue();
+ }
+ else
+ {
+ for (int row = 0; row < heightInCU; row++)
+ {
+ processRow(row);
}
}
+ //Accumulate cost from each row
+ for (int row = 0; row < heightInCU; row++)
+ {
+ score += rowcostEst[row];
+ fenc->costEst[0][0] += rowcostIntra[row];
+ fenc->intraMbs[b - p0] += rowIntraMbs[row];
+ }
+
fenc->bIntraCalculated = true;
- score = fenc->costEst[b - p0][p1 - b];
-
if (b != p1)
score = (uint64_t)score * 100 / (130 + cfg->param.bFrameBias);
@@ -218,7 +294,7 @@
return score;
}
-void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
+void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], int row)
{
Lowres *fref0 = frames[p0];
Lowres *fref1 = frames[p1];
@@ -233,7 +309,7 @@
const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 &&
cuy > 0 && cuy < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
- me.setSourcePU(pelOffset, cuSize, cuSize);
+ rowme[row].setSourcePU(pelOffset, cuSize, cuSize);
MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
&fenc->lowresMvs[1][p1 - b - 1][cuXY] };
@@ -241,7 +317,7 @@
&fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
MV mvmin, mvmax;
- int bcost = me.COST_MAX;
+ int bcost = rowme[row].COST_MAX;
int listused = 0;
// establish search bounds that don't cross extended frame boundaries
@@ -286,7 +362,7 @@
median_mv(mvp, mvc[0], mvc[1], mvc[2]);
}
- *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0, mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
+ *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 : fref0, mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
}
if (bBidir)
@@ -388,23 +464,23 @@
int predsize = cuSize * cuSize;
// generate 35 intra predictions into tmp
- primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, cuSize, cuSize, (cuSize <= 16));
+ primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, rowpredictions[row], cuSize, cuSize, (cuSize <= 16));
pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0;
- primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, predictions + predsize, cuSize, cuSize);
- primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 * predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, rowpredictions[row] + predsize, cuSize, cuSize);
+ primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 * predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
// calculate 35 satd costs, keep least cost
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
- primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, FENC_STRIDE);
+ primitives.transpose[nLog2SizeMinus2](buf_trans, rowme[row].fenc, FENC_STRIDE);
pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize, cuSize)];
- int icost = me.COST_MAX, cost;
+ int icost = rowme[row].COST_MAX, cost;
for (UInt mode = 0; mode < 35; mode++)
{
if ((mode >= 2) && (mode < 18))
- cost = satd(buf_trans, cuSize, &predictions[mode * predsize], cuSize);
+ cost = satd(buf_trans, cuSize, &rowpredictions[row][mode * predsize], cuSize);
else
- cost = satd(me.fenc, FENC_STRIDE, &predictions[mode * predsize], cuSize);
+ cost = satd(rowme[row].fenc, FENC_STRIDE, &rowpredictions[row][mode * predsize], cuSize);
if (cost < icost)
icost = cost;
}
@@ -412,14 +488,14 @@
// TODO: i_icost += intra_penalty + lowres_penalty;
fenc->intraCost[cuXY] = icost;
fenc->rowSatds[0][0][cuy] += icost;
- if (bFrameScoreCU) fenc->costEst[0][0] += icost;
+ if (bFrameScoreCU) rowcostIntra[row] += icost;
}
if (!bBidir)
{
if (fenc->intraCost[cuXY] < bcost)
{
- if (bFrameScoreCU) fenc->intraMbs[b - p0]++;
+ if (bFrameScoreCU) rowIntraMbs[row]++;
bcost = fenc->intraCost[cuXY];
listused = 0;
}
@@ -429,7 +505,7 @@
if (p0 != p1)
{
fenc->rowSatds[b - p0][p1 - b][cuy] += bcost;
- if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost;
+ if (bFrameScoreCU) rowcostEst[row] += bcost;
}
fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}
@@ -547,6 +623,7 @@
{
frames[i + 1] = &list[i]->m_lowres;
}
+
if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType))
p0 = bframes + 1;
else // P
@@ -914,7 +991,7 @@
{
char paths[2][X265_LOOKAHEAD_MAX + 1];
int num_paths = X265_MIN(cfg->param.bframes + 1, length);
- int best_cost = me.COST_MAX;
+ int best_cost = rowme[0].COST_MAX;
int idx = 0;
/* Iterate over all currently possible paths */
@@ -993,3 +1070,41 @@
return cost;
}
+
+// WaveFront row callback: estimates the cost of the CUs in one row of the
+// lowres frame selected by curb / curp0 / curp1.
+//
+// row - wavefront row index. Row 0 maps to the bottom CU row of the frame
+//       (realrow = heightInCU - 1 - row) and CUs are visited right to left.
+//
+// The function may return before the row is finished when row - 1 is not at
+// least two CUs ahead. completed[row] records how many CUs were done, so a
+// later call for the same row resumes where this one stopped.
+void Lookahead::processRow(int row)
+{
+    int realrow = heightInCU - 1 - row;
+    Lowres *fenc = frames[curb];
+
+    // Reset the row SATD accumulators only on the first visit. A row that
+    // yielded earlier is re-entered with completed[row] > 0, and zeroing here
+    // would discard the costs estimateCUCost() already added for this row.
+    if (completed[row] == 0)
+    {
+        if (!fenc->bIntraCalculated)
+            fenc->rowSatds[0][0][realrow] = 0;
+        fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0;
+    }
+
+    for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) // go backwards, skipping CUs already done
+    {
+        estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row);
+        completed[row]++;
+
+        if (completed[row] >= 2 && row < heightInCU - 1)
+        {
+            // Start (or restart) the next row once this row is at least two
+            // CUs ahead of it and it is not currently running.
+            ScopedLock below(lock[row + 1]);
+            if (active[row + 1] == false &&
+                completed[row + 1] + 2 <= completed[row])
+            {
+                active[row + 1] = true;
+                enqueueRow(row + 1);
+            }
+        }
+
+        // Yield when at least two CUs remain in this row and row - 1 is fewer
+        // than two CUs ahead; row - 1 re-enqueues this row in the block above
+        // once it has advanced far enough.
+        ScopedLock self(lock[row]);
+        if (row > 0 && (int32_t)completed[row] < widthInCU - 1 && completed[row - 1] < completed[row] + 2)
+        {
+            active[row] = false;
+            return;
+        }
+    }
+
+    // The last wavefront row finishing ends the frame; estimateFrameCost()
+    // polls this flag.
+    // NOTE(review): rowsCompleted is a plain bool read from another thread;
+    // confirm the polling loop is guaranteed to observe this store.
+    if (row == heightInCU - 1)
+    {
+        rowsCompleted = true;
+    }
+}
diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Thu Oct 17 14:14:40 2013 +0530
+++ b/source/encoder/slicetype.h Thu Oct 17 16:04:05 2013 +0530
@@ -26,7 +26,7 @@
#include "motion.h"
#include "piclist.h"
-#include "common.h"
+#include "wavefront.h"
namespace x265 {
// private namespace
@@ -35,11 +35,9 @@
class TComPic;
class TEncCfg;
-struct Lookahead
+struct Lookahead : public WaveFront
{
- MotionEstimate me;
TEncCfg *cfg;
- pixel *predictions; // buffer for 35 intra predictions
Lowres *frames[X265_LOOKAHEAD_MAX];
Lowres *lastNonB;
int merange;
@@ -51,9 +49,24 @@
PicList inputQueue; // input pictures in order received
PicList outputQueue; // pictures to be encoded, in encode order
+ bool bDoSearch[2];
+ int curb, curp0, curp1;
+ bool rowsCompleted;
+
+ //For wavefront parallelism
+ Lock* lock;
+ volatile bool* active;
+ volatile uint32_t* completed;
+ pixel** rowpredictions; // buffer for 35 intra predictions
+ MotionEstimate* rowme;
+ int* rowcostEst;
+ int* rowcostIntra;
+ int* rowIntraMbs;
+
Lookahead(TEncCfg *);
~Lookahead();
+ void init();
void addPicture(TComPic*, int sliceType);
void flush();
void destroy();
@@ -61,13 +74,15 @@
int getEstimatedPictureCost(TComPic *pic);
int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty);
- void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
+ void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], int row);
void slicetypeAnalyse(bool bKeyframe);
int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch);
int scenecutInternal(int p0, int p1, bool bRealScenecut);
void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
int slicetypePathCost(char *path, int threshold);
+
+ void processRow(int row);
};
}
More information about the x265-devel
mailing list