[x265] [PATCH REVIEW] Lookahead: Implement wavefront parallel processing
Steve Borho
steve at borho.org
Fri Oct 18 01:52:22 CEST 2013
On Thu, Oct 17, 2013 at 5:39 AM, <deepthidevaki at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Deepthi Devaki <deepthidevaki at multicorewareinc.com>
> # Date 1382006045 -19800
> # Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d
> # Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092
> Lookahead: Implement wavefront parallel processing
>
this looks good; I think you have the general idea down. It can be
simplified a great deal, comments below
> diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h
> --- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530
> +++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530
> @@ -47,11 +47,6 @@
>
> int m_numRows;
>
> - // WaveFront's implementation of JobProvider::findJob. Consults
> - // m_queuedBitmap and calls ProcessRow(row) for lowest numbered
> queued row
> - // or returns false
> - bool findJob();
> -
> public:
>
> WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {}
> @@ -69,6 +64,11 @@
>
> void clearEnabledRowMask();
>
> + // WaveFront's implementation of JobProvider::findJob. Consults
> + // m_queuedBitmap and calls ProcessRow(row) for lowest numbered
> queued row
> + // or returns false
> + bool findJob();
> +
>
I think this is unnecessary if you refer to your own findJob() method
(this->findJob())
> // Returns true if a row above curRow is available for processing.
> The processRow()
> // method may call this function periodically and voluntarily exit
> bool checkHigherPriorityRow(int curRow);
> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Thu Oct 17 14:14:40 2013 +0530
> +++ b/source/encoder/encoder.cpp Thu Oct 17 16:04:05 2013 +0530
> @@ -92,6 +92,7 @@
> }
> }
> m_lookahead = new Lookahead(this);
> + m_lookahead->setThreadPool(m_threadPool);
> m_dpb = new DPB(this);
> m_rateControl = new RateControl(this);
> }
> @@ -144,6 +145,7 @@
> m_frameEncoder[i].init(this, numRows);
> }
> }
> + m_lookahead->init();
> }
>
> int Encoder::getStreamHeaders(NALUnitEBSP **nalunits)
> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp Thu Oct 17 14:14:40 2013 +0530
> +++ b/source/encoder/slicetype.cpp Thu Oct 17 16:04:05 2013 +0530
> @@ -62,29 +62,67 @@
> dst.y = median(a.y, b.y, c.y);
> }
>
> -Lookahead::Lookahead(TEncCfg *_cfg)
> +Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL)
>
Pass the thread pool pointer as a second argument and initialize WaveFront
with it; then you don't need the setThreadPool() method
> {
> this->cfg = _cfg;
> numDecided = 0;
> lastKeyframe = -cfg->param.keyframeMax;
> lastNonB = NULL;
> - predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
> - me.setQP(X265_LOOKAHEAD_QP);
> - me.setSearchMethod(X265_HEX_SEARCH);
> - me.setSubpelRefine(1);
> merange = 16;
> widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE -
> 1) >> X265_LOWRES_CU_BITS;
> +
> + lock = new Lock[heightInCU];
> + rowme = new MotionEstimate[heightInCU];
> +
> + active = (bool*)X265_MALLOC(bool, heightInCU);
> + completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU);
> + rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU);
> + rowcostEst = (int*)X265_MALLOC(int, heightInCU);
> + rowcostIntra = (int*)X265_MALLOC(int, heightInCU);
> + rowIntraMbs = (int*)X265_MALLOC(int, heightInCU);
> + for (int i = 0; i < heightInCU; i++)
> + {
> + rowme[i].setQP(X265_LOOKAHEAD_QP);
> + rowme[i].setSearchMethod(X265_HEX_SEARCH);
> + rowme[i].setSubpelRefine(1);
> + rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
> + rowcostEst[i] = 0;
> + rowcostIntra[i] = 0;
> + rowIntraMbs[i] = 0;
> + }
>
This would be much cleaner as a per-row struct similar to CTURow
(LookaheadRow?)
> }
>
> Lookahead::~Lookahead()
> {
> }
>
> +void Lookahead::init()
> +{
> + if (!WaveFront::init(heightInCU))
> + {
> + m_pool = NULL;
> + }
> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
> + memset((void*)active, 0, heightInCU * sizeof(bool));
>
these memsets are unnecessary
> +}
> +
> void Lookahead::destroy()
> {
> - if (predictions)
> - X265_FREE(predictions);
> + delete[] lock;
> + delete[] rowme;
> +
> + X265_FREE((void*)active);
> + X265_FREE((void*)completed);
> + for (int i = 0; i < heightInCU; i++)
> + {
> + X265_FREE(rowpredictions[i]);
> + }
> +
> + X265_FREE(rowpredictions);
> + X265_FREE(rowcostEst);
> + X265_FREE(rowcostIntra);
> + X265_FREE(rowIntraMbs);
>
> // these two queues will be empty, unless the encode was aborted
> while (!inputQueue.empty())
> @@ -165,9 +203,12 @@
> int Lookahead::estimateFrameCost(int p0, int p1, int b, bool
> bIntraPenalty)
> {
> int score = 0;
> - bool bDoSearch[2];
> Lowres *fenc = frames[b];
>
> + curb = b;
> + curp0 = p0;
> + curp1 = p1;
> +
> if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 -
> b][0] != -1)
> score = fenc->costEst[b - p0][p1 - b];
> else
> @@ -185,23 +226,58 @@
> * predictors in the main encode. This considerably improves MV
> * prediction overall. */
> // TODO: use lowres MVs as motion candidates in full-res search
> - me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
> - for (int j = heightInCU - 1; j >= 0; j--)
> +
> + for (int i = 0; i < heightInCU; i++)
> {
> - if (!fenc->bIntraCalculated)
> - fenc->rowSatds[0][0][j] = 0;
> - fenc->rowSatds[b - p0][p1 - b][j] = 0;
> + rowme[i].setSourcePlane(fenc->lowresPlane[0],
> fenc->lumaStride);
> + }
>
> - for (int i = widthInCU - 1; i >= 0; i--)
> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
> + memset((void*)active, 0, heightInCU * sizeof(bool));
> + memset((void*)rowcostEst, 0, heightInCU * sizeof(int));
> + memset((void*)rowcostIntra, 0, heightInCU * sizeof(int));
> + memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int));
> + rowsCompleted = false;
> +
> + if (m_pool && cfg->param.bEnableWavefront)
>
checking bEnableWavefront is unnecessary here; we can always use wave-front
lookahead scheduling so long as we have a thread pool.
> + {
> + WaveFront::clearEnabledRowMask();
>
We need another method which just sets the enabledRowMask to all 1s because
it's only purpose is to track reference frame dependencies and we have none
here.
> + WaveFront::enqueue();
> +
> + for (int row = 0; row < heightInCU; row++)
> {
> - estimateCUCost(i, j, p0, p1, b, bDoSearch);
> + enableRow(row);
> + if (row == 0)
> + enqueueRow(0);
> + else
> + m_pool->pokeIdleThread();
> + }
>
this whole loop is unnecessary (no frame dependencies), just use
equeueRow(0)
> +
> + while (!rowsCompleted)
> + {
> + WaveFront::findJob();
> + }
> +
> + WaveFront::dequeue();
> + }
> + else
> + {
> + for (int row = 0; row < heightInCU; row++)
> + {
> + processRow(row);
> }
> }
>
> + //Accumulate cost from each row
>
nit: space after //
> + for (int row = 0; row < heightInCU; row++)
> + {
> + score += rowcostEst[row];
> + fenc->costEst[0][0] += rowcostIntra[row];
> + fenc->intraMbs[b - p0] += rowIntraMbs[row];
>
score must be saved in fenc->costEst[b - p0][p1 - b];
> + }
> +
> fenc->bIntraCalculated = true;
>
> - score = fenc->costEst[b - p0][p1 - b];
> -
> if (b != p1)
> score = (uint64_t)score * 100 / (130 + cfg->param.bFrameBias);
>
> @@ -218,7 +294,7 @@
> return score;
> }
>
> -void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b,
> bool bDoSearch[2])
> +void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b,
> bool bDoSearch[2], int row)
>
perhaps this would be cleaner if it were a method of LookaheadRow
> {
> Lowres *fref0 = frames[p0];
> Lowres *fref1 = frames[p1];
> @@ -233,7 +309,7 @@
> const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 &&
> cuy > 0 && cuy < heightInCU - 1) ||
> widthInCU <= 2 || heightInCU <= 2;
>
> - me.setSourcePU(pelOffset, cuSize, cuSize);
> + rowme[row].setSourcePU(pelOffset, cuSize, cuSize);
>
> MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
> &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
> @@ -241,7 +317,7 @@
> &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
>
> MV mvmin, mvmax;
> - int bcost = me.COST_MAX;
> + int bcost = rowme[row].COST_MAX;
> int listused = 0;
>
> // establish search bounds that don't cross extended frame boundaries
> @@ -286,7 +362,7 @@
> median_mv(mvp, mvc[0], mvc[1], mvc[2]);
> }
>
> - *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0, mvmin,
> mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
> + *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 : fref0,
> mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
> COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
> }
> if (bBidir)
> @@ -388,23 +464,23 @@
> int predsize = cuSize * cuSize;
>
> // generate 35 intra predictions into tmp
> - primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions,
> cuSize, cuSize, (cuSize <= 16));
> + primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1,
> rowpredictions[row], cuSize, cuSize, (cuSize <= 16));
> pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
> pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0;
> - primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1,
> predictions + predsize, cuSize, cuSize);
> - primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 *
> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
> + primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1,
> rowpredictions[row] + predsize, cuSize, cuSize);
> +
> primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 *
> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
>
> // calculate 35 satd costs, keep least cost
> ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
> - primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc,
> FENC_STRIDE);
> + primitives.transpose[nLog2SizeMinus2](buf_trans, rowme[row].fenc,
> FENC_STRIDE);
> pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize,
> cuSize)];
> - int icost = me.COST_MAX, cost;
> + int icost = rowme[row].COST_MAX, cost;
> for (UInt mode = 0; mode < 35; mode++)
> {
> if ((mode >= 2) && (mode < 18))
> - cost = satd(buf_trans, cuSize, &predictions[mode *
> predsize], cuSize);
> + cost = satd(buf_trans, cuSize, &rowpredictions[row][mode
> * predsize], cuSize);
> else
> - cost = satd(me.fenc, FENC_STRIDE, &predictions[mode *
> predsize], cuSize);
> + cost = satd(rowme[row].fenc, FENC_STRIDE,
> &rowpredictions[row][mode * predsize], cuSize);
> if (cost < icost)
> icost = cost;
> }
> @@ -412,14 +488,14 @@
> // TOOD: i_icost += intra_penalty + lowres_penalty;
> fenc->intraCost[cuXY] = icost;
> fenc->rowSatds[0][0][cuy] += icost;
> - if (bFrameScoreCU) fenc->costEst[0][0] += icost;
> + if (bFrameScoreCU) rowcostIntra[row] += icost;
> }
>
> if (!bBidir)
> {
> if (fenc->intraCost[cuXY] < bcost)
> {
> - if (bFrameScoreCU) fenc->intraMbs[b - p0]++;
> + if (bFrameScoreCU) rowIntraMbs[row]++;
> bcost = fenc->intraCost[cuXY];
> listused = 0;
> }
> @@ -429,7 +505,7 @@
> if (p0 != p1)
> {
> fenc->rowSatds[b - p0][p1 - b][cuy] += bcost;
> - if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost;
> + if (bFrameScoreCU) rowcostEst[row] += bcost;
> }
> fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost,
> LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
> }
> @@ -547,6 +623,7 @@
> {
> frames[i + 1] = &list[i]->m_lowres;
> }
> +
> if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType))
> p0 = bframes + 1;
> else // P
> @@ -914,7 +991,7 @@
> {
> char paths[2][X265_LOOKAHEAD_MAX + 1];
> int num_paths = X265_MIN(cfg->param.bframes + 1, length);
> - int best_cost = me.COST_MAX;
> + int best_cost = rowme[0].COST_MAX;
> int idx = 0;
>
> /* Iterate over all currently possible paths */
> @@ -993,3 +1070,41 @@
>
> return cost;
> }
> +
> +void Lookahead::processRow(int row)
> +{
> + int realrow = heightInCU - 1 - row;
> + Lowres *fenc = frames[curb];
> +
> + if (!fenc->bIntraCalculated)
> + fenc->rowSatds[0][0][realrow] = 0;
> + fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0;
> + for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) //Go
> backwards
> + {
> + estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row);
> + completed[row]++;
> +
> + if (completed[row] >= 2 && row < heightInCU - 1)
> + {
> + ScopedLock below(lock[row + 1]);
> + if (active[row + 1] == false &&
> + completed[row + 1] + 2 <= completed[row])
> + {
> + active[row + 1] = true;
> + enqueueRow(row + 1);
> + }
> + }
> +
> + ScopedLock self(lock[row]);
> + if (row > 0 && (int32_t)completed[row] < widthInCU - 1 &&
> completed[row - 1] < completed[row] + 2)
> + {
> + active[row] = false;
> + return;
> + }
> + }
> +
> + if (row == heightInCU - 1)
> + {
> + rowsCompleted = true;
> + }
> +}
> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h
> --- a/source/encoder/slicetype.h Thu Oct 17 14:14:40 2013 +0530
> +++ b/source/encoder/slicetype.h Thu Oct 17 16:04:05 2013 +0530
> @@ -26,7 +26,7 @@
>
> #include "motion.h"
> #include "piclist.h"
> -#include "common.h"
> +#include "wavefront.h"
>
> namespace x265 {
> // private namespace
> @@ -35,11 +35,9 @@
> class TComPic;
> class TEncCfg;
>
> -struct Lookahead
> +struct Lookahead : public WaveFront
> {
> - MotionEstimate me;
> TEncCfg *cfg;
> - pixel *predictions; // buffer for 35 intra predictions
> Lowres *frames[X265_LOOKAHEAD_MAX];
> Lowres *lastNonB;
> int merange;
> @@ -51,9 +49,24 @@
> PicList inputQueue; // input pictures in order received
> PicList outputQueue; // pictures to be encoded, in encode order
>
> + bool bDoSearch[2];
> + int curb, curp0, curp1;
> + bool rowsCompleted;
> +
> + //For wavefront parallelism
> + Lock* lock;
> + volatile bool* active;
> + volatile uint32_t* completed;
> + pixel** rowpredictions; // buffer for 35 intra
> predictions
> + MotionEstimate* rowme;
> + int* rowcostEst;
> + int* rowcostIntra;
> + int* rowIntraMbs;
> +
> Lookahead(TEncCfg *);
> ~Lookahead();
>
> + void init();
> void addPicture(TComPic*, int sliceType);
> void flush();
> void destroy();
> @@ -61,13 +74,15 @@
> int getEstimatedPictureCost(TComPic *pic);
>
> int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty);
> - void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool
> bDoSearch[2]);
> + void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool
> bDoSearch[2], int row);
>
> void slicetypeAnalyse(bool bKeyframe);
> int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int
> maxSearch);
> int scenecutInternal(int p0, int p1, bool bRealScenecut);
> void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX +
> 1]);
> int slicetypePathCost(char *path, int threshold);
> +
> + void processRow(int row);
> };
> }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131017/a1fde673/attachment-0001.html>
More information about the x265-devel
mailing list