[x265] [PATCH REVIEW] Lookahead: Implement wavefront parallel processing
Steve Borho
steve at borho.org
Fri Oct 18 19:42:21 CEST 2013
On Fri, Oct 18, 2013 at 5:43 AM, Deepthi Devaki Akkoorath <
deepthidevaki at multicorewareinc.com> wrote:
>
>
>
> On Fri, Oct 18, 2013 at 5:22 AM, Steve Borho <steve at borho.org> wrote:
>
>>
>>
>>
>> On Thu, Oct 17, 2013 at 5:39 AM, <deepthidevaki at multicorewareinc.com>wrote:
>>
>>> # HG changeset patch
>>> # User Deepthi Devaki <deepthidevaki at multicorewareinc.com>
>>> # Date 1382006045 -19800
>>> # Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d
>>> # Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092
>>> Lookahead: Implement wavefront parallel processing
>>>
>>
>> this looks good; I think you have the general idea down. It can be
>> simplified a great deal, comments below
>>
>>
>>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h
>>> --- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530
>>> +++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530
>>> @@ -47,11 +47,6 @@
>>>
>>> int m_numRows;
>>>
>>> - // WaveFront's implementation of JobProvider::findJob. Consults
>>> - // m_queuedBitmap and calls ProcessRow(row) for lowest numbered
>>> queued row
>>> - // or returns false
>>> - bool findJob();
>>> -
>>> public:
>>>
>>> WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0)
>>> {}
>>> @@ -69,6 +64,11 @@
>>>
>>> void clearEnabledRowMask();
>>>
>>> + // WaveFront's implementation of JobProvider::findJob. Consults
>>> + // m_queuedBitmap and calls ProcessRow(row) for lowest numbered
>>> queued row
>>> + // or returns false
>>> + bool findJob();
>>> +
>>>
>>
>> I think this is unnecessary if you refer to your own findJob() method
>> (this->findJob())
>>
>>
>
> Do you mean to write separate findJob() for Lookahead? But then it will be
> a replica of Wavefront::findjob()
>
I was confused and thought you were defining your own findJob(); disregard
this comment
>
>
>> // Returns true if a row above curRow is available for processing.
>>> The processRow()
>>> // method may call this function periodically and voluntarily exit
>>> bool checkHigherPriorityRow(int curRow);
>>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp
>>> --- a/source/encoder/encoder.cpp Thu Oct 17 14:14:40 2013 +0530
>>> +++ b/source/encoder/encoder.cpp Thu Oct 17 16:04:05 2013 +0530
>>> @@ -92,6 +92,7 @@
>>> }
>>> }
>>> m_lookahead = new Lookahead(this);
>>> + m_lookahead->setThreadPool(m_threadPool);
>>> m_dpb = new DPB(this);
>>> m_rateControl = new RateControl(this);
>>> }
>>> @@ -144,6 +145,7 @@
>>> m_frameEncoder[i].init(this, numRows);
>>> }
>>> }
>>> + m_lookahead->init();
>>> }
>>>
>>> int Encoder::getStreamHeaders(NALUnitEBSP **nalunits)
>>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp
>>> --- a/source/encoder/slicetype.cpp Thu Oct 17 14:14:40 2013 +0530
>>> +++ b/source/encoder/slicetype.cpp Thu Oct 17 16:04:05 2013 +0530
>>> @@ -62,29 +62,67 @@
>>> dst.y = median(a.y, b.y, c.y);
>>> }
>>>
>>> -Lookahead::Lookahead(TEncCfg *_cfg)
>>> +Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL)
>>>
>>
>> Pass the thread pool pointer as a second argument and initialize
>> WaveFront with it; then you don't need the setThreadPool() method
>>
>>
>>> {
>>> this->cfg = _cfg;
>>> numDecided = 0;
>>> lastKeyframe = -cfg->param.keyframeMax;
>>> lastNonB = NULL;
>>> - predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
>>> - me.setQP(X265_LOOKAHEAD_QP);
>>> - me.setSearchMethod(X265_HEX_SEARCH);
>>> - me.setSubpelRefine(1);
>>> merange = 16;
>>> widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE -
>>> 1) >> X265_LOWRES_CU_BITS;
>>> heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE -
>>> 1) >> X265_LOWRES_CU_BITS;
>>> +
>>> + lock = new Lock[heightInCU];
>>> + rowme = new MotionEstimate[heightInCU];
>>> +
>>> + active = (bool*)X265_MALLOC(bool, heightInCU);
>>> + completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU);
>>> + rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU);
>>> + rowcostEst = (int*)X265_MALLOC(int, heightInCU);
>>> + rowcostIntra = (int*)X265_MALLOC(int, heightInCU);
>>> + rowIntraMbs = (int*)X265_MALLOC(int, heightInCU);
>>> + for (int i = 0; i < heightInCU; i++)
>>> + {
>>> + rowme[i].setQP(X265_LOOKAHEAD_QP);
>>> + rowme[i].setSearchMethod(X265_HEX_SEARCH);
>>> + rowme[i].setSubpelRefine(1);
>>> + rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
>>> + rowcostEst[i] = 0;
>>> + rowcostIntra[i] = 0;
>>> + rowIntraMbs[i] = 0;
>>> + }
>>>
>>
>> This would be much cleaner as a per-row struct similar to CTURow
>> (LookaheadRow?)
>>
>>
>>> }
>>>
>>> Lookahead::~Lookahead()
>>> {
>>> }
>>>
>>> +void Lookahead::init()
>>> +{
>>> + if (!WaveFront::init(heightInCU))
>>> + {
>>> + m_pool = NULL;
>>> + }
>>> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
>>> + memset((void*)active, 0, heightInCU * sizeof(bool));
>>>
>>
>> these memsets are unnecessary
>>
>>
>>> +}
>>> +
>>> void Lookahead::destroy()
>>> {
>>> - if (predictions)
>>> - X265_FREE(predictions);
>>> + delete[] lock;
>>> + delete[] rowme;
>>> +
>>> + X265_FREE((void*)active);
>>> + X265_FREE((void*)completed);
>>> + for (int i = 0; i < heightInCU; i++)
>>> + {
>>> + X265_FREE(rowpredictions[i]);
>>> + }
>>> +
>>> + X265_FREE(rowpredictions);
>>> + X265_FREE(rowcostEst);
>>> + X265_FREE(rowcostIntra);
>>> + X265_FREE(rowIntraMbs);
>>>
>>> // these two queues will be empty, unless the encode was aborted
>>> while (!inputQueue.empty())
>>> @@ -165,9 +203,12 @@
>>> int Lookahead::estimateFrameCost(int p0, int p1, int b, bool
>>> bIntraPenalty)
>>> {
>>> int score = 0;
>>> - bool bDoSearch[2];
>>> Lowres *fenc = frames[b];
>>>
>>> + curb = b;
>>> + curp0 = p0;
>>> + curp1 = p1;
>>> +
>>> if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1
>>> - b][0] != -1)
>>> score = fenc->costEst[b - p0][p1 - b];
>>> else
>>> @@ -185,23 +226,58 @@
>>> * predictors in the main encode. This considerably improves MV
>>> * prediction overall. */
>>> // TODO: use lowres MVs as motion candidates in full-res search
>>> - me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
>>> - for (int j = heightInCU - 1; j >= 0; j--)
>>> +
>>> + for (int i = 0; i < heightInCU; i++)
>>> {
>>> - if (!fenc->bIntraCalculated)
>>> - fenc->rowSatds[0][0][j] = 0;
>>> - fenc->rowSatds[b - p0][p1 - b][j] = 0;
>>> + rowme[i].setSourcePlane(fenc->lowresPlane[0],
>>> fenc->lumaStride);
>>> + }
>>>
>>> - for (int i = widthInCU - 1; i >= 0; i--)
>>> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
>>> + memset((void*)active, 0, heightInCU * sizeof(bool));
>>> + memset((void*)rowcostEst, 0, heightInCU * sizeof(int));
>>> + memset((void*)rowcostIntra, 0, heightInCU * sizeof(int));
>>> + memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int));
>>> + rowsCompleted = false;
>>> +
>>> + if (m_pool && cfg->param.bEnableWavefront)
>>>
>>
>> checking bEnableWavefront is unnecessary here; we can always use
>> wave-front lookahead scheduling so long as we have a thread pool.
>>
>>
>>> + {
>>> + WaveFront::clearEnabledRowMask();
>>>
>>
>> We need another method which just sets the enabledRowMask to all 1s
>> because it's only purpose is to track reference frame dependencies and we
>> have none here.
>>
>>
>>> + WaveFront::enqueue();
>>> +
>>> + for (int row = 0; row < heightInCU; row++)
>>> {
>>> - estimateCUCost(i, j, p0, p1, b, bDoSearch);
>>> + enableRow(row);
>>> + if (row == 0)
>>> + enqueueRow(0);
>>> + else
>>> + m_pool->pokeIdleThread();
>>> + }
>>>
>>
>> this whole loop is unnecessary (no frame dependencies), just use
>> equeueRow(0)
>>
>>
>>> +
>>> + while (!rowsCompleted)
>>> + {
>>> + WaveFront::findJob();
>>> + }
>>> +
>>> + WaveFront::dequeue();
>>> + }
>>> + else
>>> + {
>>> + for (int row = 0; row < heightInCU; row++)
>>> + {
>>> + processRow(row);
>>> }
>>> }
>>>
>>> + //Accumulate cost from each row
>>>
>> nit: space after //
>>
>>> + for (int row = 0; row < heightInCU; row++)
>>> + {
>>> + score += rowcostEst[row];
>>> + fenc->costEst[0][0] += rowcostIntra[row];
>>> + fenc->intraMbs[b - p0] += rowIntraMbs[row];
>>>
>>
>> score must be saved in fenc->costEst[b - p0][p1 - b];
>>
>>
>
> Yes. It is stored later.
>
>
>> + }
>>> +
>>> fenc->bIntraCalculated = true;
>>>
>>> - score = fenc->costEst[b - p0][p1 - b];
>>> -
>>> if (b != p1)
>>> score = (uint64_t)score * 100 / (130 +
>>> cfg->param.bFrameBias);
>>>
>>> @@ -218,7 +294,7 @@
>>> return score;
>>> }
>>>
>>> -void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b,
>>> bool bDoSearch[2])
>>> +void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b,
>>> bool bDoSearch[2], int row)
>>>
>>
>> perhaps this would be cleaner if it were a method of LookaheadRow
>>
>>
>>> {
>>> Lowres *fref0 = frames[p0];
>>> Lowres *fref1 = frames[p1];
>>> @@ -233,7 +309,7 @@
>>> const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 &&
>>> cuy > 0 && cuy < heightInCU - 1) ||
>>> widthInCU <= 2 || heightInCU <= 2;
>>>
>>> - me.setSourcePU(pelOffset, cuSize, cuSize);
>>> + rowme[row].setSourcePU(pelOffset, cuSize, cuSize);
>>>
>>> MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
>>> &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
>>> @@ -241,7 +317,7 @@
>>> &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
>>>
>>> MV mvmin, mvmax;
>>> - int bcost = me.COST_MAX;
>>> + int bcost = rowme[row].COST_MAX;
>>> int listused = 0;
>>>
>>> // establish search bounds that don't cross extended frame
>>> boundaries
>>> @@ -286,7 +362,7 @@
>>> median_mv(mvp, mvc[0], mvc[1], mvc[2]);
>>> }
>>>
>>> - *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0,
>>> mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
>>> + *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 :
>>> fref0, mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
>>> COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
>>> }
>>> if (bBidir)
>>> @@ -388,23 +464,23 @@
>>> int predsize = cuSize * cuSize;
>>>
>>> // generate 35 intra predictions into tmp
>>> - primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions,
>>> cuSize, cuSize, (cuSize <= 16));
>>> + primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1,
>>> rowpredictions[row], cuSize, cuSize, (cuSize <= 16));
>>> pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
>>> pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0;
>>> - primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left +
>>> 1, predictions + predsize, cuSize, cuSize);
>>> - primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2
>>> * predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
>>> + primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left +
>>> 1, rowpredictions[row] + predsize, cuSize, cuSize);
>>> +
>>> primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 *
>>> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
>>>
>>> // calculate 35 satd costs, keep least cost
>>> ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
>>> - primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc,
>>> FENC_STRIDE);
>>> + primitives.transpose[nLog2SizeMinus2](buf_trans,
>>> rowme[row].fenc, FENC_STRIDE);
>>> pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize,
>>> cuSize)];
>>> - int icost = me.COST_MAX, cost;
>>> + int icost = rowme[row].COST_MAX, cost;
>>> for (UInt mode = 0; mode < 35; mode++)
>>> {
>>> if ((mode >= 2) && (mode < 18))
>>> - cost = satd(buf_trans, cuSize, &predictions[mode *
>>> predsize], cuSize);
>>> + cost = satd(buf_trans, cuSize,
>>> &rowpredictions[row][mode * predsize], cuSize);
>>> else
>>> - cost = satd(me.fenc, FENC_STRIDE, &predictions[mode *
>>> predsize], cuSize);
>>> + cost = satd(rowme[row].fenc, FENC_STRIDE,
>>> &rowpredictions[row][mode * predsize], cuSize);
>>> if (cost < icost)
>>> icost = cost;
>>> }
>>> @@ -412,14 +488,14 @@
>>> // TOOD: i_icost += intra_penalty + lowres_penalty;
>>> fenc->intraCost[cuXY] = icost;
>>> fenc->rowSatds[0][0][cuy] += icost;
>>> - if (bFrameScoreCU) fenc->costEst[0][0] += icost;
>>> + if (bFrameScoreCU) rowcostIntra[row] += icost;
>>> }
>>>
>>> if (!bBidir)
>>> {
>>> if (fenc->intraCost[cuXY] < bcost)
>>> {
>>> - if (bFrameScoreCU) fenc->intraMbs[b - p0]++;
>>> + if (bFrameScoreCU) rowIntraMbs[row]++;
>>> bcost = fenc->intraCost[cuXY];
>>> listused = 0;
>>> }
>>> @@ -429,7 +505,7 @@
>>> if (p0 != p1)
>>> {
>>> fenc->rowSatds[b - p0][p1 - b][cuy] += bcost;
>>> - if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost;
>>> + if (bFrameScoreCU) rowcostEst[row] += bcost;
>>> }
>>> fenc->lowresCosts[b - p0][p1 - b][cuXY] =
>>> (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused <<
>>> LOWRES_COST_SHIFT));
>>> }
>>> @@ -547,6 +623,7 @@
>>> {
>>> frames[i + 1] = &list[i]->m_lowres;
>>> }
>>> +
>>> if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType))
>>> p0 = bframes + 1;
>>> else // P
>>> @@ -914,7 +991,7 @@
>>> {
>>> char paths[2][X265_LOOKAHEAD_MAX + 1];
>>> int num_paths = X265_MIN(cfg->param.bframes + 1, length);
>>> - int best_cost = me.COST_MAX;
>>> + int best_cost = rowme[0].COST_MAX;
>>> int idx = 0;
>>>
>>> /* Iterate over all currently possible paths */
>>> @@ -993,3 +1070,41 @@
>>>
>>> return cost;
>>> }
>>> +
>>> +void Lookahead::processRow(int row)
>>> +{
>>> + int realrow = heightInCU - 1 - row;
>>> + Lowres *fenc = frames[curb];
>>> +
>>> + if (!fenc->bIntraCalculated)
>>> + fenc->rowSatds[0][0][realrow] = 0;
>>> + fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0;
>>> + for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) //Go
>>> backwards
>>> + {
>>> + estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row);
>>> + completed[row]++;
>>> +
>>> + if (completed[row] >= 2 && row < heightInCU - 1)
>>> + {
>>> + ScopedLock below(lock[row + 1]);
>>> + if (active[row + 1] == false &&
>>> + completed[row + 1] + 2 <= completed[row])
>>> + {
>>> + active[row + 1] = true;
>>> + enqueueRow(row + 1);
>>> + }
>>> + }
>>> +
>>> + ScopedLock self(lock[row]);
>>> + if (row > 0 && (int32_t)completed[row] < widthInCU - 1 &&
>>> completed[row - 1] < completed[row] + 2)
>>> + {
>>> + active[row] = false;
>>> + return;
>>> + }
>>> + }
>>> +
>>> + if (row == heightInCU - 1)
>>> + {
>>> + rowsCompleted = true;
>>> + }
>>> +}
>>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h
>>> --- a/source/encoder/slicetype.h Thu Oct 17 14:14:40 2013 +0530
>>> +++ b/source/encoder/slicetype.h Thu Oct 17 16:04:05 2013 +0530
>>> @@ -26,7 +26,7 @@
>>>
>>> #include "motion.h"
>>> #include "piclist.h"
>>> -#include "common.h"
>>> +#include "wavefront.h"
>>>
>>> namespace x265 {
>>> // private namespace
>>> @@ -35,11 +35,9 @@
>>> class TComPic;
>>> class TEncCfg;
>>>
>>> -struct Lookahead
>>> +struct Lookahead : public WaveFront
>>> {
>>> - MotionEstimate me;
>>> TEncCfg *cfg;
>>> - pixel *predictions; // buffer for 35 intra predictions
>>> Lowres *frames[X265_LOOKAHEAD_MAX];
>>> Lowres *lastNonB;
>>> int merange;
>>> @@ -51,9 +49,24 @@
>>> PicList inputQueue; // input pictures in order received
>>> PicList outputQueue; // pictures to be encoded, in encode order
>>>
>>> + bool bDoSearch[2];
>>> + int curb, curp0, curp1;
>>> + bool rowsCompleted;
>>> +
>>> + //For wavefront parallelism
>>> + Lock* lock;
>>> + volatile bool* active;
>>> + volatile uint32_t* completed;
>>> + pixel** rowpredictions; // buffer for 35 intra
>>> predictions
>>> + MotionEstimate* rowme;
>>> + int* rowcostEst;
>>> + int* rowcostIntra;
>>> + int* rowIntraMbs;
>>> +
>>> Lookahead(TEncCfg *);
>>> ~Lookahead();
>>>
>>> + void init();
>>> void addPicture(TComPic*, int sliceType);
>>> void flush();
>>> void destroy();
>>> @@ -61,13 +74,15 @@
>>> int getEstimatedPictureCost(TComPic *pic);
>>>
>>> int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty);
>>> - void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool
>>> bDoSearch[2]);
>>> + void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool
>>> bDoSearch[2], int row);
>>>
>>> void slicetypeAnalyse(bool bKeyframe);
>>> int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int
>>> maxSearch);
>>> int scenecutInternal(int p0, int p1, bool bRealScenecut);
>>> void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX
>>> + 1]);
>>> int slicetypePathCost(char *path, int threshold);
>>> +
>>> + void processRow(int row);
>>> };
>>> }
>>>
>>> _______________________________________________
>>> x265-devel mailing list
>>> x265-devel at videolan.org
>>> https://mailman.videolan.org/listinfo/x265-devel
>>>
>>
>>
>>
>> --
>> Steve Borho
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131018/0ae3338b/attachment-0001.html>
More information about the x265-devel
mailing list