[x265] [PATCH REVIEW] Lookahead: Implement wavefront parallel processing

Deepthi Devaki Akkoorath deepthidevaki at multicorewareinc.com
Fri Oct 18 12:43:15 CEST 2013


On Fri, Oct 18, 2013 at 5:22 AM, Steve Borho <steve at borho.org> wrote:

>
>
>
> On Thu, Oct 17, 2013 at 5:39 AM, <deepthidevaki at multicorewareinc.com>wrote:
>
>> # HG changeset patch
>> # User Deepthi Devaki <deepthidevaki at multicorewareinc.com>
>> # Date 1382006045 -19800
>> # Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d
>> # Parent  fc9dbd798ac37ec1acc0596aa179f0deb586c092
>> Lookahead: Implement wavefront parallel processing
>>
>
> this looks good; I think you have the general idea down.  It can be
> simplified a great deal, comments below
>
>
>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h
>> --- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530
>> +++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530
>> @@ -47,11 +47,6 @@
>>
>>      int m_numRows;
>>
>> -    // WaveFront's implementation of JobProvider::findJob. Consults
>> -    // m_queuedBitmap and calls ProcessRow(row) for lowest numbered
>> queued row
>> -    // or returns false
>> -    bool findJob();
>> -
>>  public:
>>
>>      WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {}
>> @@ -69,6 +64,11 @@
>>
>>      void clearEnabledRowMask();
>>
>> +    // WaveFront's implementation of JobProvider::findJob. Consults
>> +    // m_queuedBitmap and calls ProcessRow(row) for lowest numbered
>> queued row
>> +    // or returns false
>> +    bool findJob();
>> +
>>
>
> I think this is unnecessary if you refer to your own findJob() method
> (this->findJob())
>
>

Do you mean to write separate findJob() for Lookahead? But then it will be
a replica of Wavefront::findjob()


>      // Returns true if a row above curRow is available for processing.
>>  The processRow()
>>      // method may call this function periodically and voluntarily exit
>>      bool checkHigherPriorityRow(int curRow);
>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp
>> --- a/source/encoder/encoder.cpp        Thu Oct 17 14:14:40 2013 +0530
>> +++ b/source/encoder/encoder.cpp        Thu Oct 17 16:04:05 2013 +0530
>> @@ -92,6 +92,7 @@
>>          }
>>      }
>>      m_lookahead = new Lookahead(this);
>> +    m_lookahead->setThreadPool(m_threadPool);
>>      m_dpb = new DPB(this);
>>      m_rateControl = new RateControl(this);
>>  }
>> @@ -144,6 +145,7 @@
>>              m_frameEncoder[i].init(this, numRows);
>>          }
>>      }
>> +    m_lookahead->init();
>>  }
>>
>>  int Encoder::getStreamHeaders(NALUnitEBSP **nalunits)
>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp
>> --- a/source/encoder/slicetype.cpp      Thu Oct 17 14:14:40 2013 +0530
>> +++ b/source/encoder/slicetype.cpp      Thu Oct 17 16:04:05 2013 +0530
>> @@ -62,29 +62,67 @@
>>      dst.y = median(a.y, b.y, c.y);
>>  }
>>
>> -Lookahead::Lookahead(TEncCfg *_cfg)
>> +Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL)
>>
>
> Pass the thread pool pointer as a second argument and initialize WaveFront
> with it; then you don't need the setThreadPool() method
>
>
>>  {
>>      this->cfg = _cfg;
>>      numDecided = 0;
>>      lastKeyframe = -cfg->param.keyframeMax;
>>      lastNonB = NULL;
>> -    predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
>> -    me.setQP(X265_LOOKAHEAD_QP);
>> -    me.setSearchMethod(X265_HEX_SEARCH);
>> -    me.setSubpelRefine(1);
>>      merange = 16;
>>      widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1)
>> >> X265_LOWRES_CU_BITS;
>>      heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE -
>> 1) >> X265_LOWRES_CU_BITS;
>> +
>> +    lock = new Lock[heightInCU];
>> +    rowme = new MotionEstimate[heightInCU];
>> +
>> +    active = (bool*)X265_MALLOC(bool, heightInCU);
>> +    completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU);
>> +    rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU);
>> +    rowcostEst = (int*)X265_MALLOC(int, heightInCU);
>> +    rowcostIntra = (int*)X265_MALLOC(int, heightInCU);
>> +    rowIntraMbs = (int*)X265_MALLOC(int, heightInCU);
>> +    for (int i = 0; i < heightInCU; i++)
>> +    {
>> +        rowme[i].setQP(X265_LOOKAHEAD_QP);
>> +        rowme[i].setSearchMethod(X265_HEX_SEARCH);
>> +        rowme[i].setSubpelRefine(1);
>> +        rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8);
>> +        rowcostEst[i] = 0;
>> +        rowcostIntra[i] = 0;
>> +        rowIntraMbs[i] = 0;
>> +    }
>>
>
> This would be much cleaner as a per-row struct similar to CTURow
> (LookaheadRow?)
>
>
>>  }
>>
>>  Lookahead::~Lookahead()
>>  {
>>  }
>>
>> +void Lookahead::init()
>> +{
>> +    if (!WaveFront::init(heightInCU))
>> +    {
>> +        m_pool = NULL;
>> +    }
>> +    memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
>> +    memset((void*)active, 0, heightInCU * sizeof(bool));
>>
>
> these memsets are unnecessary
>
>
>> +}
>> +
>>  void Lookahead::destroy()
>>  {
>> -    if (predictions)
>> -        X265_FREE(predictions);
>> +    delete[] lock;
>> +    delete[] rowme;
>> +
>> +    X265_FREE((void*)active);
>> +    X265_FREE((void*)completed);
>> +    for (int i = 0; i < heightInCU; i++)
>> +    {
>> +        X265_FREE(rowpredictions[i]);
>> +    }
>> +
>> +    X265_FREE(rowpredictions);
>> +    X265_FREE(rowcostEst);
>> +    X265_FREE(rowcostIntra);
>> +    X265_FREE(rowIntraMbs);
>>
>>      // these two queues will be empty, unless the encode was aborted
>>      while (!inputQueue.empty())
>> @@ -165,9 +203,12 @@
>>  int Lookahead::estimateFrameCost(int p0, int p1, int b, bool
>> bIntraPenalty)
>>  {
>>      int score = 0;
>> -    bool bDoSearch[2];
>>      Lowres *fenc = frames[b];
>>
>> +    curb = b;
>> +    curp0 = p0;
>> +    curp1 = p1;
>> +
>>      if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1
>> - b][0] != -1)
>>          score = fenc->costEst[b - p0][p1 - b];
>>      else
>> @@ -185,23 +226,58 @@
>>           * predictors in the main encode.  This considerably improves MV
>>           * prediction overall. */
>>          // TODO: use lowres MVs as motion candidates in full-res search
>> -        me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
>> -        for (int j = heightInCU - 1; j >= 0; j--)
>> +
>> +        for (int i = 0; i < heightInCU; i++)
>>          {
>> -            if (!fenc->bIntraCalculated)
>> -                fenc->rowSatds[0][0][j] = 0;
>> -            fenc->rowSatds[b - p0][p1 - b][j] = 0;
>> +            rowme[i].setSourcePlane(fenc->lowresPlane[0],
>> fenc->lumaStride);
>> +        }
>>
>> -            for (int i = widthInCU - 1; i >= 0; i--)
>> +        memset((void*)completed, 0, heightInCU * sizeof(uint32_t));
>> +        memset((void*)active, 0, heightInCU * sizeof(bool));
>> +        memset((void*)rowcostEst, 0, heightInCU * sizeof(int));
>> +        memset((void*)rowcostIntra, 0, heightInCU * sizeof(int));
>> +        memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int));
>> +        rowsCompleted = false;
>> +
>> +        if (m_pool && cfg->param.bEnableWavefront)
>>
>
> checking bEnableWavefront is unnecessary here; we can always use
> wave-front lookahead scheduling so long as we have a thread pool.
>
>
>> +        {
>> +            WaveFront::clearEnabledRowMask();
>>
>
> We need another method which just sets the enabledRowMask to all 1s
> because it's only purpose is to track reference frame dependencies and we
> have none here.
>
>
>> +            WaveFront::enqueue();
>> +
>> +            for (int row = 0; row < heightInCU; row++)
>>              {
>> -                estimateCUCost(i, j, p0, p1, b, bDoSearch);
>> +                enableRow(row);
>> +                if (row == 0)
>> +                    enqueueRow(0);
>> +                else
>> +                    m_pool->pokeIdleThread();
>> +            }
>>
>
> this whole loop is unnecessary (no frame dependencies), just use
> equeueRow(0)
>
>
>> +
>> +            while (!rowsCompleted)
>> +            {
>> +                WaveFront::findJob();
>> +            }
>> +
>> +            WaveFront::dequeue();
>> +        }
>> +        else
>> +        {
>> +            for (int row = 0; row < heightInCU; row++)
>> +            {
>> +                processRow(row);
>>              }
>>          }
>>
>> +        //Accumulate cost from each row
>>
> nit: space after //
>
>> +        for (int row = 0; row < heightInCU; row++)
>> +        {
>> +            score += rowcostEst[row];
>> +            fenc->costEst[0][0] += rowcostIntra[row];
>> +            fenc->intraMbs[b - p0] += rowIntraMbs[row];
>>
>
> score must be saved in fenc->costEst[b - p0][p1 - b];
>
>

Yes. It is stored later.


> +        }
>> +
>>          fenc->bIntraCalculated = true;
>>
>> -        score = fenc->costEst[b - p0][p1 - b];
>> -
>>          if (b != p1)
>>              score = (uint64_t)score * 100 / (130 +
>> cfg->param.bFrameBias);
>>
>> @@ -218,7 +294,7 @@
>>      return score;
>>  }
>>
>> -void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b,
>> bool bDoSearch[2])
>> +void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b,
>> bool bDoSearch[2], int row)
>>
>
> perhaps this would be cleaner if it were a method of LookaheadRow
>
>
>>  {
>>      Lowres *fref0 = frames[p0];
>>      Lowres *fref1 = frames[p1];
>> @@ -233,7 +309,7 @@
>>      const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 &&
>>                                  cuy > 0 && cuy < heightInCU - 1) ||
>> widthInCU <= 2 || heightInCU <= 2;
>>
>> -    me.setSourcePU(pelOffset, cuSize, cuSize);
>> +    rowme[row].setSourcePU(pelOffset, cuSize, cuSize);
>>
>>      MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
>>                           &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
>> @@ -241,7 +317,7 @@
>>                              &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
>>
>>      MV mvmin, mvmax;
>> -    int bcost = me.COST_MAX;
>> +    int bcost = rowme[row].COST_MAX;
>>      int listused = 0;
>>
>>      // establish search bounds that don't cross extended frame boundaries
>> @@ -286,7 +362,7 @@
>>                  median_mv(mvp, mvc[0], mvc[1], mvc[2]);
>>              }
>>
>> -            *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0, mvmin,
>> mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
>> +            *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 :
>> fref0, mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]);
>>              COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
>>          }
>>          if (bBidir)
>> @@ -388,23 +464,23 @@
>>          int predsize = cuSize * cuSize;
>>
>>          // generate 35 intra predictions into tmp
>> -        primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions,
>> cuSize, cuSize, (cuSize <= 16));
>> +        primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1,
>> rowpredictions[row], cuSize, cuSize, (cuSize <= 16));
>>          pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
>>          pixel *left  = (cuSize >= 8) ? pLeft1 : pLeft0;
>> -        primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left +
>> 1, predictions + predsize, cuSize, cuSize);
>> -        primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 *
>> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
>> +        primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left +
>> 1, rowpredictions[row] + predsize, cuSize, cuSize);
>> +
>>  primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 *
>> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16));
>>
>>          // calculate 35 satd costs, keep least cost
>>          ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
>> -        primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc,
>> FENC_STRIDE);
>> +        primitives.transpose[nLog2SizeMinus2](buf_trans,
>> rowme[row].fenc, FENC_STRIDE);
>>          pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize,
>> cuSize)];
>> -        int icost = me.COST_MAX, cost;
>> +        int icost = rowme[row].COST_MAX, cost;
>>          for (UInt mode = 0; mode < 35; mode++)
>>          {
>>              if ((mode >= 2) && (mode < 18))
>> -                cost = satd(buf_trans, cuSize, &predictions[mode *
>> predsize], cuSize);
>> +                cost = satd(buf_trans, cuSize, &rowpredictions[row][mode
>> * predsize], cuSize);
>>              else
>> -                cost = satd(me.fenc, FENC_STRIDE, &predictions[mode *
>> predsize], cuSize);
>> +                cost = satd(rowme[row].fenc, FENC_STRIDE,
>> &rowpredictions[row][mode * predsize], cuSize);
>>              if (cost < icost)
>>                  icost = cost;
>>          }
>> @@ -412,14 +488,14 @@
>>          // TOOD: i_icost += intra_penalty + lowres_penalty;
>>          fenc->intraCost[cuXY] = icost;
>>          fenc->rowSatds[0][0][cuy] += icost;
>> -        if (bFrameScoreCU) fenc->costEst[0][0] += icost;
>> +        if (bFrameScoreCU) rowcostIntra[row] += icost;
>>      }
>>
>>      if (!bBidir)
>>      {
>>          if (fenc->intraCost[cuXY] < bcost)
>>          {
>> -            if (bFrameScoreCU) fenc->intraMbs[b - p0]++;
>> +            if (bFrameScoreCU) rowIntraMbs[row]++;
>>              bcost = fenc->intraCost[cuXY];
>>              listused = 0;
>>          }
>> @@ -429,7 +505,7 @@
>>      if (p0 != p1)
>>      {
>>          fenc->rowSatds[b - p0][p1 - b][cuy] += bcost;
>> -        if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost;
>> +        if (bFrameScoreCU) rowcostEst[row] += bcost;
>>      }
>>      fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost,
>> LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
>>  }
>> @@ -547,6 +623,7 @@
>>              {
>>                  frames[i + 1] = &list[i]->m_lowres;
>>              }
>> +
>>              if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType))
>>                  p0 = bframes + 1;
>>              else // P
>> @@ -914,7 +991,7 @@
>>  {
>>      char paths[2][X265_LOOKAHEAD_MAX + 1];
>>      int num_paths = X265_MIN(cfg->param.bframes + 1, length);
>> -    int best_cost = me.COST_MAX;
>> +    int best_cost = rowme[0].COST_MAX;
>>      int idx = 0;
>>
>>      /* Iterate over all currently possible paths */
>> @@ -993,3 +1070,41 @@
>>
>>      return cost;
>>  }
>> +
>> +void Lookahead::processRow(int row)
>> +{
>> +    int realrow = heightInCU - 1 - row;
>> +    Lowres *fenc = frames[curb];
>> +
>> +    if (!fenc->bIntraCalculated)
>> +        fenc->rowSatds[0][0][realrow] = 0;
>> +    fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0;
>> +    for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) //Go
>> backwards
>> +    {
>> +        estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row);
>> +        completed[row]++;
>> +
>> +        if (completed[row] >= 2 && row < heightInCU - 1)
>> +        {
>> +            ScopedLock below(lock[row + 1]);
>> +            if (active[row + 1] == false &&
>> +                completed[row + 1] + 2 <= completed[row])
>> +            {
>> +                active[row + 1] = true;
>> +                enqueueRow(row + 1);
>> +            }
>> +        }
>> +
>> +        ScopedLock self(lock[row]);
>> +        if (row > 0 && (int32_t)completed[row] < widthInCU - 1 &&
>> completed[row - 1] < completed[row] + 2)
>> +        {
>> +            active[row] = false;
>> +            return;
>> +        }
>> +    }
>> +
>> +    if (row == heightInCU - 1)
>> +    {
>> +        rowsCompleted = true;
>> +    }
>> +}
>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h
>> --- a/source/encoder/slicetype.h        Thu Oct 17 14:14:40 2013 +0530
>> +++ b/source/encoder/slicetype.h        Thu Oct 17 16:04:05 2013 +0530
>> @@ -26,7 +26,7 @@
>>
>>  #include "motion.h"
>>  #include "piclist.h"
>> -#include "common.h"
>> +#include "wavefront.h"
>>
>>  namespace x265 {
>>  // private namespace
>> @@ -35,11 +35,9 @@
>>  class TComPic;
>>  class TEncCfg;
>>
>> -struct Lookahead
>> +struct Lookahead : public WaveFront
>>  {
>> -    MotionEstimate   me;
>>      TEncCfg         *cfg;
>> -    pixel           *predictions;   // buffer for 35 intra predictions
>>      Lowres          *frames[X265_LOOKAHEAD_MAX];
>>      Lowres          *lastNonB;
>>      int              merange;
>> @@ -51,9 +49,24 @@
>>      PicList inputQueue;  // input pictures in order received
>>      PicList outputQueue; // pictures to be encoded, in encode order
>>
>> +    bool bDoSearch[2];
>> +    int curb, curp0, curp1;
>> +    bool rowsCompleted;
>> +
>> +    //For wavefront parallelism
>> +    Lock*                lock;
>> +    volatile bool*       active;
>> +    volatile uint32_t*   completed;
>> +    pixel**              rowpredictions;    // buffer for 35 intra
>> predictions
>> +    MotionEstimate*      rowme;
>> +    int*                 rowcostEst;
>> +    int*                 rowcostIntra;
>> +    int*                 rowIntraMbs;
>> +
>>      Lookahead(TEncCfg *);
>>      ~Lookahead();
>>
>> +    void init();
>>      void addPicture(TComPic*, int sliceType);
>>      void flush();
>>      void destroy();
>> @@ -61,13 +74,15 @@
>>      int getEstimatedPictureCost(TComPic *pic);
>>
>>      int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty);
>> -    void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool
>> bDoSearch[2]);
>> +    void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool
>> bDoSearch[2], int row);
>>
>>      void slicetypeAnalyse(bool bKeyframe);
>>      int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int
>> maxSearch);
>>      int scenecutInternal(int p0, int p1, bool bRealScenecut);
>>      void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX
>> + 1]);
>>      int slicetypePathCost(char *path, int threshold);
>> +
>> +    void processRow(int row);
>>  };
>>  }
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Steve Borho
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131018/ef77de9e/attachment-0001.html>


More information about the x265-devel mailing list