[x265] [PATCH] vbv hanging issue; fix for multiple slices

Ashok Kumar Mishra ashok at multicorewareinc.com
Tue Sep 26 15:19:20 CEST 2017


Please find the attached patch.

On Thu, Sep 21, 2017 at 8:21 PM, <ashok at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Ashok Kumar Mishra <ashok at multicorewareinc.com>
> # Date 1506005452 -19800
> #      Thu Sep 21 20:20:52 2017 +0530
> # Node ID 546387e0b983ac1d68cda73777b34a122928cd32
> # Parent  71f700844b0b2a9120bfd8a2d1f13e219aa20677
> vbv hanging issue; fix for multiple slices
> When multiple slices are enabled, VBV rate control must operate on the rows
> of the correct slice, because multiple slices are encoded simultaneously.
>
> diff -r 71f700844b0b -r 546387e0b983 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp   Tue Sep 12 18:13:03 2017 +0530
> +++ b/source/encoder/frameencoder.cpp   Thu Sep 21 20:20:52 2017 +0530
> @@ -88,6 +88,7 @@
>      delete[] m_outStreams;
>      delete[] m_backupStreams;
>      X265_FREE(m_sliceBaseRow);
> +    X265_FREE(m_sliceMaxBlockRow);
>      X265_FREE(m_cuGeoms);
>      X265_FREE(m_ctuGeomMap);
>      X265_FREE(m_substreamSizes);
> @@ -118,6 +119,40 @@
>
>      m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
>      ok &= !!m_sliceBaseRow;
> +    m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) /
> m_param->maxSlices;
> +    uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
> +    uint32_t rowSum = sliceGroupSizeAccu;
> +    uint32_t sidx = 0;
> +    for (uint32_t i = 0; i < m_numRows; i++)
> +    {
> +        const uint32_t rowRange = (rowSum >> 8);
> +        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
> +        {
> +            rowSum += sliceGroupSizeAccu;
> +            m_sliceBaseRow[++sidx] = i;
> +        }
> +    }
> +    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
> +    m_sliceBaseRow[0] = 0;
> +    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
> +
> +    m_sliceMaxBlockRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
> +    ok &= !!m_sliceMaxBlockRow;
> +    uint32_t maxBlockRows = (m_param->sourceHeight + (16 - 1)) / 16;
> +    sliceGroupSizeAccu = (maxBlockRows << 8) / m_param->maxSlices;
> +    rowSum = sliceGroupSizeAccu;
> +    sidx = 0;
> +    for (uint32_t i = 0; i < maxBlockRows; i++)
> +    {
> +        const uint32_t rowRange = (rowSum >> 8);
> +        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
> +        {
> +            rowSum += sliceGroupSizeAccu;
> +            m_sliceMaxBlockRow[++sidx] = i;
> +        }
> +    }
> +    m_sliceMaxBlockRow[0] = 0;
> +    m_sliceMaxBlockRow[m_param->maxSlices] = maxBlockRows;
>
>      /* determine full motion search range */
>      int range  = m_param->searchRange;       /* fpel search */
> @@ -341,6 +376,8 @@
>      m_completionCount = 0;
>      m_bAllRowsStop = false;
>      m_vbvResetTriggerRow = -1;
> +    m_rowSliceTotalBits[0] = 0;
> +    m_rowSliceTotalBits[1] = 0;
>
>      m_SSDY = m_SSDU = m_SSDV = 0;
>      m_ssim = 0;
> @@ -550,28 +587,13 @@
>
>      /* reset entropy coders and compute slice id */
>      m_entropyCoder.load(m_initSliceContext);
> -    const uint32_t sliceGroupSize = (m_numRows + m_param->maxSlices - 1)
> / m_param->maxSlices;
> -    const uint32_t sliceGroupSizeAccu = (m_numRows << 8) /
> m_param->maxSlices;
> -    m_sliceGroupSize = (uint16_t)sliceGroupSize;
> +
> +    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
> +        for (uint32_t row = m_sliceBaseRow[sliceId]; row <
> m_sliceBaseRow[sliceId + 1]; row++)
> +            m_rows[row].init(m_initSliceContext, sliceId);
>
> -    uint32_t rowSum = sliceGroupSizeAccu;
> -    uint32_t sidx = 0;
> -    for (uint32_t i = 0; i < m_numRows; i++)
> -    {
> -        const uint32_t rowRange = (rowSum >> 8);
> -
> -        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
> -        {
> -            rowSum += sliceGroupSizeAccu;
> -            m_sliceBaseRow[++sidx] = i;
> -        }
> -
> -        m_rows[i].init(m_initSliceContext, sidx);
> -    }
> -    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
> -
> -    m_sliceBaseRow[0] = 0;
> -    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
> +    // reset slice counter for rate control update
> +    m_sliceCnt = 0;
>
>      uint32_t numSubstreams = m_param->bEnableWavefront ?
> slice->m_sps->numCuInHeight : m_param->maxSlices;
>      X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1),
> "Multiple slices without WPP unsupport now!");
> @@ -586,8 +608,10 @@
>                  m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
>      }
>      else
> +    {
>          for (uint32_t i = 0; i < numSubstreams; i++)
>              m_outStreams[i].resetBits();
> +    }
>
>      int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
>
> @@ -697,10 +721,9 @@
>       * compressed in a wave-front pattern if WPP is enabled. Row based
> loop
>       * filters runs behind the CTU compression and reconstruction */
>
> -    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
> -    {
> +    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
>          m_rows[m_sliceBaseRow[sliceId]].active = true;
> -    }
> +
>      if (m_param->bEnableWavefront)
>      {
>          int i = 0;
> @@ -982,9 +1005,8 @@
>              // complete the slice header by writing WPP row-starts
>              m_entropyCoder.setBitstream(&m_bs);
>              if (slice->m_pps->bEntropyCodingSyncEnabled)
> -            {
>                  m_entropyCoder.codeSliceHeaderWPPEntryPoints(
> &m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow - 1),
> maxStreamSize);
> -            }
> +
>              m_bs.writeByteAlignment();
>
>              m_nalList.serialize(slice->m_nalUnitType, m_bs);
> @@ -1270,20 +1292,17 @@
>      const uint32_t lineStartCUAddr = row * numCols;
>      bool bIsVbv = m_param->rc.vbvBufferSize > 0 &&
> m_param->rc.vbvMaxBitrate > 0;
>
> +    const uint32_t sliceId = curRow.sliceId;
>      uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1))
> / 16;
> -    uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1))
> / 16;
>      uint32_t noOfBlocks = m_param->maxCUSize / 16;
>      const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row -
> 1].sliceId != curRow.sliceId)) ? 1 : 0;
>      const uint32_t bLastRowInSlice = ((row == m_numRows - 1) ||
> (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
> -    const uint32_t sliceId = curRow.sliceId;
>      const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
>      const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
>
> -    if (bFirstRowInSlice && !curRow.completed)
> -    {
> -        // Load SBAC coder context from previous row and initialize row
> state.
> -        rowCoder.load(m_initSliceContext);
> -    }
> +    // Load SBAC coder context from previous row and initialize row state.
> +    if (bFirstRowInSlice && !curRow.completed)
> +        rowCoder.load(m_initSliceContext);
>
>      // calculate mean QP for consistent deltaQP signalling calculation
>      if (m_param->bOptCUDeltaQP)
> @@ -1294,15 +1313,12 @@
>              if (m_param->bEnableWavefront || !row)
>              {
>                  double meanQPOff = 0;
> -                uint32_t loopIncr, count = 0;
>                  bool isReferenced = IS_REFERENCED(m_frame);
>                  double *qpoffs = (isReferenced && m_param->rc.cuTree) ?
> m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
>                  if (qpoffs)
>                  {
> -                    if (m_param->rc.qgSize == 8)
> -                        loopIncr = 8;
> -                    else
> -                        loopIncr = 16;
> +                    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 :
> 16;
> +
>                      uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_
> picHeight;
>                      if (m_param->bEnableWavefront)
>                      {
> @@ -1312,6 +1328,7 @@
>
>                      uint32_t qgSize = m_param->rc.qgSize, width =
> m_frame->m_fencPic->m_picWidth;
>                      uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth
> + (loopIncr - 1)) / loopIncr;
> +                    uint32_t count = 0;
>                      for (uint32_t cuY = cuYStart; cuY < height && (cuY <
> m_frame->m_fencPic->m_picHeight); cuY += qgSize)
>                      {
>                          for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
> @@ -1372,16 +1389,16 @@
>                  curRow.bufferedEntropy.copyState(rowCoder);
>                  curRow.bufferedEntropy.loadContexts(rowCoder);
>              }
> -            if (!row && m_vbvResetTriggerRow != intRow)
> +            if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
>              {
>                  curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
>                  curEncData.m_rowStat[row].rowQpScale =
> x265_qp2qScale(curEncData.m_avgQpRc);
>              }
>
>              FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
> -            if (m_param->bEnableWavefront && row >= col && row &&
> m_vbvResetTriggerRow != intRow)
> +            if (m_param->bEnableWavefront && rowInSlice >= col &&
> !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
>                  cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols +
> 1].baseQp;
> -            else if (!m_param->bEnableWavefront && row &&
> m_vbvResetTriggerRow != intRow)
> +            else if (!m_param->bEnableWavefront && !bFirstRowInSlice &&
> m_vbvResetTriggerRow != intRow)
>                  cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
>              else
>                  cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
> @@ -1393,7 +1410,8 @@
>              {
>                  cuStat.vbvCost = 0;
>                  cuStat.intraVbvCost = 0;
> -                for (uint32_t h = 0; h < noOfBlocks && block_y <
> maxBlockRows; h++, block_y++)
> +
> +                for (uint32_t h = 0; h < noOfBlocks && block_y <
> m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
>                  {
>                      uint32_t idx = block_x + (block_y * maxBlockCols);
>
> @@ -1497,10 +1515,8 @@
>                  int shift = 2 * (m_param->maxCUDepth - depth);
>                  int cuSize = m_param->maxCUSize >> depth;
>
> -                if (cuSize == 8)
> -                    curRow.rowStats.intra8x8Cnt +=
> (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
> -                else
> -                    curRow.rowStats.intra8x8Cnt +=
> (int)(frameLog.cntIntra[depth] << shift);
> +                curRow.rowStats.intra8x8Cnt += (cuSize == 8) ?
> (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
> +
>  (int)(frameLog.cntIntra[depth] << shift);
>
>                  curRow.rowStats.inter8x8Cnt +=
> (int)(frameLog.cntInter[depth] << shift);
>                  curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth]
> + frameLog.cntMergeCu[depth]) << shift);
> @@ -1530,12 +1546,13 @@
>          if (bIsVbv)
>          {
>              // Update encoded bits, satdCost, baseQP for each CU if tune
> grain is disabled
> -            if ((m_param->bEnableWavefront && (!cuAddr ||
> !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
> +            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
> +            if ((m_param->bEnableWavefront && ((cuAddr ==
> m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) ||
> !m_param->bEnableWavefront)
>              {
> -                curEncData.m_rowStat[row].rowSatd +=
> curEncData.m_cuStat[cuAddr].vbvCost;
> -                curEncData.m_rowStat[row].rowIntraSatd +=
> curEncData.m_cuStat[cuAddr].intraVbvCost;
> -                curEncData.m_rowStat[row].encodedBits +=
> curEncData.m_cuStat[cuAddr].totalBits;
> -                curEncData.m_rowStat[row].sumQpRc +=
> curEncData.m_cuStat[cuAddr].baseQp;
> +                curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
> +                curEncData.m_rowStat[row].rowIntraSatd +=
> cuStat.intraVbvCost;
> +                curEncData.m_rowStat[row].encodedBits +=
> cuStat.totalBits;
> +                curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
>                  curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
>              }
>
> @@ -1543,7 +1560,7 @@
>              if (!m_param->bEnableWavefront && col == numCols - 1)
>              {
>                  double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
> -                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame,
> row, &m_rce, qpBase);
> +                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame,
> row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
>                  qpBase = x265_clip3((double)m_param->rc.qpMin,
> (double)m_param->rc.qpMax, qpBase);
>                  curEncData.m_rowStat[row].rowQp = qpBase;
>                  curEncData.m_rowStat[row].rowQpScale =
> x265_qp2qScale(qpBase);
> @@ -1569,15 +1586,16 @@
>                  }
>              }
>              // If current block is at row diagonal checkpoint, call vbv
> ratecontrol.
> -            else if (m_param->bEnableWavefront && row == col && row)
> +            else if (m_param->bEnableWavefront && rowInSlice == col &&
> !bFirstRowInSlice)
>              {
>                  if (m_param->rc.bEnableConstVbv)
>                  {
> -                    int32_t startCuAddr = numCols * row;
> -                    int32_t EndCuAddr = startCuAddr + col;
> -                    for (int32_t r = row; r >= 0; r--)
> +                    uint32_t startCuAddr = numCols * row;
> +                    uint32_t EndCuAddr = startCuAddr + col;
> +
> +                    for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId];
> r--)
>                      {
> -                        for (int32_t c = startCuAddr; c <= EndCuAddr && c
> <= (int32_t)numCols * (r + 1) - 1; c++)
> +                        for (uint32_t c = startCuAddr; c <= EndCuAddr &&
> c <= numCols * (r + 1) - 1; c++)
>                          {
>                              curEncData.m_rowStat[r].rowSatd +=
> curEncData.m_cuStat[c].vbvCost;
>                              curEncData.m_rowStat[r].rowIntraSatd +=
> curEncData.m_cuStat[c].intraVbvCost;
> @@ -1590,10 +1608,10 @@
>                      }
>                  }
>                  double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
> -                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame,
> row, &m_rce, qpBase);
> +                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame,
> row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
>                  qpBase = x265_clip3((double)m_param->rc.qpMin,
> (double)m_param->rc.qpMax, qpBase);
>                  curEncData.m_rowStat[row].rowQp = qpBase;
> -                curEncData.m_rowStat[row].rowQpScale =
> x265_qp2qScale(qpBase);
> +                curEncData.m_rowStat[row].rowQpScale =
> x265_qp2qScale(qpBase);
>
>                  if (reEncode < 0)
>                  {
> @@ -1604,7 +1622,7 @@
>                      m_vbvResetTriggerRow = row;
>                      m_bAllRowsStop = true;
>
> -                    for (uint32_t r = m_numRows - 1; r >= row; r--)
> +                    for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r
> >= row; r--)
>                      {
>                          CTURow& stopRow = m_rows[r];
>
> @@ -1686,11 +1704,11 @@
>      /* this row of CTUs has been compressed */
>      if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
>      {
> -        if (row == m_numRows - 1)
> +        if (bLastRowInSlice)
>          {
> -            for (int32_t r = 0; r < (int32_t)m_numRows; r++)
> +            for (uint32_t r = m_sliceBaseRow[sliceId]; r <
> m_sliceBaseRow[sliceId + 1]; r++)
>              {
> -                for (int32_t c = curEncData.m_rowStat[r].numEncodedCUs +
> 1; c < (int32_t)numCols * (r + 1); c++)
> +                for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs
> + 1; c < numCols * (r + 1); c++)
>                  {
>                      curEncData.m_rowStat[r].rowSatd +=
> curEncData.m_cuStat[c].vbvCost;
>                      curEncData.m_rowStat[r].rowIntraSatd +=
> curEncData.m_cuStat[c].intraVbvCost;
> @@ -1708,26 +1726,41 @@
>       * after half the frame is encoded, but after this initial period we
> update
>       * after refLagRows (the number of rows reference frames must have
> completed
>       * before referencees may begin encoding) */
> -    uint32_t rowCount = 0;
>      if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
>      {
> +        uint32_t rowCount = 0;
> +        uint32_t maxRows = m_sliceBaseRow[sliceId + 1] -
> m_sliceBaseRow[sliceId];
>          if (!m_rce.encodeOrder)
> -            rowCount = m_numRows - 1;
> +            rowCount = maxRows - 1;
>          else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum /
> m_param->fpsDenom))
> -            rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
> +            rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
>          else
> -            rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
> -        if (row == rowCount)
> +            rowCount = X265_MIN(m_refLagRows, maxRows - 1);
> +
> +        if (rowInSlice == rowCount / m_param->maxSlices)
>          {
> -            m_rce.rowTotalBits = 0;
> +            m_rowSliceTotalBits[sliceId] = 0;
>              if (bIsVbv)
> -                for (uint32_t i = 0; i < rowCount; i++)
> -                    m_rce.rowTotalBits += curEncData.m_rowStat[i].
> encodedBits;
> +            {
> +                for (uint32_t i = m_sliceBaseRow[sliceId]; i < (rowCount
> / m_param->maxSlices) + m_sliceBaseRow[sliceId]; i++)
> +                    m_rowSliceTotalBits[sliceId] +=
> curEncData.m_rowStat[i].encodedBits;
> +            }
>              else
> -                for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols;
> cuAddr++)
> -                    m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].
> totalBits;
> +            {
> +                uint32_t startAddr = rowCount * numCols * sliceId;
> +                uint32_t finishAddr = startAddr + rowCount * numCols;
> +
> +                for (uint32_t cuAddr = startAddr; cuAddr < finishAddr;
> cuAddr++)
> +                    m_rowSliceTotalBits[sliceId] +=
> curEncData.m_cuStat[cuAddr].totalBits;
> +            }
>
> -            m_top->m_rateControl->rateControlUpdateStats(&m_rce);
> +            if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
> +            {
> +                m_rce.rowTotalBits = 0;
> +                for (uint32_t i = 0; i < m_param->maxSlices; i++)
> +                    m_rce.rowTotalBits += m_rowSliceTotalBits[i];
> +                m_top->m_rateControl->rateControlUpdateStats(&m_rce);
> +            }
>          }
>      }
>
> diff -r 71f700844b0b -r 546387e0b983 source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h     Tue Sep 12 18:13:03 2017 +0530
> +++ b/source/encoder/frameencoder.h     Thu Sep 21 20:20:52 2017 +0530
> @@ -138,6 +138,7 @@
>      volatile bool            m_bAllRowsStop;
>      volatile int             m_completionCount;
>      volatile int             m_vbvResetTriggerRow;
> +    volatile int             m_sliceCnt;
>
>      uint32_t                 m_numRows;
>      uint32_t                 m_numCols;
> @@ -147,8 +148,10 @@
>
>      CTURow*                  m_rows;
>      uint16_t                 m_sliceAddrBits;
> -    uint16_t                 m_sliceGroupSize;
> -    uint32_t*                m_sliceBaseRow;
> +    uint32_t                 m_sliceGroupSize;
> +    uint32_t*                m_sliceBaseRow;
> +    uint32_t*                m_sliceMaxBlockRow;
> +    int64_t                  m_rowSliceTotalBits[2];
>      RateControlEntry         m_rce;
>      SEIDecodedPictureHash    m_seiReconPictureDigest;
>
> diff -r 71f700844b0b -r 546387e0b983 source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp    Tue Sep 12 18:13:03 2017 +0530
> +++ b/source/encoder/ratecontrol.cpp    Thu Sep 21 20:20:52 2017 +0530
> @@ -732,7 +732,6 @@
>      m_bitrate = m_param->rc.bitrate * 1000;
>  }
>
> -
>  void RateControl::initHRD(SPS& sps)
>  {
>      int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
> @@ -765,6 +764,7 @@
>
>      #undef MAX_DURATION
>  }
> +
>  bool RateControl::analyseABR2Pass(uint64_t allAvailableBits)
>  {
>      double rateFactor, stepMult;
> @@ -1473,6 +1473,7 @@
>
>      return q;
>  }
> +
>  double RateControl::countExpectedBits(int startPos, int endPos)
>  {
>      double expectedBits = 0;
> @@ -1484,6 +1485,7 @@
>      }
>      return expectedBits;
>  }
> +
>  bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int
> over, int endPos)
>  {
>      /* find an interval ending on an overflow or underflow (depending on
> whether
> @@ -1531,6 +1533,7 @@
>      }
>      return adjusted;
>  }
> +
>  bool RateControl::cuTreeReadFor2Pass(Frame* frame)
>  {
>      int index = m_encOrder[frame->m_poc];
> @@ -1579,24 +1582,24 @@
>  double RateControl::tuneAbrQScaleFromFeedback(double qScale)
>  {
>      double abrBuffer = 2 * m_rateTolerance * m_bitrate;
> -        /* use framesDone instead of POC as poc count is not serial with
> bframes enabled */
> -        double overflow = 1.0;
> -        double timeDone = (double)(m_framesDone -
> m_param->frameNumThreads + 1) * m_frameDuration;
> -        double wantedBits = timeDone * m_bitrate;
> -        int64_t encodedBits = m_totalBits;
> -        if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
> -        {
> -            abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
> -            encodedBits = m_encodedBits;
> -        }
> +    /* use framesDone instead of POC as poc count is not serial with
> bframes enabled */
> +    double overflow = 1.0;
> +    double timeDone = (double)(m_framesDone - m_param->frameNumThreads +
> 1) * m_frameDuration;
> +    double wantedBits = timeDone * m_bitrate;
> +    int64_t encodedBits = m_totalBits;
> +    if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
> +    {
> +        abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
> +        encodedBits = m_encodedBits;
> +    }
>
> -        if (wantedBits > 0 && encodedBits > 0 &&
> (!m_partialResidualFrames ||
> -            m_param->rc.bStrictCbr || m_isGrainEnabled))
> -        {
> -            abrBuffer *= X265_MAX(1, sqrt(timeDone));
> -            overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits -
> wantedBits) / abrBuffer);
> -            qScale *= overflow;
> -        }
> +    if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames ||
> +        m_param->rc.bStrictCbr || m_isGrainEnabled))
> +    {
> +        abrBuffer *= X265_MAX(1, sqrt(timeDone));
> +        overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) /
> abrBuffer);
> +        qScale *= overflow;
> +    }
>      return qScale;
>  }
>
> @@ -2330,17 +2333,18 @@
>      return totalSatdBits + encodedBitsSoFar;
>  }
>
> -int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row,
> RateControlEntry* rce, double& qpVbv)
> +int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row,
> RateControlEntry* rce, double& qpVbv, uint32_t* m_sliceBaseRow, uint32_t
> sliceId)
>  {
>      FrameData& curEncData = *curFrame->m_encData;
>      double qScaleVbv = x265_qp2qScale(qpVbv);
>      uint64_t rowSatdCost = curEncData.m_rowStat[row].rowSatd;
>      double encodedBits = curEncData.m_rowStat[row].encodedBits;
> +    uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
>
> -    if (m_param->bEnableWavefront && row == 1)
> +    if (m_param->bEnableWavefront && rowInSlice == 1)
>      {
> -        rowSatdCost += curEncData.m_rowStat[0].rowSatd;
> -        encodedBits += curEncData.m_rowStat[0].encodedBits;
> +        rowSatdCost += curEncData.m_rowStat[row - 1].rowSatd;
> +        encodedBits += curEncData.m_rowStat[row - 1].encodedBits;
>      }
>      rowSatdCost >>= X265_DEPTH - 8;
>      updatePredictor(rce->rowPred[0], qScaleVbv, (double)rowSatdCost,
> encodedBits);
> @@ -2350,8 +2354,8 @@
>          if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp)
>          {
>              uint64_t intraRowSatdCost = curEncData.m_rowStat[row].
> rowIntraSatd;
> -            if (m_param->bEnableWavefront && row == 1)
> -                intraRowSatdCost += curEncData.m_rowStat[0].rowIntraSatd;
> +            if (m_param->bEnableWavefront && rowInSlice == 1)
> +                intraRowSatdCost += curEncData.m_rowStat[row -
> 1].rowIntraSatd;
>              intraRowSatdCost >>= X265_DEPTH - 8;
>              updatePredictor(rce->rowPred[1], qScaleVbv,
> (double)intraRowSatdCost, encodedBits);
>          }
> @@ -2376,7 +2380,7 @@
>      const SPS& sps = *curEncData.m_slice->m_sps;
>      double maxFrameError = X265_MAX(0.05, 1.0 / sps.numCuInHeight);
>
> -    if (row < sps.numCuInHeight - 1)
> +    if (row < m_sliceBaseRow[sliceId + 1] - 1)
>      {
>          /* More threads means we have to be more cautious in letting
> ratecontrol use up extra bits. */
>          double rcTol = bufferLeftPlanned / m_param->frameNumThreads *
> m_rateTolerance;
> @@ -2693,8 +2697,8 @@
>              m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
>          if(rce->sliceType != I_SLICE)
>          {
> -        int qp = int (rce->qpaRc + 0.5);
> -        m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ? actualBits
> : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
> +            int qp = int (rce->qpaRc + 0.5);
> +            m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ?
> actualBits : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
>          }
>          curFrame->m_rcData->wantedBitsWindow = m_wantedBitsWindow;
>          curFrame->m_rcData->cplxrSum = m_cplxrSum;
> @@ -2779,7 +2783,8 @@
>              curFrame->m_encData->m_frameStats.percent8x8Skip  * m_ncu) <
> 0)
>              goto writeFailure;
>      }
> -    else{
> +    else
> +    {
>          RPS* rpsWriter = &curFrame->m_encData->m_slice->m_rps;
>          int i, num = rpsWriter->numberOfPictures;
>          char deltaPOC[128];
> diff -r 71f700844b0b -r 546387e0b983 source/encoder/ratecontrol.h
> --- a/source/encoder/ratecontrol.h      Tue Sep 12 18:13:03 2017 +0530
> +++ b/source/encoder/ratecontrol.h      Thu Sep 21 20:20:52 2017 +0530
> @@ -244,7 +244,7 @@
>      int  rateControlStart(Frame* curFrame, RateControlEntry* rce,
> Encoder* enc);
>      void rateControlUpdateStats(RateControlEntry* rce);
>      int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry*
> rce, int *filler);
> -    int  rowVbvRateControl(Frame* curFrame, uint32_t row,
> RateControlEntry* rce, double& qpVbv);
> +    int  rowVbvRateControl(Frame* curFrame, uint32_t row,
> RateControlEntry* rce, double& qpVbv, uint32_t* m_sliceBaseRow, uint32_t
> sliceId);
>      int  rateControlSliceType(int frameNum);
>      bool cuTreeReadFor2Pass(Frame* curFrame);
>      void hrdFullness(SEIBufferingPeriod* sei);
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20170926/57817cd4/attachment-0001.html>
-------------- next part --------------
# HG changeset patch
# User Ashok Kumar Mishra <ashok at multicorewareinc.com>
# Date 1506091858 -19800
#      Fri Sep 22 20:20:58 2017 +0530
# Node ID 0882827c33cccab9aa8622c443c5bbba86d8b482
# Parent  e62b12bd8b4573b15290ebf110e01c8fafce55be
vbv hanging issue; fix for multiple slices
When multiple slices are enabled, VBV rate control must operate on the rows
of the correct slice, because multiple slices are encoded simultaneously.

diff -r e62b12bd8b45 -r 0882827c33cc source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Thu Jun 29 13:13:56 2017 +0530
+++ b/source/encoder/frameencoder.cpp	Fri Sep 22 20:20:58 2017 +0530
@@ -88,6 +88,7 @@
     delete[] m_outStreams;
     delete[] m_backupStreams;
     X265_FREE(m_sliceBaseRow);
+    X265_FREE(m_sliceMaxBlockRow);
     X265_FREE(m_cuGeoms);
     X265_FREE(m_ctuGeomMap);
     X265_FREE(m_substreamSizes);
@@ -118,6 +119,40 @@
 
     m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
     ok &= !!m_sliceBaseRow;
+    m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
+    uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;    
+    uint32_t rowSum = sliceGroupSizeAccu;
+    uint32_t sidx = 0;
+    for (uint32_t i = 0; i < m_numRows; i++)
+    {
+        const uint32_t rowRange = (rowSum >> 8);
+        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
+        {
+            rowSum += sliceGroupSizeAccu;
+            m_sliceBaseRow[++sidx] = i;
+        }        
+    }
+    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
+    m_sliceBaseRow[0] = 0;
+    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
+
+    m_sliceMaxBlockRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
+    ok &= !!m_sliceMaxBlockRow;
+    uint32_t maxBlockRows = (m_param->sourceHeight + (16 - 1)) / 16;
+    sliceGroupSizeAccu = (maxBlockRows << 8) / m_param->maxSlices;
+    rowSum = sliceGroupSizeAccu;
+    sidx = 0;
+    for (uint32_t i = 0; i < maxBlockRows; i++)
+    {
+        const uint32_t rowRange = (rowSum >> 8);
+        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
+        {
+            rowSum += sliceGroupSizeAccu;
+            m_sliceMaxBlockRow[++sidx] = i;
+        }
+    }
+    m_sliceMaxBlockRow[0] = 0;
+    m_sliceMaxBlockRow[m_param->maxSlices] = maxBlockRows;
 
     /* determine full motion search range */
     int range  = m_param->searchRange;       /* fpel search */
@@ -341,6 +376,8 @@
     m_completionCount = 0;
     m_bAllRowsStop = false;
     m_vbvResetTriggerRow = -1;
+    m_rowSliceTotalBits[0] = 0;
+    m_rowSliceTotalBits[1] = 0;
 
     m_SSDY = m_SSDU = m_SSDV = 0;
     m_ssim = 0;
@@ -550,28 +587,13 @@
 
     /* reset entropy coders and compute slice id */
     m_entropyCoder.load(m_initSliceContext);
-    const uint32_t sliceGroupSize = (m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
-    const uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
-    m_sliceGroupSize = (uint16_t)sliceGroupSize;
+	
+    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)   
+        for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
+            m_rows[row].init(m_initSliceContext, sliceId);   
 
-    uint32_t rowSum = sliceGroupSizeAccu;
-    uint32_t sidx = 0;
-    for (uint32_t i = 0; i < m_numRows; i++)
-    {
-        const uint32_t rowRange = (rowSum >> 8);
-
-        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
-        {
-            rowSum += sliceGroupSizeAccu;
-            m_sliceBaseRow[++sidx] = i;
-        }
-
-        m_rows[i].init(m_initSliceContext, sidx);
-    }
-    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
-
-    m_sliceBaseRow[0] = 0;
-    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
+    // reset slice counter for rate control update
+    m_sliceCnt = 0;
 
     uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : m_param->maxSlices;
     X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1), "Multiple slices without WPP unsupport now!");
@@ -586,8 +608,10 @@
                 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
     }
     else
+    {
         for (uint32_t i = 0; i < numSubstreams; i++)
             m_outStreams[i].resetBits();
+    }
 
     int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
 
@@ -697,10 +721,9 @@
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
      * filters runs behind the CTU compression and reconstruction */
 
-    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
-    {
+    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)    
         m_rows[m_sliceBaseRow[sliceId]].active = true;
-    }
+    
     if (m_param->bEnableWavefront)
     {
         int i = 0;
@@ -719,6 +742,7 @@
             }
         }
     }
+
     if (m_param->bEnableWavefront)
     {
         for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
@@ -751,6 +775,7 @@
                             m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
                     }
                 }
+
                 enableRowEncoder(m_row_to_idx[row]); /* clear external dependency for this row */
                 if (!rowInSlice)
                 {
@@ -980,9 +1005,8 @@
             // complete the slice header by writing WPP row-starts
             m_entropyCoder.setBitstream(&m_bs);
             if (slice->m_pps->bEntropyCodingSyncEnabled)
-            {
                 m_entropyCoder.codeSliceHeaderWPPEntryPoints(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow - 1), maxStreamSize);
-            }
+            
             m_bs.writeByteAlignment();
 
             m_nalList.serialize(slice->m_nalUnitType, m_bs);
@@ -1211,17 +1235,21 @@
     int64_t startTime = x265_mdate();
     if (ATOMIC_INC(&m_activeWorkerCount) == 1 && m_stallStartTime)
         m_totalNoWorkerTime += x265_mdate() - m_stallStartTime;
+
     const uint32_t realRow = m_idx_to_row[row >> 1];
     const uint32_t typeNum = m_idx_to_row[row & 1];
+
     if (!typeNum)
         processRowEncoder(realRow, m_tld[threadId]);
     else
     {
         m_frameFilter.processRow(realRow);
+
         // NOTE: Active next row
         if (realRow != m_sliceBaseRow[m_rows[realRow].sliceId + 1] - 1)
             enqueueRowFilter(m_row_to_idx[realRow + 1]);
     }
+
     if (ATOMIC_DEC(&m_activeWorkerCount) == 0)
         m_stallStartTime = x265_mdate();
 
@@ -1264,20 +1292,18 @@
     const uint32_t lineStartCUAddr = row * numCols;
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
 
+    const uint32_t sliceId = curRow.sliceId;
     uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
-    uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
     uint32_t noOfBlocks = m_param->maxCUSize / 16;
     const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0;
     const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
-    const uint32_t sliceId = curRow.sliceId;
     const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
     const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
 
-    if (bFirstRowInSlice && !curRow.completed)
-    {
-        // Load SBAC coder context from previous row and initialize row state.
-        rowCoder.load(m_initSliceContext);
-    }
+    // First row of a slice (not yet completed): load the initial slice SBAC context into the row coder.
+    if (bFirstRowInSlice && !curRow.completed)        
+        rowCoder.load(m_initSliceContext);     
+
     // calculate mean QP for consistent deltaQP signalling calculation
     if (m_param->bOptCUDeltaQP)
     {
@@ -1287,15 +1313,12 @@
             if (m_param->bEnableWavefront || !row)
             {
                 double meanQPOff = 0;
-                uint32_t loopIncr, count = 0;
                 bool isReferenced = IS_REFERENCED(m_frame);
                 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
                 if (qpoffs)
                 {
-                    if (m_param->rc.qgSize == 8)
-                        loopIncr = 8;
-                    else
-                        loopIncr = 16;
+                    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
+
                     uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_picHeight;
                     if (m_param->bEnableWavefront)
                     {
@@ -1305,6 +1328,7 @@
 
                     uint32_t qgSize = m_param->rc.qgSize, width = m_frame->m_fencPic->m_picWidth;
                     uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
+                    uint32_t count = 0;
                     for (uint32_t cuY = cuYStart; cuY < height && (cuY < m_frame->m_fencPic->m_picHeight); cuY += qgSize)
                     {
                         for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
@@ -1336,7 +1360,8 @@
             }
             curRow.avgQPComputed = 1;
         }
-    }
+    }    
+
     // Initialize restrict on MV range in slices
     tld.analysis.m_sliceMinY = -(int16_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
     tld.analysis.m_sliceMaxY = (int16_t)((endRowInSlicePlus1 - 1 - row) * (m_param->maxCUSize * 4) - 4 * 4);
@@ -1364,16 +1389,16 @@
                 curRow.bufferedEntropy.copyState(rowCoder);
                 curRow.bufferedEntropy.loadContexts(rowCoder);
             }
-            if (!row && m_vbvResetTriggerRow != intRow)
+            if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)            
             {
                 curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
             }
 
             FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
-            if (m_param->bEnableWavefront && row >= col && row && m_vbvResetTriggerRow != intRow)
+            if (m_param->bEnableWavefront && rowInSlice >= col && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
                 cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
-            else if (!m_param->bEnableWavefront && row && m_vbvResetTriggerRow != intRow)
+            else if (!m_param->bEnableWavefront && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
                 cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
             else
                 cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
@@ -1385,7 +1410,8 @@
             {
                 cuStat.vbvCost = 0;
                 cuStat.intraVbvCost = 0;
-                for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
+
+                for (uint32_t h = 0; h < noOfBlocks && block_y < m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
                 {
                     uint32_t idx = block_x + (block_y * maxBlockCols);
 
@@ -1433,11 +1459,12 @@
         {
             // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
             if (!bIsVbv)
-            {
+            {                
                 // Delay one row to avoid intra prediction conflict
                 if (m_pool && !bFirstRowInSlice)
-                {
+                {                    
                     int allowCol = col;
+
                     // avoid race condition on last column
                     if (rowInSlice >= 2)
                     {
@@ -1446,11 +1473,13 @@
                     }
                     m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
                 }
+
                 // Last Row may start early
                 if (m_pool && bLastRowInSlice)
                 {
                     // Deblocking last row
                     int allowCol = col;
+
                     // avoid race condition on last column
                     if (rowInSlice >= 2)
                     {
@@ -1472,6 +1501,7 @@
 
         FrameStats frameLog;
         curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
+
         // copy number of intra, inter cu per row into frame stats for 2 pass
         if (m_param->rc.bStatWrite)
         {
@@ -1485,10 +1515,8 @@
                 int shift = 2 * (m_param->maxCUDepth - depth);
                 int cuSize = m_param->maxCUSize >> depth;
 
-                if (cuSize == 8)
-                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
-                else
-                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] << shift);
+                curRow.rowStats.intra8x8Cnt += (cuSize == 8) ? (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
+                                                               (int)(frameLog.cntIntra[depth] << shift);
 
                 curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
                 curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
@@ -1518,12 +1546,13 @@
         if (bIsVbv)
         {   
             // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled
-            if ((m_param->bEnableWavefront && (!cuAddr || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
+            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];    
+            if ((m_param->bEnableWavefront && ((cuAddr == m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
             {
-                curEncData.m_rowStat[row].rowSatd += curEncData.m_cuStat[cuAddr].vbvCost;
-                curEncData.m_rowStat[row].rowIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost;
-                curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits;
-                curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp;
+                curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
+                curEncData.m_rowStat[row].rowIntraSatd += cuStat.intraVbvCost;
+                curEncData.m_rowStat[row].encodedBits += cuStat.totalBits;
+                curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
                 curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
             }
             
@@ -1531,7 +1560,7 @@
             if (!m_param->bEnableWavefront && col == numCols - 1)
             {
                 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
-                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase);
+                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
                 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
                 curEncData.m_rowStat[row].rowQp = qpBase;
                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
@@ -1557,15 +1586,16 @@
                 }
             }
             // If current block is at row diagonal checkpoint, call vbv ratecontrol.
-            else if (m_param->bEnableWavefront && row == col && row)
+            else if (m_param->bEnableWavefront && rowInSlice == col && !bFirstRowInSlice)
             {
                 if (m_param->rc.bEnableConstVbv)
                 {
-                    int32_t startCuAddr = numCols * row;
-                    int32_t EndCuAddr = startCuAddr + col;
-                    for (int32_t r = row; r >= 0; r--)
+                    uint32_t startCuAddr = numCols * row;
+                    uint32_t EndCuAddr = startCuAddr + col;
+
+                    for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId]; r--)
                     {
-                        for (int32_t c = startCuAddr; c <= EndCuAddr && c <= (int32_t)numCols * (r + 1) - 1; c++)
+                        for (uint32_t c = startCuAddr; c <= EndCuAddr && c <= numCols * (r + 1) - 1; c++)
                         {
                             curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
                             curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
@@ -1578,10 +1608,10 @@
                     }
                 }
                 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
-                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase);
+                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
                 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
                 curEncData.m_rowStat[row].rowQp = qpBase;
-                curEncData.m_rowStat[row].rowQpScale =  x265_qp2qScale(qpBase);
+                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
 
                 if (reEncode < 0)
                 {
@@ -1592,7 +1622,7 @@
                     m_vbvResetTriggerRow = row;
                     m_bAllRowsStop = true;
 
-                    for (uint32_t r = m_numRows - 1; r >= row; r--)
+                    for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r >= row; r--)
                     {
                         CTURow& stopRow = m_rows[r];
 
@@ -1670,14 +1700,15 @@
             return;
         }
     }
+
     /* this row of CTUs has been compressed */
     if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
     {
-        if (row == m_numRows - 1)
+        if (bLastRowInSlice)       
         {
-            for (int32_t r = 0; r < (int32_t)m_numRows; r++)
+            for (uint32_t r = m_sliceBaseRow[sliceId]; r < m_sliceBaseRow[sliceId + 1]; r++)
             {
-                for (int32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < (int32_t)numCols * (r + 1); c++)
+                for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < numCols * (r + 1); c++)
                 {
                     curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
                     curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
@@ -1695,26 +1726,41 @@
      * after half the frame is encoded, but after this initial period we update
      * after refLagRows (the number of rows reference frames must have completed
      * before referencees may begin encoding) */
-    uint32_t rowCount = 0;
     if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
     {
+        uint32_t rowCount = 0;
+        uint32_t maxRows = m_sliceBaseRow[sliceId + 1] - m_sliceBaseRow[sliceId];
         if (!m_rce.encodeOrder)
-            rowCount = m_numRows - 1;
+            rowCount = maxRows - 1; 
         else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
-            rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
+            rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
         else
-            rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
-        if (row == rowCount)
+            rowCount = X265_MIN(m_refLagRows, maxRows - 1);
+
+        if (rowInSlice == rowCount / m_param->maxSlices)
         {
-            m_rce.rowTotalBits = 0;
+            m_rowSliceTotalBits[sliceId] = 0;
             if (bIsVbv)
-                for (uint32_t i = 0; i < rowCount; i++)
-                    m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits;
+            {                
+                for (uint32_t i = m_sliceBaseRow[sliceId]; i < (rowCount / m_param->maxSlices) + m_sliceBaseRow[sliceId]; i++)
+                    m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
+            }
             else
-                for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++)
-                    m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits;
+            {
+                uint32_t startAddr = rowCount * numCols * sliceId;
+                uint32_t finishAddr = startAddr + rowCount * numCols;
+                
+                for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
+                    m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
+            }            
 
-            m_top->m_rateControl->rateControlUpdateStats(&m_rce);
+            if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
+            {
+                m_rce.rowTotalBits = 0;
+                for (uint32_t i = 0; i < m_param->maxSlices; i++)
+                    m_rce.rowTotalBits += m_rowSliceTotalBits[i];
+                m_top->m_rateControl->rateControlUpdateStats(&m_rce);
+            }
         }
     }
 
@@ -1742,11 +1788,13 @@
         if (rowInSlice >= m_filterRowDelay)
         {
             enableRowFilter(m_row_to_idx[row - m_filterRowDelay]);
+
             /* NOTE: Activate filter if first row (row 0) */
             if (rowInSlice == m_filterRowDelay)
                 enqueueRowFilter(m_row_to_idx[row - m_filterRowDelay]);
             tryWakeOne();
         }
+
         if (bLastRowInSlice)
         {
             for (uint32_t i = endRowInSlicePlus1 - m_filterRowDelay; i < endRowInSlicePlus1; i++)
diff -r e62b12bd8b45 -r 0882827c33cc source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Thu Jun 29 13:13:56 2017 +0530
+++ b/source/encoder/frameencoder.h	Fri Sep 22 20:20:58 2017 +0530
@@ -138,6 +138,7 @@
     volatile bool            m_bAllRowsStop;
     volatile int             m_completionCount;
     volatile int             m_vbvResetTriggerRow;
+    volatile int             m_sliceCnt;
 
     uint32_t                 m_numRows;
     uint32_t                 m_numCols;
@@ -147,8 +148,10 @@
 
     CTURow*                  m_rows;
     uint16_t                 m_sliceAddrBits;
-    uint16_t                 m_sliceGroupSize;
-    uint32_t*                m_sliceBaseRow;
+    uint32_t                 m_sliceGroupSize;
+    uint32_t*                m_sliceBaseRow;    
+    uint32_t*                m_sliceMaxBlockRow;
+    int64_t                  m_rowSliceTotalBits[2];
     RateControlEntry         m_rce;
     SEIDecodedPictureHash    m_seiReconPictureDigest;
 
diff -r e62b12bd8b45 -r 0882827c33cc source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Thu Jun 29 13:13:56 2017 +0530
+++ b/source/encoder/ratecontrol.cpp	Fri Sep 22 20:20:58 2017 +0530
@@ -732,7 +732,6 @@
     m_bitrate = m_param->rc.bitrate * 1000;
 }
 
-
 void RateControl::initHRD(SPS& sps)
 {
     int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
@@ -765,6 +764,7 @@
 
     #undef MAX_DURATION
 }
+
 bool RateControl::analyseABR2Pass(uint64_t allAvailableBits)
 {
     double rateFactor, stepMult;
@@ -1473,6 +1473,7 @@
 
     return q;
 }
+
 double RateControl::countExpectedBits(int startPos, int endPos)
 {
     double expectedBits = 0;
@@ -1484,6 +1485,7 @@
     }
     return expectedBits;
 }
+
 bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int over, int endPos)
 {
     /* find an interval ending on an overflow or underflow (depending on whether
@@ -1531,6 +1533,7 @@
     }
     return adjusted;
 }
+
 bool RateControl::cuTreeReadFor2Pass(Frame* frame)
 {
     int index = m_encOrder[frame->m_poc];
@@ -1579,24 +1582,24 @@
 double RateControl::tuneAbrQScaleFromFeedback(double qScale)
 {
     double abrBuffer = 2 * m_rateTolerance * m_bitrate;
-        /* use framesDone instead of POC as poc count is not serial with bframes enabled */
-        double overflow = 1.0;
-        double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
-        double wantedBits = timeDone * m_bitrate;
-        int64_t encodedBits = m_totalBits;
-        if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
-        {
-            abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
-            encodedBits = m_encodedBits;
-        }
+    /* use framesDone instead of POC as poc count is not serial with bframes enabled */
+    double overflow = 1.0;
+    double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
+    double wantedBits = timeDone * m_bitrate;
+    int64_t encodedBits = m_totalBits;
+    if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
+    {
+        abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
+        encodedBits = m_encodedBits;
+    }
 
-        if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames || 
-            m_param->rc.bStrictCbr || m_isGrainEnabled))
-        {
-            abrBuffer *= X265_MAX(1, sqrt(timeDone));
-            overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) / abrBuffer);
-            qScale *= overflow;
-        }
+    if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames || 
+        m_param->rc.bStrictCbr || m_isGrainEnabled))
+    {
+        abrBuffer *= X265_MAX(1, sqrt(timeDone));
+        overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) / abrBuffer);
+        qScale *= overflow;
+    }
     return qScale;
 }
 
@@ -2330,17 +2333,18 @@
     return totalSatdBits + encodedBitsSoFar;
 }
 
-int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
+int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv, uint32_t* m_sliceBaseRow, uint32_t sliceId)
 {
     FrameData& curEncData = *curFrame->m_encData;
     double qScaleVbv = x265_qp2qScale(qpVbv);
     uint64_t rowSatdCost = curEncData.m_rowStat[row].rowSatd;
     double encodedBits = curEncData.m_rowStat[row].encodedBits;
+    uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
 
-    if (m_param->bEnableWavefront && row == 1)
+    if (m_param->bEnableWavefront && rowInSlice == 1)
     {
-        rowSatdCost += curEncData.m_rowStat[0].rowSatd;
-        encodedBits += curEncData.m_rowStat[0].encodedBits;
+        rowSatdCost += curEncData.m_rowStat[row - 1].rowSatd;
+        encodedBits += curEncData.m_rowStat[row - 1].encodedBits;
     }
     rowSatdCost >>= X265_DEPTH - 8;
     updatePredictor(rce->rowPred[0], qScaleVbv, (double)rowSatdCost, encodedBits);
@@ -2350,8 +2354,8 @@
         if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp)
         {
             uint64_t intraRowSatdCost = curEncData.m_rowStat[row].rowIntraSatd;
-            if (m_param->bEnableWavefront && row == 1)
-                intraRowSatdCost += curEncData.m_rowStat[0].rowIntraSatd;
+            if (m_param->bEnableWavefront && rowInSlice == 1)
+                intraRowSatdCost += curEncData.m_rowStat[row - 1].rowIntraSatd;
             intraRowSatdCost >>= X265_DEPTH - 8;
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
         }
@@ -2376,7 +2380,7 @@
     const SPS& sps = *curEncData.m_slice->m_sps;
     double maxFrameError = X265_MAX(0.05, 1.0 / sps.numCuInHeight);
 
-    if (row < sps.numCuInHeight - 1)
+    if (row < m_sliceBaseRow[sliceId + 1] - 1)
     {
         /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
         double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_rateTolerance;
@@ -2693,8 +2697,8 @@
             m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
         if(rce->sliceType != I_SLICE)
         {
-        int qp = int (rce->qpaRc + 0.5);
-        m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ? actualBits : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
+            int qp = int (rce->qpaRc + 0.5);
+            m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ? actualBits : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
         }
         curFrame->m_rcData->wantedBitsWindow = m_wantedBitsWindow;
         curFrame->m_rcData->cplxrSum = m_cplxrSum;
@@ -2779,7 +2783,8 @@
             curFrame->m_encData->m_frameStats.percent8x8Skip  * m_ncu) < 0)
             goto writeFailure;
     }
-    else{
+    else
+    {
         RPS* rpsWriter = &curFrame->m_encData->m_slice->m_rps;
         int i, num = rpsWriter->numberOfPictures;
         char deltaPOC[128];
diff -r e62b12bd8b45 -r 0882827c33cc source/encoder/ratecontrol.h
--- a/source/encoder/ratecontrol.h	Thu Jun 29 13:13:56 2017 +0530
+++ b/source/encoder/ratecontrol.h	Fri Sep 22 20:20:58 2017 +0530
@@ -244,7 +244,7 @@
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
     void rateControlUpdateStats(RateControlEntry* rce);
     int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, int *filler);
-    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
+    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv, uint32_t* m_sliceBaseRow, uint32_t sliceId);
     int  rateControlSliceType(int frameNum);
     bool cuTreeReadFor2Pass(Frame* curFrame);
     void hrdFullness(SEIBufferingPeriod* sei);


More information about the x265-devel mailing list