[x265] [PATCH] search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]
dave
dtyx265 at gmail.com
Fri Sep 12 03:16:08 CEST 2014
On 09/11/2014 04:37 AM, Steve Borho wrote:
> # HG changeset patch
> # User Steve Borho <steve at borho.org>
> # Date 1410345355 -7200
> # Wed Sep 10 12:35:55 2014 +0200
> # Node ID 2fb709fbf2d85caae68db9dd6574ba3e6f52d99f
> # Parent 012f315d3eda8044f5a49865e15ba2943fbab094
> search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]
>
> All presets use this function to encode I slices, so every encode is affected.
>
> Previous behavior:
> RD measure the top N least sa8d cost intra modes plus all most probable modes,
> where N was depth-based: intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, etc
>
> New behavior:
> RD measure up to MAX_RD_INTRA_MODES (6) modes that are within 25% of best sa8d
> cost or are most probable.
I have been testing a new version of xUpdateCandList that only adds a
mode if it meets a certain percentage threshold. Based on my testing,
or at least on the one video I was testing with, at 25% you will get 6
modes the majority of the time. You may need to use a threshold under 5%
if you want to occasionally get fewer than 6 candidates. I was planning
to submit my patch for this today, and I still can if you would like to
take a look at it.
>
> The new behavior may measure fewer than 6 modes, which the old behavior
> definitely would not, and the new behavior may skip some most-probable modes
> if there are plenty of other modes which are near the best cost. Since mode
> signal cost is included already, this seems ok.
>
> The general idea is that if 1-2 modes have much better sa8d cost than all the
> others, then we are likely wasting our time RD measuring 8-11 modes. We're
> betting that sa8d cost is a somewhat decent predictor of RD cost.
>
> Note that I initially tried without a limit (measure all within 25% or MPM) but
> for some clips this was a horrible perf trade-off. In some situations all the
> intra modes might measure close together (flat source block) and we would end
> up measuring most or all of the intra modes for very little gain. So this
> version re-introduces a "top N candidate list" but does not bother trying to
> keep the list sorted since it is small.
>
> diff -r 012f315d3eda -r 2fb709fbf2d8 source/Lib/TLibCommon/CommonDef.h
> --- a/source/Lib/TLibCommon/CommonDef.h Wed Sep 10 17:27:20 2014 +0200
> +++ b/source/Lib/TLibCommon/CommonDef.h Wed Sep 10 12:35:55 2014 +0200
> @@ -73,8 +73,6 @@
> #define SCAN_SET_SIZE 16
> #define LOG2_SCAN_SET_SIZE 4
>
> -#define FAST_UDI_MAX_RDMODE_NUM 35 // maximum number of RD comparison in fast-UDI estimation loop
> -
> #define ALL_IDX -1
> #define PLANAR_IDX 0
> #define VER_IDX 26 // index for intra VERTICAL mode
> diff -r 012f315d3eda -r 2fb709fbf2d8 source/encoder/search.cpp
> --- a/source/encoder/search.cpp Wed Sep 10 17:27:20 2014 +0200
> +++ b/source/encoder/search.cpp Wed Sep 10 12:35:55 2014 +0200
> @@ -1281,12 +1281,9 @@
> uint32_t log2TrSize = cu->getLog2CUSize(0) - initTrDepth;
> uint32_t tuSize = 1 << log2TrSize;
> uint32_t qNumParts = cu->getTotalNumPart() >> 2;
> - uint32_t overallDistY = 0;
> - uint32_t candNum;
> - uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
> + uint32_t totalDist = 0;
> uint32_t sizeIdx = log2TrSize - 2;
> - uint32_t partOffset = 0;
> - static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
> + uint32_t partOffset = 0;
>
> // loop over partitions
> for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
> @@ -1297,13 +1294,6 @@
> // determine set of modes to be tested (using prediction signal only)
> pixel* fenc = fencYuv->getLumaAddr(partOffset);
> uint32_t stride = predYuv->getStride();
> - uint32_t rdModeList[FAST_UDI_MAX_RDMODE_NUM];
> - int numModesForFullRD = intraModeNumFast[sizeIdx];
> -
> - for (int i = 0; i < numModesForFullRD; i++)
> - candCostList[i] = MAX_INT64;
> -
> - uint64_t modeCosts[35];
>
> pixel *above = m_refAbove + tuSize - 1;
> pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
> @@ -1348,18 +1338,20 @@
> }
>
> uint32_t preds[3];
> - int numMpm = cu->getIntraDirLumaPredictor(partOffset, preds);
> + cu->getIntraDirLumaPredictor(partOffset, preds);
>
> uint64_t mpms;
> uint32_t rbits = getIntraRemModeBits(cu, partOffset, depth, preds, mpms);
>
> pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
> + uint64_t modeCosts[35];
> + uint64_t bcost;
>
> // DC
> primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? getIntraModeBits(cu, DC_IDX, partOffset, depth) : rbits;
> uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
> - modeCosts[DC_IDX] = m_rdCost.calcRdSADCost(sad, bits);
> + modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
>
> // PLANAR
> pixel *abovePlanar = above;
> @@ -1373,6 +1365,7 @@
> bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? getIntraModeBits(cu, PLANAR_IDX, partOffset, depth) : rbits;
> sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
> modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
> + COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
>
> // angular predictions
> primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
> @@ -1386,69 +1379,42 @@
> bits = (mpms & ((uint64_t)1 << mode)) ? getIntraModeBits(cu, mode, partOffset, depth) : rbits;
> sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
> modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
> + COPY1_IF_LT(bcost, modeCosts[mode]);
> }
>
> - // Find N least cost modes. N = numModesForFullRD
> - candNum = 0;
> + uint64_t candCostList[MAX_RD_INTRA_MODES];
> + uint32_t rdModeList[MAX_RD_INTRA_MODES];
> + for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
> + candCostList[i] = MAX_INT64;
> +
> + /* Find the top MAX_RD_INTRA_MODES modes with cost within 25% of best
> + * or among the most probable modes */
> + uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
> for (int mode = 0; mode < 35; mode++)
> - candNum += xUpdateCandList(mode, modeCosts[mode], numModesForFullRD, rdModeList, candCostList);
> -
> - for (int j = 0; j < numMpm; j++)
> + if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
> + updateCandList(mode, modeCosts[mode], rdModeList, candCostList);
> +
> + /* measure modes within 25% of best SAD RD-COST using simple RDO (no TU splits) */
> + uint32_t bmode = 0;
> + uint64_t cost;
> + bcost = MAX_INT64;
> + for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
> {
> - bool mostProbableModeIncluded = false;
> - uint32_t mostProbableMode = preds[j];
> -
> - for (int i = 0; i < numModesForFullRD; i++)
> - {
> - if (mostProbableMode == rdModeList[i])
> - {
> - mostProbableModeIncluded = true;
> - break;
> - }
> - }
> -
> - if (!mostProbableModeIncluded)
> - rdModeList[numModesForFullRD++] = mostProbableMode;
> + if (candCostList[i] == MAX_INT64)
> + break;
> + m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> + cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
> + cost = 0;
> + xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, depthRange);
> + COPY2_IF_LT(bcost, cost, bmode, rdModeList[i]);
> }
>
> - // check modes (using r-d costs)
> - uint32_t bestPUMode = 0;
> - uint32_t bestPUDistY = 0;
> - uint64_t bestPUCost = MAX_INT64;
> - uint32_t puDistY;
> - uint64_t puCost;
> - for (int mode = 0; mode < numModesForFullRD; mode++)
> - {
> - // set luma prediction mode
> - cu->setLumaIntraDirSubParts(rdModeList[mode], partOffset, depth + initTrDepth);
> -
> - // set context models
> - m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> -
> - // determine residual for partition
> - puCost = 0;
> - puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, puCost, depthRange);
> -
> - // check r-d cost
> - if (puCost < bestPUCost)
> - {
> - bestPUMode = rdModeList[mode];
> - bestPUDistY = puDistY;
> - bestPUCost = puCost;
> - }
> - }
> -
> /* remeasure best mode, allowing TU splits */
> - cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
> -
> - // set context models
> + cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
> +
> m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>
> - // determine residual for partition
> - puCost = 0;
> - puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, depthRange);
> -
> - overallDistY += (puCost >= bestPUCost) ? bestPUDistY : puDistY;
> + totalDist += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, depthRange);
>
> xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
>
> @@ -1475,11 +1441,11 @@
> cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
> }
>
> - // reset context models (TODO: caller should do this)
> + // reset context models
> m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>
> // set distortion (rate and r-d costs are determined later)
> - cu->m_totalDistortion = overallDistY;
> + cu->m_totalDistortion = totalDist;
>
> x265_emms();
> }
> @@ -3486,28 +3452,27 @@
> return getIntraModeBits(cu, mode, partOffset, depth);
> }
>
> -uint32_t Search::xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList)
> +/* swap the current mode/cost with the mode with the highest cost in the
> + * current candidate list, if its cost is better (maintain a top N list) */
> +void Search::updateCandList(uint32_t mode, uint64_t cost, uint32_t* candModeList, uint64_t* candCostList)
> {
> - uint32_t i;
> - uint32_t shift = 0;
> -
> - while (shift < fastCandNum && cost < CandCostList[fastCandNum - 1 - shift])
> - shift++;
> -
> - if (shift != 0)
> + uint32_t maxIndex = 0;
> + uint64_t maxValue = 0;
> +
> + for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
> {
> - for (i = 1; i < shift; i++)
> + if (maxValue < candCostList[i])
> {
> - CandModeList[fastCandNum - i] = CandModeList[fastCandNum - 1 - i];
> - CandCostList[fastCandNum - i] = CandCostList[fastCandNum - 1 - i];
> + maxValue = candCostList[i];
> + maxIndex = i;
> }
> -
> - CandModeList[fastCandNum - shift] = mode;
> - CandCostList[fastCandNum - shift] = cost;
> - return 1;
> }
>
> - return 0;
> + if (cost < maxValue)
> + {
> + candCostList[maxIndex] = cost;
> + candModeList[maxIndex] = mode;
> + }
> }
>
> /* add inter-prediction syntax elements for a CU block */
> diff -r 012f315d3eda -r 2fb709fbf2d8 source/encoder/search.h
> --- a/source/encoder/search.h Wed Sep 10 17:27:20 2014 +0200
> +++ b/source/encoder/search.h Wed Sep 10 12:35:55 2014 +0200
> @@ -172,10 +172,12 @@
> void checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost);
> void getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
> uint32_t getInterSymbolBits(TComDataCU* cu, uint32_t depthRange[2]);
> - uint32_t xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList);
>
> uint32_t xMergeEstimation(TComDataCU* cu, int partIdx, MergeData& m);
> void xSetSearchRange(TComDataCU* cu, MV mvp, int merange, MV& mvmin, MV& mvmax);
> +
> + enum { MAX_RD_INTRA_MODES = 6 };
> + void updateCandList(uint32_t mode, uint64_t cost, uint32_t* candModeList, uint64_t* candCostList);
> };
> }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
More information about the x265-devel
mailing list