[x265] [PATCH] search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]

dave dtyx265 at gmail.com
Fri Sep 12 03:16:08 CEST 2014


On 09/11/2014 04:37 AM, Steve Borho wrote:
> # HG changeset patch
> # User Steve Borho <steve at borho.org>
> # Date 1410345355 -7200
> #      Wed Sep 10 12:35:55 2014 +0200
> # Node ID 2fb709fbf2d85caae68db9dd6574ba3e6f52d99f
> # Parent  012f315d3eda8044f5a49865e15ba2943fbab094
> search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]
>
> all presets use this function to encode I slices, so every encode is affected.
>
> Previous behavior:
>    RD measure top N least sa8d cost intra modes and all most probable modes where
>    N was depth-based: intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, etc
>
> New behavior:
>    RD measure up to MAX_RD_INTRA_MODES (6) modes that are within 25% of best sa8d
>    cost or are most probable.
I have been testing a new version of xUpdateCandList that only adds a 
mode if it meets a certain percentage threshold.  Based on my testing, 
or at least on the one video I was testing with, at 25% you will get 6 
modes the majority of the time.  You may need to use something under 5% 
if you want to occasionally get fewer than 6 candidates.  I would have 
submitted my patch for this today and still can if you would like to 
take a look at it.
>
> The new behavior may measure less than 6 modes, the old behavior definitely
> would not, and the new behavior may skip some most-probable modes if there
> are plenty of other modes which are near the best cost.  Since mode signal cost
> is included already, this seems ok.
>
> The general idea is that if 1-2 modes have much better sa8d cost than all the
> others, then we are likely wasting our time RD measuring 8-11 modes. We're
> betting that sa8d cost is a somewhat decent predictor of RD cost.
>
> Note that I initially tried without a limit (measure all within 25% or MPM) but
> for some clips this was a horrible perf trade-off. In some situations all the
> intra modes might measure close together (flat source block) and we would end
> up measuring most or all of the intra modes for very little gain. So this
> version re-introduces a "top N candidate list" but does not bother trying to
> keep the list sorted since it is small
>
> diff -r 012f315d3eda -r 2fb709fbf2d8 source/Lib/TLibCommon/CommonDef.h
> --- a/source/Lib/TLibCommon/CommonDef.h	Wed Sep 10 17:27:20 2014 +0200
> +++ b/source/Lib/TLibCommon/CommonDef.h	Wed Sep 10 12:35:55 2014 +0200
> @@ -73,8 +73,6 @@
>   #define SCAN_SET_SIZE               16
>   #define LOG2_SCAN_SET_SIZE          4
>   
> -#define FAST_UDI_MAX_RDMODE_NUM     35 // maximum number of RD comparison in fast-UDI estimation loop
> -
>   #define ALL_IDX                     -1
>   #define PLANAR_IDX                  0
>   #define VER_IDX                     26 // index for intra VERTICAL   mode
> diff -r 012f315d3eda -r 2fb709fbf2d8 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Wed Sep 10 17:27:20 2014 +0200
> +++ b/source/encoder/search.cpp	Wed Sep 10 12:35:55 2014 +0200
> @@ -1281,12 +1281,9 @@
>       uint32_t log2TrSize   = cu->getLog2CUSize(0) - initTrDepth;
>       uint32_t tuSize       = 1 << log2TrSize;
>       uint32_t qNumParts    = cu->getTotalNumPart() >> 2;
> -    uint32_t overallDistY = 0;
> -    uint32_t candNum;
> -    uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
> +    uint32_t totalDist    = 0;
>       uint32_t sizeIdx      = log2TrSize - 2;
> -    uint32_t partOffset = 0;
> -    static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
> +    uint32_t partOffset   = 0;
>   
>       // loop over partitions
>       for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
> @@ -1297,13 +1294,6 @@
>           // determine set of modes to be tested (using prediction signal only)
>           pixel*   fenc   = fencYuv->getLumaAddr(partOffset);
>           uint32_t stride = predYuv->getStride();
> -        uint32_t rdModeList[FAST_UDI_MAX_RDMODE_NUM];
> -        int numModesForFullRD = intraModeNumFast[sizeIdx];
> -
> -        for (int i = 0; i < numModesForFullRD; i++)
> -            candCostList[i] = MAX_INT64;
> -
> -        uint64_t modeCosts[35];
>   
>           pixel *above         = m_refAbove    + tuSize - 1;
>           pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
> @@ -1348,18 +1338,20 @@
>           }
>   
>           uint32_t preds[3];
> -        int numMpm = cu->getIntraDirLumaPredictor(partOffset, preds);
> +        cu->getIntraDirLumaPredictor(partOffset, preds);
>           
>           uint64_t mpms;
>           uint32_t rbits = getIntraRemModeBits(cu, partOffset, depth, preds, mpms);
>   
>           pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
> +        uint64_t modeCosts[35];
> +        uint64_t bcost;
>   
>           // DC
>           primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
>           uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? getIntraModeBits(cu, DC_IDX, partOffset, depth) : rbits;
>           uint32_t sad  = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
> -        modeCosts[DC_IDX] = m_rdCost.calcRdSADCost(sad, bits);
> +        modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
>   
>           // PLANAR
>           pixel *abovePlanar = above;
> @@ -1373,6 +1365,7 @@
>           bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? getIntraModeBits(cu, PLANAR_IDX, partOffset, depth) : rbits;
>           sad  = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
>           modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
> +        COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
>   
>           // angular predictions
>           primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
> @@ -1386,69 +1379,42 @@
>               bits = (mpms & ((uint64_t)1 << mode)) ? getIntraModeBits(cu, mode, partOffset, depth) : rbits;
>               sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
>               modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
> +            COPY1_IF_LT(bcost, modeCosts[mode]);
>           }
>   
> -        // Find N least cost modes. N = numModesForFullRD
> -        candNum = 0;
> +        uint64_t candCostList[MAX_RD_INTRA_MODES];
> +        uint32_t rdModeList[MAX_RD_INTRA_MODES];
> +        for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
> +            candCostList[i] = MAX_INT64;
> +
> +        /* Find the top MAX_RD_INTRA_MODES modes with cost within 25% of best
> +         * or among the most probable modes */
> +        uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
>           for (int mode = 0; mode < 35; mode++)
> -            candNum += xUpdateCandList(mode, modeCosts[mode], numModesForFullRD, rdModeList, candCostList);
> -
> -        for (int j = 0; j < numMpm; j++)
> +            if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
> +                updateCandList(mode, modeCosts[mode], rdModeList, candCostList);
> +
> +        /* measure modes with 25% of best SAD RD-COST using simple RDO (no TU splits) */
> +        uint32_t bmode = 0;
> +        uint64_t cost;
> +        bcost = MAX_INT64;
> +        for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
>           {
> -            bool mostProbableModeIncluded = false;
> -            uint32_t mostProbableMode = preds[j];
> -
> -            for (int i = 0; i < numModesForFullRD; i++)
> -            {
> -                if (mostProbableMode == rdModeList[i])
> -                {
> -                    mostProbableModeIncluded = true;
> -                    break;
> -                }
> -            }
> -
> -            if (!mostProbableModeIncluded)
> -                rdModeList[numModesForFullRD++] = mostProbableMode;
> +            if (candCostList[i] == MAX_INT64)
> +                break;
> +            m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> +            cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
> +            cost = 0;
> +            xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, depthRange);
> +            COPY2_IF_LT(bcost, cost, bmode, rdModeList[i]);
>           }
>   
> -        // check modes (using r-d costs)
> -        uint32_t bestPUMode  = 0;
> -        uint32_t bestPUDistY = 0;
> -        uint64_t bestPUCost  = MAX_INT64;
> -        uint32_t puDistY;
> -        uint64_t puCost;
> -        for (int mode = 0; mode < numModesForFullRD; mode++)
> -        {
> -            // set luma prediction mode
> -            cu->setLumaIntraDirSubParts(rdModeList[mode], partOffset, depth + initTrDepth);
> -
> -            // set context models
> -            m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> -
> -            // determine residual for partition
> -            puCost = 0;
> -            puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, puCost, depthRange);
> -
> -            // check r-d cost
> -            if (puCost < bestPUCost)
> -            {
> -                bestPUMode  = rdModeList[mode];
> -                bestPUDistY = puDistY;
> -                bestPUCost  = puCost;
> -            }
> -        }
> -
>           /* remeasure best mode, allowing TU splits */
> -        cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
> -
> -        // set context models
> +        cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
> +
>           m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>   
> -        // determine residual for partition
> -        puCost = 0;
> -        puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, depthRange);
> -
> -        overallDistY += (puCost >= bestPUCost) ? bestPUDistY : puDistY;
> +        totalDist += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, depthRange);
>   
>           xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
>   
> @@ -1475,11 +1441,11 @@
>               cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
>       }
>   
> -    // reset context models (TODO: caller should do this)
> +    // reset context models
>       m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>   
>       // set distortion (rate and r-d costs are determined later)
> -    cu->m_totalDistortion = overallDistY;
> +    cu->m_totalDistortion = totalDist;
>   
>       x265_emms();
>   }
> @@ -3486,28 +3452,27 @@
>       return getIntraModeBits(cu, mode, partOffset, depth);
>   }
>   
> -uint32_t Search::xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList)
> +/* swap the current mode/cost with the mode with the highest cost in the
> + * current candidcate list, if its cost is better (maintain a top N list) */
> +void Search::updateCandList(uint32_t mode, uint64_t cost, uint32_t* candModeList, uint64_t* candCostList)
>   {
> -    uint32_t i;
> -    uint32_t shift = 0;
> -
> -    while (shift < fastCandNum && cost < CandCostList[fastCandNum - 1 - shift])
> -        shift++;
> -
> -    if (shift != 0)
> +    uint32_t maxIndex = 0;
> +    uint64_t maxValue = 0;
> +
> +    for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
>       {
> -        for (i = 1; i < shift; i++)
> +        if (maxValue < candCostList[i])
>           {
> -            CandModeList[fastCandNum - i] = CandModeList[fastCandNum - 1 - i];
> -            CandCostList[fastCandNum - i] = CandCostList[fastCandNum - 1 - i];
> +            maxValue = candCostList[i];
> +            maxIndex = i;
>           }
> -
> -        CandModeList[fastCandNum - shift] = mode;
> -        CandCostList[fastCandNum - shift] = cost;
> -        return 1;
>       }
>   
> -    return 0;
> +    if (cost < maxValue)
> +    {
> +        candCostList[maxIndex] = cost;
> +        candModeList[maxIndex] = mode;
> +    }
>   }
>   
>   /* add inter-prediction syntax elements for a CU block */
> diff -r 012f315d3eda -r 2fb709fbf2d8 source/encoder/search.h
> --- a/source/encoder/search.h	Wed Sep 10 17:27:20 2014 +0200
> +++ b/source/encoder/search.h	Wed Sep 10 12:35:55 2014 +0200
> @@ -172,10 +172,12 @@
>       void     checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost);
>       void     getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
>       uint32_t getInterSymbolBits(TComDataCU* cu, uint32_t depthRange[2]);
> -    uint32_t xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList);
>   
>       uint32_t xMergeEstimation(TComDataCU* cu, int partIdx, MergeData& m);
>       void     xSetSearchRange(TComDataCU* cu, MV mvp, int merange, MV& mvmin, MV& mvmax);
> +
> +    enum { MAX_RD_INTRA_MODES = 6 };
> +    void     updateCandList(uint32_t mode, uint64_t cost, uint32_t* candModeList, uint64_t* candCostList);
>   };
>   }
>   
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel



More information about the x265-devel mailing list