[x265] [PATCH] search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]

Steve Borho steve at borho.org
Thu Sep 11 13:37:18 CEST 2014


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1410345355 -7200
#      Wed Sep 10 12:35:55 2014 +0200
# Node ID 2fb709fbf2d85caae68db9dd6574ba3e6f52d99f
# Parent  012f315d3eda8044f5a49865e15ba2943fbab094
search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]

all presets use this function to encode I slices, so every encode is affected.

Previous behavior:
  RD measure top N least sa8d cost intra modes and all most probable modes where
  N was depth-based: intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, etc

New behavior:
  RD measure up to MAX_RD_INTRA_MODES (6) modes that are within 25% of best sa8d
  cost or are most probable.

The new behavior may measure less than 6 modes, the old behavior definitely
would not, and the new behavior may skip some most-probable modes if there
are plenty of other modes which are near the best cost.  Since mode signal cost
is included already, this seems ok.

The general idea is that if 1-2 modes have much better sa8d cost than all the
others, then we are likely wasting our time RD measuring 8-11 modes. We're
betting that sa8d cost is a somewhat decent predictor of RD cost.

Note that I initially tried without a limit (measure all within 25% or MPM) but
for some clips this was a horrible perf trade-off. In some situations all the
intra modes might measure close together (flat source block) and we would end
up measuring most or all of the intra modes for very little gain. So this
version re-introduces a "top N candidate list" but does not bother trying to
keep the list sorted since it is small

diff -r 012f315d3eda -r 2fb709fbf2d8 source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h	Wed Sep 10 17:27:20 2014 +0200
+++ b/source/Lib/TLibCommon/CommonDef.h	Wed Sep 10 12:35:55 2014 +0200
@@ -73,8 +73,6 @@
 #define SCAN_SET_SIZE               16
 #define LOG2_SCAN_SET_SIZE          4
 
-#define FAST_UDI_MAX_RDMODE_NUM     35 // maximum number of RD comparison in fast-UDI estimation loop
-
 #define ALL_IDX                     -1
 #define PLANAR_IDX                  0
 #define VER_IDX                     26 // index for intra VERTICAL   mode
diff -r 012f315d3eda -r 2fb709fbf2d8 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Sep 10 17:27:20 2014 +0200
+++ b/source/encoder/search.cpp	Wed Sep 10 12:35:55 2014 +0200
@@ -1281,12 +1281,9 @@
     uint32_t log2TrSize   = cu->getLog2CUSize(0) - initTrDepth;
     uint32_t tuSize       = 1 << log2TrSize;
     uint32_t qNumParts    = cu->getTotalNumPart() >> 2;
-    uint32_t overallDistY = 0;
-    uint32_t candNum;
-    uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
+    uint32_t totalDist    = 0;
     uint32_t sizeIdx      = log2TrSize - 2;
-    uint32_t partOffset = 0;
-    static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
+    uint32_t partOffset   = 0;
 
     // loop over partitions
     for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
@@ -1297,13 +1294,6 @@
         // determine set of modes to be tested (using prediction signal only)
         pixel*   fenc   = fencYuv->getLumaAddr(partOffset);
         uint32_t stride = predYuv->getStride();
-        uint32_t rdModeList[FAST_UDI_MAX_RDMODE_NUM];
-        int numModesForFullRD = intraModeNumFast[sizeIdx];
-
-        for (int i = 0; i < numModesForFullRD; i++)
-            candCostList[i] = MAX_INT64;
-
-        uint64_t modeCosts[35];
 
         pixel *above         = m_refAbove    + tuSize - 1;
         pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
@@ -1348,18 +1338,20 @@
         }
 
         uint32_t preds[3];
-        int numMpm = cu->getIntraDirLumaPredictor(partOffset, preds);
+        cu->getIntraDirLumaPredictor(partOffset, preds);
         
         uint64_t mpms;
         uint32_t rbits = getIntraRemModeBits(cu, partOffset, depth, preds, mpms);
 
         pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+        uint64_t modeCosts[35];
+        uint64_t bcost;
 
         // DC
         primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
         uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? getIntraModeBits(cu, DC_IDX, partOffset, depth) : rbits;
         uint32_t sad  = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
-        modeCosts[DC_IDX] = m_rdCost.calcRdSADCost(sad, bits);
+        modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
 
         // PLANAR
         pixel *abovePlanar = above;
@@ -1373,6 +1365,7 @@
         bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? getIntraModeBits(cu, PLANAR_IDX, partOffset, depth) : rbits;
         sad  = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
         modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
+        COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
 
         // angular predictions
         primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
@@ -1386,69 +1379,42 @@
             bits = (mpms & ((uint64_t)1 << mode)) ? getIntraModeBits(cu, mode, partOffset, depth) : rbits;
             sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
             modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+            COPY1_IF_LT(bcost, modeCosts[mode]);
         }
 
-        // Find N least cost modes. N = numModesForFullRD
-        candNum = 0;
+        uint64_t candCostList[MAX_RD_INTRA_MODES];
+        uint32_t rdModeList[MAX_RD_INTRA_MODES];
+        for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
+            candCostList[i] = MAX_INT64;
+
+        /* Find the top MAX_RD_INTRA_MODES modes with cost within 25% of best
+         * or among the most probable modes */
+        uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
         for (int mode = 0; mode < 35; mode++)
-            candNum += xUpdateCandList(mode, modeCosts[mode], numModesForFullRD, rdModeList, candCostList);
-
-        for (int j = 0; j < numMpm; j++)
+            if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
+                updateCandList(mode, modeCosts[mode], rdModeList, candCostList);
+
+        /* measure modes with 25% of best SAD RD-COST using simple RDO (no TU splits) */
+        uint32_t bmode = 0;
+        uint64_t cost;
+        bcost = MAX_INT64;
+        for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
         {
-            bool mostProbableModeIncluded = false;
-            uint32_t mostProbableMode = preds[j];
-
-            for (int i = 0; i < numModesForFullRD; i++)
-            {
-                if (mostProbableMode == rdModeList[i])
-                {
-                    mostProbableModeIncluded = true;
-                    break;
-                }
-            }
-
-            if (!mostProbableModeIncluded)
-                rdModeList[numModesForFullRD++] = mostProbableMode;
+            if (candCostList[i] == MAX_INT64)
+                break;
+            m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+            cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
+            cost = 0;
+            xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, depthRange);
+            COPY2_IF_LT(bcost, cost, bmode, rdModeList[i]);
         }
 
-        // check modes (using r-d costs)
-        uint32_t bestPUMode  = 0;
-        uint32_t bestPUDistY = 0;
-        uint64_t bestPUCost  = MAX_INT64;
-        uint32_t puDistY;
-        uint64_t puCost;
-        for (int mode = 0; mode < numModesForFullRD; mode++)
-        {
-            // set luma prediction mode
-            cu->setLumaIntraDirSubParts(rdModeList[mode], partOffset, depth + initTrDepth);
-
-            // set context models
-            m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
-
-            // determine residual for partition
-            puCost = 0;
-            puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, puCost, depthRange);
-
-            // check r-d cost
-            if (puCost < bestPUCost)
-            {
-                bestPUMode  = rdModeList[mode];
-                bestPUDistY = puDistY;
-                bestPUCost  = puCost;
-            }
-        }
-
         /* remeasure best mode, allowing TU splits */
-        cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
-
-        // set context models
+        cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
+
         m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
 
-        // determine residual for partition
-        puCost = 0;
-        puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, depthRange);
-
-        overallDistY += (puCost >= bestPUCost) ? bestPUDistY : puDistY;
+        totalDist += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, depthRange);
 
         xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
 
@@ -1475,11 +1441,11 @@
             cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
     }
 
-    // reset context models (TODO: caller should do this)
+    // reset context models
     m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
 
     // set distortion (rate and r-d costs are determined later)
-    cu->m_totalDistortion = overallDistY;
+    cu->m_totalDistortion = totalDist;
 
     x265_emms();
 }
@@ -3486,28 +3452,27 @@
     return getIntraModeBits(cu, mode, partOffset, depth);
 }
 
-uint32_t Search::xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList)
+/* swap the current mode/cost with the mode with the highest cost in the
+ * current candidcate list, if its cost is better (maintain a top N list) */
+void Search::updateCandList(uint32_t mode, uint64_t cost, uint32_t* candModeList, uint64_t* candCostList)
 {
-    uint32_t i;
-    uint32_t shift = 0;
-
-    while (shift < fastCandNum && cost < CandCostList[fastCandNum - 1 - shift])
-        shift++;
-
-    if (shift != 0)
+    uint32_t maxIndex = 0;
+    uint64_t maxValue = 0;
+
+    for (int i = 0; i < MAX_RD_INTRA_MODES; i++)
     {
-        for (i = 1; i < shift; i++)
+        if (maxValue < candCostList[i])
         {
-            CandModeList[fastCandNum - i] = CandModeList[fastCandNum - 1 - i];
-            CandCostList[fastCandNum - i] = CandCostList[fastCandNum - 1 - i];
+            maxValue = candCostList[i];
+            maxIndex = i;
         }
-
-        CandModeList[fastCandNum - shift] = mode;
-        CandCostList[fastCandNum - shift] = cost;
-        return 1;
     }
 
-    return 0;
+    if (cost < maxValue)
+    {
+        candCostList[maxIndex] = cost;
+        candModeList[maxIndex] = mode;
+    }
 }
 
 /* add inter-prediction syntax elements for a CU block */
diff -r 012f315d3eda -r 2fb709fbf2d8 source/encoder/search.h
--- a/source/encoder/search.h	Wed Sep 10 17:27:20 2014 +0200
+++ b/source/encoder/search.h	Wed Sep 10 12:35:55 2014 +0200
@@ -172,10 +172,12 @@
     void     checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost);
     void     getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
     uint32_t getInterSymbolBits(TComDataCU* cu, uint32_t depthRange[2]);
-    uint32_t xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList);
 
     uint32_t xMergeEstimation(TComDataCU* cu, int partIdx, MergeData& m);
     void     xSetSearchRange(TComDataCU* cu, MV mvp, int merange, MV& mvmin, MV& mvmax);
+
+    enum { MAX_RD_INTRA_MODES = 6 };
+    void     updateCandList(uint32_t mode, uint64_t cost, uint32_t* candModeList, uint64_t* candCostList);
 };
 }
 


More information about the x265-devel mailing list