[x265] [PATCH] search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]

Steve Borho steve at borho.org
Thu Sep 11 19:48:04 CEST 2014


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1410345355 -7200
#      Wed Sep 10 12:35:55 2014 +0200
# Node ID b35ff208d7f8f85ca4bb0c65c4a562c0e9c24100
# Parent  9637c6ae8294d8f74981d01a1c1e216196376e69
search: measure RDO of intra modes within 25% of least cost [CHANGES OUTPUTS]

This version adaps the number of RD measured modes by param.rdLevel (aka preset)
and by depth. This gives a non-trivial speedup to the very fast presets which
use frequent keyframes and helps improve compression in slower presets.

all presets use this function to encode I slices, so every encode is affected.

Previous behavior:
  RD measure top N least sa8d cost intra modes and all most probable modes where
  N was depth-based: intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, etc

New behavior:
  RD measure up to MAX_RD_INTRA_MODES (6) modes that are within 25% of best sa8d
  cost or are most probable.

The new behavior may measure less than 6 modes, the old behavior definitely
would not, and the new behavior may skip some most-probable modes if there
are plenty of other modes which are near the best cost.  Since mode signal cost
is included already, this seems ok.

The general idea is that if 1-2 modes have much better sa8d cost than all the
others, then we are likely wasting our time RD measuring 8-11 modes. We're
betting that sa8d cost is a somewhat decent predictor of RD cost.

Note that I initially tried without a limit (measure all within 25% or MPM) but
for some clips this was a horrible perf trade-off. In some situations all the
intra modes might measure close together (flat source block) and we would end
up measuring most or all of the intra modes for very little gain. So this
version re-introduces a "top N candidate list" but does not bother trying to
keep the list sorted since it is small

diff -r 9637c6ae8294 -r b35ff208d7f8 source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h	Thu Sep 11 19:24:28 2014 +0530
+++ b/source/Lib/TLibCommon/CommonDef.h	Wed Sep 10 12:35:55 2014 +0200
@@ -73,8 +73,6 @@
 #define SCAN_SET_SIZE               16
 #define LOG2_SCAN_SET_SIZE          4
 
-#define FAST_UDI_MAX_RDMODE_NUM     35 // maximum number of RD comparison in fast-UDI estimation loop
-
 #define ALL_IDX                     -1
 #define PLANAR_IDX                  0
 #define VER_IDX                     26 // index for intra VERTICAL   mode
diff -r 9637c6ae8294 -r b35ff208d7f8 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Thu Sep 11 19:24:28 2014 +0530
+++ b/source/encoder/search.cpp	Wed Sep 10 12:35:55 2014 +0200
@@ -1281,12 +1281,9 @@
     uint32_t log2TrSize   = cu->getLog2CUSize(0) - initTrDepth;
     uint32_t tuSize       = 1 << log2TrSize;
     uint32_t qNumParts    = cu->getTotalNumPart() >> 2;
-    uint32_t overallDistY = 0;
-    uint32_t candNum;
-    uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
+    uint32_t totalDist    = 0;
     uint32_t sizeIdx      = log2TrSize - 2;
-    uint32_t partOffset = 0;
-    static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
+    uint32_t partOffset   = 0;
 
     // loop over partitions
     for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
@@ -1297,13 +1294,6 @@
         // determine set of modes to be tested (using prediction signal only)
         pixel*   fenc   = fencYuv->getLumaAddr(partOffset);
         uint32_t stride = predYuv->getStride();
-        uint32_t rdModeList[FAST_UDI_MAX_RDMODE_NUM];
-        int numModesForFullRD = intraModeNumFast[sizeIdx];
-
-        for (int i = 0; i < numModesForFullRD; i++)
-            candCostList[i] = MAX_INT64;
-
-        uint64_t modeCosts[35];
 
         pixel *above         = m_refAbove    + tuSize - 1;
         pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
@@ -1348,18 +1338,20 @@
         }
 
         uint32_t preds[3];
-        int numMpm = cu->getIntraDirLumaPredictor(partOffset, preds);
+        cu->getIntraDirLumaPredictor(partOffset, preds);
         
         uint64_t mpms;
         uint32_t rbits = getIntraRemModeBits(cu, partOffset, depth, preds, mpms);
 
         pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+        uint64_t modeCosts[35];
+        uint64_t bcost;
 
         // DC
         primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
         uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? getIntraModeBits(cu, DC_IDX, partOffset, depth) : rbits;
         uint32_t sad  = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
-        modeCosts[DC_IDX] = m_rdCost.calcRdSADCost(sad, bits);
+        modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
 
         // PLANAR
         pixel *abovePlanar = above;
@@ -1373,6 +1365,7 @@
         bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? getIntraModeBits(cu, PLANAR_IDX, partOffset, depth) : rbits;
         sad  = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
         modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
+        COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
 
         // angular predictions
         primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
@@ -1386,69 +1379,45 @@
             bits = (mpms & ((uint64_t)1 << mode)) ? getIntraModeBits(cu, mode, partOffset, depth) : rbits;
             sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
             modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+            COPY1_IF_LT(bcost, modeCosts[mode]);
         }
 
-        // Find N least cost modes. N = numModesForFullRD
-        candNum = 0;
+        /* Find the top maxCandCount candidate modes with cost within 25% of best
+         * or among the most probable modes. maxCandCount is derived from the
+         * rdLevel and depth. In general we want to try more modes at slower RD
+         * levels and at higher depths */
+        uint64_t candCostList[MAX_RD_INTRA_MODES];
+        uint32_t rdModeList[MAX_RD_INTRA_MODES];
+        int maxCandCount = 2 + m_param->rdLevel + (initTrDepth >> 1);
+        for (int i = 0; i < maxCandCount; i++)
+            candCostList[i] = MAX_INT64;
+
+        uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
         for (int mode = 0; mode < 35; mode++)
-            candNum += xUpdateCandList(mode, modeCosts[mode], numModesForFullRD, rdModeList, candCostList);
-
-        for (int j = 0; j < numMpm; j++)
+            if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
+                updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
+
+        /* measure best candidates using simple RDO (no TU splits) */
+        uint32_t bmode = 0;
+        uint64_t cost;
+        bcost = MAX_INT64;
+        for (int i = 0; i < maxCandCount; i++)
         {
-            bool mostProbableModeIncluded = false;
-            uint32_t mostProbableMode = preds[j];
-
-            for (int i = 0; i < numModesForFullRD; i++)
-            {
-                if (mostProbableMode == rdModeList[i])
-                {
-                    mostProbableModeIncluded = true;
-                    break;
-                }
-            }
-
-            if (!mostProbableModeIncluded)
-                rdModeList[numModesForFullRD++] = mostProbableMode;
+            if (candCostList[i] == MAX_INT64)
+                break;
+            m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+            cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
+            cost = 0;
+            xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, depthRange);
+            COPY2_IF_LT(bcost, cost, bmode, rdModeList[i]);
         }
 
-        // check modes (using r-d costs)
-        uint32_t bestPUMode  = 0;
-        uint32_t bestPUDistY = 0;
-        uint64_t bestPUCost  = MAX_INT64;
-        uint32_t puDistY;
-        uint64_t puCost;
-        for (int mode = 0; mode < numModesForFullRD; mode++)
-        {
-            // set luma prediction mode
-            cu->setLumaIntraDirSubParts(rdModeList[mode], partOffset, depth + initTrDepth);
-
-            // set context models
-            m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
-
-            // determine residual for partition
-            puCost = 0;
-            puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, puCost, depthRange);
-
-            // check r-d cost
-            if (puCost < bestPUCost)
-            {
-                bestPUMode  = rdModeList[mode];
-                bestPUDistY = puDistY;
-                bestPUCost  = puCost;
-            }
-        }
-
         /* remeasure best mode, allowing TU splits */
-        cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
-
-        // set context models
+        cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
+
         m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
 
-        // determine residual for partition
-        puCost = 0;
-        puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, depthRange);
-
-        overallDistY += (puCost >= bestPUCost) ? bestPUDistY : puDistY;
+        totalDist += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, depthRange);
 
         xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
 
@@ -1475,11 +1444,11 @@
             cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
     }
 
-    // reset context models (TODO: caller should do this)
+    // reset context models
     m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
 
     // set distortion (rate and r-d costs are determined later)
-    cu->m_totalDistortion = overallDistY;
+    cu->m_totalDistortion = totalDist;
 
     x265_emms();
 }
@@ -3486,28 +3455,27 @@
     return getIntraModeBits(cu, mode, partOffset, depth);
 }
 
-uint32_t Search::xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList)
+/* swap the current mode/cost with the mode with the highest cost in the
+ * current candidcate list, if its cost is better (maintain a top N list) */
+void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
 {
-    uint32_t i;
-    uint32_t shift = 0;
-
-    while (shift < fastCandNum && cost < CandCostList[fastCandNum - 1 - shift])
-        shift++;
-
-    if (shift != 0)
+    uint32_t maxIndex = 0;
+    uint64_t maxValue = 0;
+
+    for (int i = 0; i < maxCandCount; i++)
     {
-        for (i = 1; i < shift; i++)
+        if (maxValue < candCostList[i])
         {
-            CandModeList[fastCandNum - i] = CandModeList[fastCandNum - 1 - i];
-            CandCostList[fastCandNum - i] = CandCostList[fastCandNum - 1 - i];
+            maxValue = candCostList[i];
+            maxIndex = i;
         }
-
-        CandModeList[fastCandNum - shift] = mode;
-        CandCostList[fastCandNum - shift] = cost;
-        return 1;
     }
 
-    return 0;
+    if (cost < maxValue)
+    {
+        candCostList[maxIndex] = cost;
+        candModeList[maxIndex] = mode;
+    }
 }
 
 /* add inter-prediction syntax elements for a CU block */
diff -r 9637c6ae8294 -r b35ff208d7f8 source/encoder/search.h
--- a/source/encoder/search.h	Thu Sep 11 19:24:28 2014 +0530
+++ b/source/encoder/search.h	Wed Sep 10 12:35:55 2014 +0200
@@ -172,10 +172,12 @@
     void     checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost);
     void     getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
     uint32_t getInterSymbolBits(TComDataCU* cu, uint32_t depthRange[2]);
-    uint32_t xUpdateCandList(uint32_t mode, uint64_t cost, uint32_t fastCandNum, uint32_t* CandModeList, uint64_t* CandCostList);
 
     uint32_t mergeEstimation(TComDataCU* cu, int partIdx, MergeData& m);
     void     setSearchRange(TComDataCU* cu, MV mvp, int merange, MV& mvmin, MV& mvmax);
+
+    enum { MAX_RD_INTRA_MODES = 16 };
+    void     updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
 };
 }
 


More information about the x265-devel mailing list