[x265] [PATCH 1 of 3] search: seperate intra analysis from RDO in estIntraPredQT(), improve var names

Sun Feb 1 22:20:38 CET 2015

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1422821410 21600
#      Sun Feb 01 14:10:10 2015 -0600
# Node ID 6aa779d44a06b1f0255395f6e09e86904801f1d8
# Parent  c3e2dbd3b2c3933142a5734b6a13eea6ea56e8ae
search: seperate intra analysis from RDO in estIntraPredQT(), improve var names

This clarifies the statistics in I slices and in RD levels 4 and 5. This coomit
adds a brace { } scope to perform the profiling but does not change indentation.
This will be done in the next commit.

diff -r c3e2dbd3b2c3 -r 6aa779d44a06 source/encoder/search.cpp

--- a/source/encoder/search.cpp	Sat Jan 31 15:57:00 2015 -0600
+++ b/source/encoder/search.cpp	Sun Feb 01 14:10:10 2015 -0600
@@ -1141,8 +1141,6 @@
     uint32_t tuDepthRange[2];
     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
-    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
-
     intraMode.initCosts();
     intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
     intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
@@ -1207,7 +1205,7 @@
     // 33 Angle modes once
     ALIGN_VAR_32(pixel, bufScale[32 * 32]);
     ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
-    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
+    ALIGN_VAR_32(pixel, preds[33 * 32 * 32]);
     int scaleTuSize = tuSize;
     int scaleStride = stride;
     int costShift = 0;
@@ -1248,14 +1246,14 @@
      *  pred[1], pred[2] - less probable, slightly more cost
      *  non-mpm modes    - all cost the same (rbits) */
     uint64_t mpms;
-    uint32_t preds[3];
-    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+    uint32_t mpmModes[3];
+    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
 
     // DC
-    primitives.cu[sizeIdx].intra_pred[DC_IDX](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
-    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+    primitives.cu[sizeIdx].intra_pred[DC_IDX](preds, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
+    bsad = sa8d(fenc, scaleStride, preds, scaleStride) << costShift;
     bmode = mode = DC_IDX;
-    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
     bcost = m_rdCost.calcRdSADCost(bsad, bbits);
 
     // PLANAR
@@ -1263,10 +1261,10 @@
     if (tuSize & (8 | 16 | 32))
         planar = intraNeighbourBuf[1];
 
-    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](tmp, scaleStride, planar, 0, 0);
-    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](preds, scaleStride, planar, 0, 0);
+    sad = sa8d(fenc, scaleStride, preds, scaleStride) << costShift;
     mode = PLANAR_IDX;
-    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
     cost = m_rdCost.calcRdSADCost(sad, bits);
     COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
 
@@ -1274,7 +1272,7 @@
     if (primitives.cu[sizeIdx].intra_pred_allangs)
     {
         primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
-        primitives.cu[sizeIdx].intra_pred_allangs(tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
+        primitives.cu[sizeIdx].intra_pred_allangs(preds, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
     }
     else
         allangs = false;
@@ -1282,16 +1280,16 @@
 #define TRY_ANGLE(angle) \
     if (allangs) { \
         if (angle < 18) \
-            sad = sa8d(bufTrans, scaleTuSize, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+            sad = sa8d(bufTrans, scaleTuSize, &preds[(angle - 2) * predsize], scaleTuSize) << costShift; \
         else \
-            sad = sa8d(fenc, scaleStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
-        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+            sad = sa8d(fenc, scaleStride, &preds[(angle - 2) * predsize], scaleTuSize) << costShift; \
+        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
         cost = m_rdCost.calcRdSADCost(sad, bits); \
     } else { \
         int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
-        primitives.cu[sizeIdx].intra_pred[angle](tmp, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
-        sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; \
-        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+        primitives.cu[sizeIdx].intra_pred[angle](preds, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
+        sad = sa8d(fenc, scaleStride, preds, scaleTuSize) << costShift; \
+        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
         cost = m_rdCost.calcRdSADCost(sad, bits); \
     }
 
@@ -1424,6 +1422,16 @@
             bmode = sharedModes[puIdx];
         else
         {
+            ALIGN_VAR_32(pixel, pred[32 * 32]);
+
+            uint64_t candCostList[MAX_RD_INTRA_MODES];
+            uint32_t rdModeList[MAX_RD_INTRA_MODES];
+            uint64_t bcost;
+            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
+
+            {
+            ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
+
             // Reference sample smoothing
             IntraNeighbors intraNeighbors;
             initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
@@ -1433,10 +1441,6 @@
             const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
             uint32_t stride = predYuv->m_size;
 
-            // 33 Angle modes once
-            ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
-            ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
-
             int scaleTuSize = tuSize;
             int scaleStride = stride;
             int costShift = 0;
@@ -1474,17 +1478,16 @@
              *  pred[1], pred[2] - less probable, slightly more cost
              *  non-mpm modes    - all cost the same (rbits) */
             uint64_t mpms;
-            uint32_t preds[3];
-            uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+            uint32_t mpmModes[3];
+            uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
 
             pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
             uint64_t modeCosts[35];
-            uint64_t bcost;
 
             // DC
-            primitives.cu[sizeIdx].intra_pred[DC_IDX](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
-            uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
-            uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+            primitives.cu[sizeIdx].intra_pred[DC_IDX](pred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
+            uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
+            uint32_t sad = sa8d(fenc, scaleStride, pred, scaleStride) << costShift;
             modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
 
             // PLANAR
@@ -1492,24 +1495,27 @@
             if (tuSize >= 8 && tuSize <= 32)
                 planar = intraNeighbourBuf[1];
 
-            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](tmp, scaleStride, planar, 0, 0);
-            bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
-            sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](pred, scaleStride, planar, 0, 0);
+            bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
+            sad = sa8d(fenc, scaleStride, pred, scaleStride) << costShift;
             modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
             COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
 
             // angular predictions
             if (primitives.cu[sizeIdx].intra_pred_allangs)
             {
+                ALIGN_VAR_32(pixel, bufTrans[32 * 32]);      // TODO: Use aligned mallocs
+                ALIGN_VAR_32(pixel, allPreds[33 * 32 * 32]);
+
                 primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
-                primitives.cu[sizeIdx].intra_pred_allangs(tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
+                primitives.cu[sizeIdx].intra_pred_allangs(allPreds, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                 for (int mode = 2; mode < 35; mode++)
                 {
-                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                     if (mode < 18)
-                        sad = sa8d(bufTrans, scaleTuSize, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+                        sad = sa8d(bufTrans, scaleTuSize, &allPreds[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                     else
-                        sad = sa8d(fenc, scaleStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+                        sad = sa8d(fenc, scaleStride, &allPreds[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                     modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                     COPY1_IF_LT(bcost, modeCosts[mode]);
                 }
@@ -1518,10 +1524,10 @@
             {
                 for (int mode = 2; mode < 35; mode++)
                 {
-                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                     int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
-                    primitives.cu[sizeIdx].intra_pred[mode](tmp, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
-                    sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift;
+                    primitives.cu[sizeIdx].intra_pred[mode](pred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
+                    sad = sa8d(fenc, scaleStride, pred, scaleTuSize) << costShift;
                     modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                     COPY1_IF_LT(bcost, modeCosts[mode]);
                 }
@@ -1531,9 +1537,6 @@
              * or among the most probable modes. maxCandCount is derived from the
              * rdLevel and depth. In general we want to try more modes at slower RD
              * levels and at higher depths */
-            uint64_t candCostList[MAX_RD_INTRA_MODES];
-            uint32_t rdModeList[MAX_RD_INTRA_MODES];
-            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
             for (int i = 0; i < maxCandCount; i++)
                 candCostList[i] = MAX_INT64;
 
@@ -1541,6 +1544,7 @@
             for (int mode = 0; mode < 35; mode++)
                 if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
                     updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
+            }
 
             /* measure best candidates using simple RDO (no TU splits) */
             bcost = MAX_INT64;
@@ -3398,13 +3402,13 @@
 
 /* returns the number of bits required to signal a non-most-probable mode.
  * on return mpms contains bitmap of most probable modes */
-uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const
+uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
 {
-    cu.getIntraDirLumaPredictor(absPartIdx, preds);
+    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
 
     mpms = 0;
     for (int i = 0; i < 3; ++i)
-        mpms |= ((uint64_t)1 << preds[i]);
+        mpms |= ((uint64_t)1 << mpmModes[i]);
 
     return m_entropyCoder.bitsIntraModeNonMPM();
 }
diff -r c3e2dbd3b2c3 -r 6aa779d44a06 source/encoder/search.h
--- a/source/encoder/search.h	Sat Jan 31 15:57:00 2015 -0600
+++ b/source/encoder/search.h	Sun Feb 01 14:10:10 2015 -0600
@@ -345,7 +345,7 @@
     static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
 
     // get most probable luma modes for CU part, and bit cost of all non mpm modes
-    uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const;
+    uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
 
     void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
 };