[x265] [PATCH 4 of 4] limitTU: compare full TU's cost with split cost to limit recursion

Tue Oct 4 11:20:46 CEST 2016

# HG changeset patch
# User Kavitha Sampath <kavitha at multicorewareinc.com>
# Date 1475245764 -19800
#      Fri Sep 30 19:59:24 2016 +0530
# Node ID 665289d732629438da80d67a7d230f27a6cad29b
# Parent  3ae30a43ac939fe875eaec7f22d134711b00c449
limitTU: compare full TU's cost with split cost to limit recursion

diff -r 3ae30a43ac93 -r 665289d73262 source/encoder/search.cpp

--- a/source/encoder/search.cpp	Fri Sep 30 17:55:41 2016 +0530
+++ b/source/encoder/search.cpp	Fri Sep 30 19:59:24 2016 +0530
@@ -2620,8 +2620,10 @@
 
     if (m_param->limitTU == X265_TU_LIMIT_DFS)
         m_maxTUDepth = 0;
+    cacheTUInfo cache;
+
     Cost costs;
-    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
+    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange, cache);
 
     uint32_t tqBypass = cu.m_tqBypass[0];
     if (!tqBypass)
@@ -2870,7 +2872,56 @@
         return m_rdCost.calcRdCost(dist, nullBits);
 }
 
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
+bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], cacheTUInfo& cache, int32_t splitMore)
+{ // Estimates the four sub-TUs (tuDepth + 1) of the TU at absPartIdx, accumulates into splitCost, and returns true if any sub-TU has a luma or chroma cbf set.
+    CUData& cu = mode.cu;
+    uint32_t depth = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+
+    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; // partition-index stride between consecutive sub-TUs
+    uint32_t ycbf = 0, ucbf = 0, vcbf = 0; // OR of the sub-TUs' cbfs, per plane
+    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+    {
+        if (m_param->limitTU == X265_TU_LIMIT_DFS && tuDepth == 0 && qIdx == 1)
+        {
+            // Fetch the maximum TU depth of the first sub-partition to limit the recursion of the others
+            for (uint32_t i = 0; i < cuGeom.numPartitions / 4; i++)
+                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
+        }
+        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, cache, splitMore); // splitMore is forwarded unchanged
+        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+        {
+            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+        }
+    }
+    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth; // merge the children's luma cbf into this TU's entry, shifted by tuDepth
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+    {
+        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
+        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
+    }
+
+    // Previously both cbfs and coefficients were encoded here for the split blocks. Since the coefficient bits have
+    // already been collected for each individual block, only the cbf values are encoded. Chroma cbfs are encoded differently than luma.
+    // Open question from the original author: the coefficients may have been encoded in a context at one depth (2, for example)
+    // while the cbfs are encoded in a context at another depth (0, for example).
+    m_entropyCoder.load(m_rqt[depth].rqtRoot); // restart from the entropy state saved for this depth
+    m_entropyCoder.resetBits();
+    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
+    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+    splitCost.bits += splitCbfBits;
+
+    if (m_rdCost.m_psyRd) // recompute the RD cost from the accumulated totals; psy-rd additionally uses the energy term
+        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
+    else
+        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
+        
+    return ycbf || ucbf || vcbf; // true when any plane of any sub-TU reported a cbf
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], cacheTUInfo& cache, int32_t splitMore)
 {
     CUData& cu = mode.cu;
     uint32_t depth = cuGeom.depth + tuDepth;
@@ -2879,11 +2930,34 @@
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
+    bool bSaveTUData = false, bLoadTUData = false;
+    uint32_t idx = 0;
+
     if (m_param->limitTU == X265_TU_LIMIT_DFS && m_maxTUDepth)
     {
         uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
         bCheckSplit = log2TrSize > log2MaxTrSize;
     }
+    else if (m_param->limitTU == X265_TU_LIMIT_BFS && splitMore >= 0)
+    {
+        if (bCheckSplit && bCheckFull && tuDepth)
+        {
+            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
+            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
+            idx = (depth - 1) * 4 + qIdx;
+            if (splitMore)
+            {
+                bLoadTUData = true;
+                bCheckFull = false;
+            }
+            else
+            {
+                bSaveTUData = true;
+                bCheckSplit = false;
+            }
+        }
+    }
+
     bool bSplitPresentFlag = bCheckSplit && bCheckFull;
 
     if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
@@ -3356,6 +3430,34 @@
                     bCheckSplit = false;
             }
         }
+
+        if (bSaveTUData)
+        {
+            for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
+            {
+                for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
+                {
+                    cache.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
+                    cache.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
+                }
+            }
+            cache.cost[idx] = fullCost;
+            m_entropyCoder.store(cache.rqtStore[idx]);
+        }
+    }
+    if (bLoadTUData)
+    {
+        for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
+        {
+            for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
+            {
+                bestTransformMode[plane][part] = cache.bestTransformMode[idx][plane][part];
+                cbfFlag[plane][part] = cache.cbfFlag[idx][plane][part];
+            }
+        }
+        fullCost = cache.cost[idx];
+        m_entropyCoder.load(cache.rqtStore[idx]);
+        bCheckFull = true;
     }
 
     // code sub-blocks
@@ -3376,50 +3478,29 @@
             splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
         }
 
-        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
-        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
-        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
-        {
-            if (m_param->limitTU == X265_TU_LIMIT_DFS && tuDepth == 0 && qIdx == 1)
-            {
-                for (uint32_t i = 0; i < cuGeom.numPartitions / 4; i++)
-                    m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
-            }
-            estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
-            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
-            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
-            {
-                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
-                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
-            }
-        }
-        cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
-        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
-        {
-            cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
-            cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
-        }
-
-        // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
-        // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
-        // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
-        // at depth 0 (for example).
-        m_entropyCoder.load(m_rqt[depth].rqtRoot);
-        m_entropyCoder.resetBits();
-
-        codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
-        uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
-        splitCost.bits += splitCbfBits;
-
-        if (m_rdCost.m_psyRd)
-            splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
-        else
-            splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
-
-        if (ycbf || ucbf || vcbf || !bCheckFull)
+        bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, cache, 0);
+        if (yCbCrCbf || !bCheckFull)
         {
             if (splitCost.rdcost < fullCost.rdcost)
             {
+                if (m_param->limitTU == X265_TU_LIMIT_BFS)
+                {
+                    uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
+                    bool nextSplit = nextlog2TrSize > depthRange[0];
+                    if (nextSplit)
+                    {
+                        m_entropyCoder.load(m_rqt[depth].rqtRoot);
+                        splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
+                        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+                        {
+                            // Subdiv flag can be encoded at the start of analysis of split blocks.
+                            m_entropyCoder.resetBits();
+                            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+                            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+                        }
+                        splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, cache, 1);
+                    }
+                }
                 outCosts.distortion += splitCost.distortion;
                 outCosts.rdcost     += splitCost.rdcost;
                 outCosts.bits       += splitCost.bits;
diff -r 3ae30a43ac93 -r 665289d73262 source/encoder/search.h
--- a/source/encoder/search.h	Fri Sep 30 17:55:41 2016 +0530
+++ b/source/encoder/search.h	Fri Sep 30 19:59:24 2016 +0530
@@ -49,6 +49,8 @@
 #define ProfileCounter(cu, count)
 #endif
 
+#define NUM_SUBPART (MAX_TS_SIZE * 4) // 4 sub partitions * 4 depth; parenthesized so the macro expands safely inside larger expressions
+
 namespace X265_NS {
 // private namespace
 
@@ -378,8 +380,17 @@
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
+    struct cacheTUInfo // full-TU results saved by estimateResidualQT() and reloaded later; slot index is (depth - 1) * 4 + qIdx
+    {
+        Cost cost[NUM_SUBPART]; // fullCost of the TU stored in each slot
+        uint32_t bestTransformMode[NUM_SUBPART][MAX_NUM_COMPONENT][2]; // [slot][plane][part]; the second part is only copied for X265_CSP_I422
+        uint8_t cbfFlag[NUM_SUBPART][MAX_NUM_COMPONENT][2]; // [slot][plane][part], copied alongside bestTransformMode
+        Entropy rqtStore[NUM_SUBPART]; // entropy coder state stored when the slot is saved and loaded back when it is reused
+    };
+
     uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
-    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
+    bool     splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], cacheTUInfo& cache, int32_t splitMore);
+    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], cacheTUInfo& cache, int32_t splitMore = -1);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
     void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);