[x265] [PATCH] [REVIEW PATCH/OUTPUT CHANGED]search: removed multiple encode Coefficients from estimateResidualQT()

Wed Nov 5 16:06:49 CET 2014

# HG changeset patch
# User Ashok Kumar Mishra<ashok at multicorewareinc.com>
# Date 1415199866 -19800
#      Wed Nov 05 20:34:26 2014 +0530
# Node ID 3e415f2dc7377a4a8093128c017df9ff329d6eda
# Parent  a1902139ce82796caa570405b63cbb78fc6fd442
[REVIEW PATCH/OUTPUT CHANGED]search: removed multiple encode Coefficients from estimateResidualQT()

This patch tries to remove the repeated coefficient encoding from the estimateResidualQT() function.
Coefficients were previously encoded three times: once for the distortion calculation and twice more, for the
split and unsplit block cost calculations. I have added comments wherever I changed the code.

diff -r a1902139ce82 -r 3e415f2dc737 source/encoder/entropy.cpp

--- a/source/encoder/entropy.cpp	Wed Nov 05 20:34:26 2014 +0530
+++ b/source/encoder/entropy.cpp	Wed Nov 05 20:34:26 2014 +0530
@@ -1501,6 +1501,13 @@
     encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]);
 }
 
+uint32_t Entropy::estimateCbfBits(uint32_t cbf,TextType ttype, uint32_t trDepth)
+{
+    // This is an approximation: estimate the bits for the cbf context bin without encoding it.
+    uint32_t ctx = ctxCbf[ttype][trDepth];
+    return encodeBinContext(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]);
+}
+
 void Entropy::codeQtCbfZero(TextType ttype, uint32_t trDepth)
 {
     // this function is only used to estimate the bits when cbf is 0
@@ -2026,6 +2033,17 @@
         writeOut();
 }
 
+/** Return the estimated number of bits needed to encode the specified context bin, without actually encoding it. */
+uint32_t Entropy::encodeBinContext(uint32_t binValue, uint8_t &ctxModel)
+{
+    uint64_t fracBits = m_fracBits;
+    fracBits &= 32767;
+
+    fracBits += sbacGetEntropyBits(ctxModel, binValue);
+
+    return (uint32_t)(fracBits >> 15);
+}
+
 /** Encode equiprobable bin */
 void Entropy::encodeBinEP(uint32_t binValue)
 {
diff -r a1902139ce82 -r 3e415f2dc737 source/encoder/entropy.h
--- a/source/encoder/entropy.h	Wed Nov 05 20:34:26 2014 +0530
+++ b/source/encoder/entropy.h	Wed Nov 05 20:34:26 2014 +0530
@@ -176,6 +176,7 @@
     void codeQtRootCbfZero();
     void codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]);
     void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
+    uint32_t estimateCbfBits(uint32_t cbf, TextType ttype, uint32_t trDepth);
 
     uint32_t bitsIntraModeNonMPM() const;
     uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const;
@@ -199,6 +200,7 @@
     void encodeBinEP(uint32_t binValue);
     void encodeBinsEP(uint32_t binValues, int numBins);
     void encodeBinTrm(uint32_t binValue);
+    uint32_t encodeBinContext(uint32_t binValue, uint8_t &ctxModel);
 
     void encodeCU(const CUData& cu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
     void finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
diff -r a1902139ce82 -r 3e415f2dc737 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Nov 05 20:34:26 2014 +0530
+++ b/source/encoder/search.cpp	Wed Nov 05 20:34:26 2014 +0530
@@ -2714,11 +2714,10 @@
     }
 }
 
-uint64_t Search::deriveNullCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
+uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
 {
-    m_entropyCoder.resetBits();
-    m_entropyCoder.codeQtCbfZero(compId, tuDepth);
-    const uint32_t nullBits = m_entropyCoder.getNumberOfWrittenBits();
+    uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
+
     if (m_rdCost.m_psyRd)
         return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
     else
@@ -2732,6 +2731,7 @@
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
+    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
 
     if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
         bCheckFull = false;
@@ -2757,9 +2757,9 @@
 
     uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
     uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
-    uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
 
@@ -2796,48 +2796,19 @@
         cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
 
         m_entropyCoder.resetBits();
-        m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+
+        if (bSplitPresentFlag && log2TrSize > depthRange[0])
+            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
+        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+
+        // Coding of the luma cbf flag has been removed from here. The cbf flag context is different for each depth,
+        // so, at least for analysis, it is valid to encode the coefficients first and the cbfs afterwards.
+//        m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
         if (cbfFlag[TEXT_LUMA][0])
             m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
-        singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
-
-        uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
-
-        if (bCodeChroma)
-        {
-            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
-            {
-                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
-                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
-
-                do
-                {
-                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
-                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-
-                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
-
-                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
-                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
-
-                    fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
-                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
-                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
-
-                    m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
-                    if (cbfFlag[chromaId][tuIterator.section])
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-
-                    uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
-                    singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
-
-                    singleBitsPrev = newBits;
-                }
-                while (tuIterator.isNextSection());
-            }
-        }
+
+        uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
+        singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
 
         X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
         uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
@@ -2852,28 +2823,35 @@
         {
             m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
 
+            // non-zero cost calculation for luma - This is an approximation
+            // finally we have to encode correct cbf after comparing with null cost
             const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
-            uint32_t nonZeroPsyEnergyY = 0;
+            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+            uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
             if (m_rdCost.m_psyRd)
+            {
                 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+            }
+            else
+                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
 
             if (cu.m_tqBypass[0])
             {
-                distY = nonZeroDistY;
-                psyEnergyY = nonZeroPsyEnergyY;
+                singleDist[TEXT_LUMA][0] = nonZeroDistY;
+                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
             }
             else
             {
-                uint64_t singleCostY = 0;
-                if (m_rdCost.m_psyRd)
-                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
-                else
-                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
-
-                uint64_t nullCostY = deriveNullCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+                // Zero-cbf (null) cost calculation for luma. This is an approximation.
+                // The previous calculation was also an approximation: it reset the bit counter and then encoded a zero cbf.
+                // Now the zero cbf is estimated without writing into the bitstream, leaving m_fracBits unchanged. The same applies to chroma.
+                uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+
                 if (nullCostY < singleCostY)
                 {
                     cbfFlag[TEXT_LUMA][0] = 0;
+                    singleBits[TEXT_LUMA][0] = 0;
 #if CHECKED_BUILD || _DEBUG
                     uint32_t numCoeffY = 1 << (log2TrSize << 1);
                     memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
@@ -2881,32 +2859,33 @@
 #endif
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = nullCostY;
+                    singleDist[TEXT_LUMA][0] = distY;
+                    singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
                 }
                 else
                 {
-                    distY = nonZeroDistY;
-                    psyEnergyY = nonZeroPsyEnergyY;
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = singleCostY;
+                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
+                    singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                 }
             }
         }
         else
         {
             if (checkTransformSkipY)
-                minCost[TEXT_LUMA][0] = deriveNullCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+                minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
             primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
+            singleDist[TEXT_LUMA][0] = distY;
+            singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
         }
 
-        singleDistComp[TEXT_LUMA][0] = distY;
-        singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
-
         cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
 
         if (bCodeChroma)
         {
-            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
             {
                 uint32_t distC = 0, psyEnergyC = 0;
@@ -2918,38 +2897,60 @@
                     uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
                     uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
 
+                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
+
+                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
+
+                    fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
+                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+
+                    //Coding cbf flags has been removed from here
+//                    m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+                    if (cbfFlag[chromaId][tuIterator.section])
+                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+                    uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
+                    singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+                    singleBitsPrev = newBits;
+
                     int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
-
                     distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
 
                     if (cbfFlag[chromaId][tuIterator.section])
                     {
                         m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
                                                 log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
+
+                        // non-zero cost calculation for chroma, same as luma - This is an approximation
+                        // finally we have to encode correct cbf after comparing with null cost
                         uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-                        const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
-                        uint32_t nonZeroPsyEnergyC = 0;
+                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+                        uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
+                        uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
                         if (m_rdCost.m_psyRd)
+                        {
                             nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                        }
+                        else
+                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
 
                         if (cu.m_tqBypass[0])
                         {
-                            distC = nonZeroDistC;
-                            psyEnergyC = nonZeroPsyEnergyC;
+                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+                            singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                         }
                         else
                         {
-                            uint64_t singleCostC = 0;
-                            if (m_rdCost.m_psyRd)
-                                singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
-                            else
-                                singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
-
-                            uint64_t nullCostC = deriveNullCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
+                            //zero-cost calculation for chroma. This is an approximation
+                            uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
 
                             if (nullCostC < singleCostC)
                             {
                                 cbfFlag[chromaId][tuIterator.section] = 0;
+                                singleBits[chromaId][tuIterator.section] = 0;
 #if CHECKED_BUILD || _DEBUG
                                 uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                                 memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
@@ -2957,26 +2958,27 @@
 #endif
                                 if (checkTransformSkipC)
                                     minCost[chromaId][tuIterator.section] = nullCostC;
+                                singleDist[chromaId][tuIterator.section] = distC;
+                                singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                             }
                             else
                             {
-                                distC = nonZeroDistC;
-                                psyEnergyC = nonZeroPsyEnergyC;
                                 if (checkTransformSkipC)
                                     minCost[chromaId][tuIterator.section] = singleCostC;
+                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+                                singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                             }
                         }
                     }
                     else
                     {
                         if (checkTransformSkipC)
-                            minCost[chromaId][tuIterator.section] = deriveNullCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
+                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
                         primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
+                        singleDist[chromaId][tuIterator.section] = distC;
+                        singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                     }
 
-                    singleDistComp[chromaId][tuIterator.section] = distC;
-                    singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
-
                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                 }
                 while (tuIterator.isNextSection());
@@ -3027,8 +3029,8 @@
                 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
             else
             {
-                singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
-                singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+                singleDist[TEXT_LUMA][0] = nonZeroDistY;
+                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
                 uint32_t numCoeffY = 1 << (log2TrSize << 1);
@@ -3073,13 +3075,13 @@
                     uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
 
                     m_entropyCoder.resetBits();
-                    singleBitsComp[chromaId][tuIterator.section] = 0;
+                    singleBits[chromaId][tuIterator.section] = 0;
 
                     if (numSigTSkipC)
                     {
                         m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
                         m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-                        singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
+                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
 
                         m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
@@ -3088,18 +3090,18 @@
                         if (m_rdCost.m_psyRd)
                         {
                             nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
-                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
                         }
                         else
-                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
+                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
                     }
 
                     if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
                         cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                     else
                     {
-                        singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
-                        singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+                        singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                         cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
                         bestTransformMode[chromaId][tuIterator.section] = 1;
                         uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
@@ -3113,13 +3115,14 @@
             }
         }
 
+        // Previously both cbfs and coefficients were encoded here, after the distortion calculation above.
+        // Now only the cbfs are encoded, since the coefficients have already been encoded above. The bits
+        // required for the coefficients are collected and added to the cbf bits. In my testing the order does not
+        // make any difference. However, I am unsure whether the original context should be loaded as below.
         m_entropyCoder.load(m_rqt[depth].rqtRoot);
-
         m_entropyCoder.resetBits();
 
-        if (log2TrSize > depthRange[0])
-            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
-
+        //Encode cbf flags
         if (bCodeChroma)
         {
             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
@@ -3136,43 +3139,31 @@
         }
 
         m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
-        if (cbfFlag[TEXT_LUMA][0])
-            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
-
-        if (bCodeChroma)
-        {
-            uint32_t subTUSize = 1 << (log2TrSizeC * 2);
-            uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
-            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
-            {
-                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
-                if (!splitIntoSubTUs)
-                {
-                    if (cbfFlag[chromaId][0])
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
-                }
-                else
-                {
-                    for (uint32_t subTU = 0; subTU < 2; subTU++)
-                    {
-                        if (cbfFlag[chromaId][subTU])
-                            m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
-                    }
-                }
-            }
-        }
-
-        fullCost.distortion += singleDistComp[TEXT_LUMA][0];
-        fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
+
+        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
+
+        uint32_t coeffBits = 0;
+        coeffBits = singleBits[TEXT_LUMA][0];
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
         {
-            fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
-            fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
+            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
+            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
         }
 
-        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+        // In split mode we need only coeffBits, because chroma cbfs are encoded differently from luma cbfs.
+        // For chroma, if the cbf of any one of the split blocks is 1, we must encode a cbf of 1 and then the
+        // individual cbf value of each of the four split blocks. This is not known before the four split blocks are analysed.
+        // For that reason, only the individual coefficient bits are collected.
+        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+
+        fullCost.distortion += singleDist[TEXT_LUMA][0];
+        fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
+        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
+        {
+            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
+            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
+        }
+
         if (m_rdCost.m_psyRd)
             fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
         else
@@ -3189,6 +3180,14 @@
         }
 
         Cost splitCost;
+        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+        {
+            // The subdiv flag can be encoded at the start of the analysis of the split blocks.
+            m_entropyCoder.resetBits();
+            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+        }
+
         const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
         for (uint32_t i = 0; i < 4; ++i)
@@ -3205,15 +3204,16 @@
             cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
         }
 
+        // Previously both cbfs and coefficients were encoded here for the split blocks. Since the coefficient bits have been
+        // collected for each individual block, only the cbf values are encoded now. As mentioned, chroma cbfs are encoded differently from luma cbfs.
+        // One open question: is it a problem if the coefficients are encoded in the context at depth 2 (for example) while the cbfs
+        // are encoded in the context at depth 0 (for example)?
         m_entropyCoder.load(m_rqt[depth].rqtRoot);
         m_entropyCoder.resetBits();
 
         codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
-        encodeResidualQT(cu, absPartIdx, depth, TEXT_LUMA, depthRange);
-        encodeResidualQT(cu, absPartIdx, depth, TEXT_CHROMA_U, depthRange);
-        encodeResidualQT(cu, absPartIdx, depth, TEXT_CHROMA_V, depthRange);
-
-        splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+        uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+        splitCost.bits += splitCbfBits;
 
         if (m_rdCost.m_psyRd)
             splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
@@ -3293,9 +3293,6 @@
     const bool     bSubdiv     = curTuDepth != tuDepth;
     const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
 
-    if (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
-        m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
-
     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
     uint32_t trWidthC  = 1 << log2TrSizeC;
diff -r a1902139ce82 -r 3e415f2dc737 source/encoder/search.h
--- a/source/encoder/search.h	Wed Nov 05 20:34:26 2014 +0530
+++ b/source/encoder/search.h	Wed Nov 05 20:34:26 2014 +0530
@@ -218,7 +218,7 @@
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
-    uint64_t deriveNullCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]);
 
     // estimate bit cost of residual QT