[x265] [PATCH 2 of 2] entropy: give each Search instance its own set of RD contexts

Sun Oct 5 18:10:19 CEST 2014

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1412523648 18000
#      Sun Oct 05 10:40:48 2014 -0500
# Node ID 592988125077ff5fd325459f6068cfe671f97e1c
# Parent  6f5e14b8c57b5d252f844eecf3ab5d0b11b1f4fd
entropy: give each Search instance its own set of RD contexts

This gives each ThreadLocalData a complete set of independent working contexts
so each thread can measure RDO (for the same row) independently of the others.

This was mainly an issue with the 'temp', 'rqtRoot' and 'rqtTest' contexts.
For this to work we have to sync the 'cur' context to the slave prior to it
performing any RD measurements.

This commit finally removes the CI_IDX enums and uses a simple struct to hold
the contexts per depth. The member variables are renamed from
"rdEntropyCoders" to "rdContexts" since these coders are only ever used to save
and restore CABAC state (never to code with).

** This change makes the encoder non-deterministic. This was traced to the fact
that the rd contexts are not initialized at the start of each slice, which
indicates a bug in Search.cpp.  For a given CTU analysis, none of the rd contexts
should ever be assumed to hold any initial state; each must be written before it is read.
**

diff -r 6f5e14b8c57b -r 592988125077 source/Lib/TLibCommon/TypeDef.h

--- a/source/Lib/TLibCommon/TypeDef.h	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/Lib/TLibCommon/TypeDef.h	Sun Oct 05 10:40:48 2014 -0500
@@ -78,18 +78,6 @@
     MAX_NUM_COMPONENT = 3
 };
 
-// index for SBAC based RD optimization
-enum CI_IDX
-{
-    CI_CURR_BEST = 0,   // best mode index
-    CI_NEXT_BEST,       // next best index
-    CI_TEMP_BEST,       // temporal index
-    CI_QT_TRAFO_TEST,
-    CI_QT_TRAFO_ROOT,
-    CI_NUM,             // total number
-    CI_NUM_SAO   = 3,
-};
-
 // motion vector predictor direction used in AMVP
 enum MVP_DIR
 {
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/analysis.cpp	Sun Oct 05 10:40:48 2014 -0500
@@ -287,14 +287,13 @@
         slave = &m_tld[threadId].analysis;
         slave->m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
         slave->m_log = &slave->m_sliceTypeLog[cu->m_slice->m_sliceType];
-        slave->m_rdEntropyCoders = this->m_rdEntropyCoders;
         m_origYuv[0]->copyPartToYuv(slave->m_origYuv[depth], m_curCUData->encodeIdx);
         slave->setQP(cu->m_slice, m_rdCost.m_qp);
         if (!jobId || m_param->rdLevel > 4)
         {
             slave->m_quant.setQPforQuant(cu);
             slave->m_quant.m_nr = m_quant.m_nr;
-            slave->m_rdEntropyCoders[depth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+            slave->m_rdContexts[depth].cur.load(m_rdContexts[depth].cur);
         }
     }
 
@@ -413,12 +412,14 @@
     }
 }
 
-void Analysis::compressCU(TComDataCU* cu)
+void Analysis::compressCTU(TComDataCU* ctu, const Entropy& initialContext)
 {
-    Frame* pic = cu->m_pic;
-    uint32_t cuAddr = cu->m_cuAddr;
+    Frame* pic = ctu->m_pic;
+    uint32_t cuAddr = ctu->m_cuAddr;
 
-    if (cu->m_slice->m_pps->bUseDQP)
+    m_rdContexts[0].cur.load(initialContext);
+
+    if (ctu->m_slice->m_pps->bUseDQP)
         m_bEncodeDQP = true;
 
     // initialize CU data
@@ -426,27 +427,27 @@
     m_tempCU[0]->initCU(pic, cuAddr);
 
     // analysis of CU
-    uint32_t numPartition = cu->m_cuLocalData->numPartitions;
+    uint32_t numPartition = ctu->m_cuLocalData->numPartitions;
     if (m_bestCU[0]->m_slice->m_sliceType == I_SLICE)
     {
         if (m_param->analysisMode == X265_ANALYSIS_LOAD && pic->m_intraData)
         {
             uint32_t zOrder = 0;
-            compressSharedIntraCTU(m_bestCU[0], m_tempCU[0], false, cu->m_cuLocalData, 
-                &pic->m_intraData->depth[cuAddr * cu->m_numPartitions],
-                &pic->m_intraData->partSizes[cuAddr * cu->m_numPartitions],
-                &pic->m_intraData->modes[cuAddr * cu->m_numPartitions], zOrder);
+            compressSharedIntraCTU(m_bestCU[0], m_tempCU[0], false, ctu->m_cuLocalData, 
+                &pic->m_intraData->depth[cuAddr * ctu->m_numPartitions],
+                &pic->m_intraData->partSizes[cuAddr * ctu->m_numPartitions],
+                &pic->m_intraData->modes[cuAddr * ctu->m_numPartitions], zOrder);
         }
         else
         {
-            compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu->m_cuLocalData);
+            compressIntraCU(m_bestCU[0], m_tempCU[0], false, ctu->m_cuLocalData);
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && pic->m_intraData)
             {
-                memcpy(&pic->m_intraData->depth[cuAddr * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * numPartition);
-                memcpy(&pic->m_intraData->modes[cuAddr * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * numPartition);
-                memcpy(&pic->m_intraData->partSizes[cuAddr * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * numPartition);
+                memcpy(&pic->m_intraData->depth[cuAddr * ctu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * numPartition);
+                memcpy(&pic->m_intraData->modes[cuAddr * ctu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * numPartition);
+                memcpy(&pic->m_intraData->partSizes[cuAddr * ctu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * numPartition);
                 pic->m_intraData->cuAddr[cuAddr] = cuAddr;
-                pic->m_intraData->poc[cuAddr]    = cu->m_pic->m_POC;
+                pic->m_intraData->poc[cuAddr]    = ctu->m_pic->m_POC;
             }
         }
         if (m_param->bLogCuStats || m_param->rc.bStatWrite)
@@ -455,18 +456,18 @@
             do
             {
                 m_log->totalCu++;
-                uint32_t depth = cu->getDepth(i);
+                uint32_t depth = ctu->getDepth(i);
                 int next = numPartition >> (depth * 2);
                 m_log->qTreeIntraCnt[depth]++;
-                if (depth == g_maxCUDepth && cu->getPartitionSize(i) != SIZE_2Nx2N)
+                if (depth == g_maxCUDepth && ctu->getPartitionSize(i) != SIZE_2Nx2N)
                     m_log->cntIntraNxN++;
                 else
                 {
                     m_log->cntIntra[depth]++;
-                    if (cu->getLumaIntraDir(i) > 1)
+                    if (ctu->getLumaIntraDir(i) > 1)
                         m_log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
                     else
-                        m_log->cuIntraDistribution[depth][cu->getLumaIntraDir(i)]++;
+                        m_log->cuIntraDistribution[depth][ctu->getLumaIntraDir(i)]++;
                 }
                 i += next;
             }
@@ -481,20 +482,20 @@
 
             /* At the start of analysis, the best CU is a null pointer
              * On return, it points to the CU encode with best chosen mode */
-            compressInterCU_rd0_4(outBestCU, m_tempCU[0], cu, 0, cu->m_cuLocalData, false, 0, 4);
+            compressInterCU_rd0_4(outBestCU, m_tempCU[0], ctu, 0, ctu->m_cuLocalData, false, 0, 4);
         }
         else
-            compressInterCU_rd5_6(m_bestCU[0], m_tempCU[0], 0, cu->m_cuLocalData);
+            compressInterCU_rd5_6(m_bestCU[0], m_tempCU[0], 0, ctu->m_cuLocalData);
 
         if (m_param->bLogCuStats || m_param->rc.bStatWrite)
         {
             uint32_t i = 0;
             do
             {
-                uint32_t depth = cu->getDepth(i);
+                uint32_t depth = ctu->getDepth(i);
                 m_log->cntTotalCu[depth]++;
                 int next = numPartition >> (depth * 2);
-                if (cu->isSkipped(i))
+                if (ctu->isSkipped(i))
                 {
                     m_log->cntSkipCu[depth]++;
                     m_log->qTreeSkipCnt[depth]++;
@@ -502,29 +503,29 @@
                 else
                 {
                     m_log->totalCu++;
-                    if (cu->getPredictionMode(0) == MODE_INTER)
+                    if (ctu->getPredictionMode(0) == MODE_INTER)
                     {
                         m_log->cntInter[depth]++;
                         m_log->qTreeInterCnt[depth]++;
-                        if (cu->getPartitionSize(0) < AMP_ID)
-                            m_log->cuInterDistribution[depth][cu->getPartitionSize(0)]++;
+                        if (ctu->getPartitionSize(0) < AMP_ID)
+                            m_log->cuInterDistribution[depth][ctu->getPartitionSize(0)]++;
                         else
                             m_log->cuInterDistribution[depth][AMP_ID]++;
                     }
-                    else if (cu->getPredictionMode(0) == MODE_INTRA)
+                    else if (ctu->getPredictionMode(0) == MODE_INTRA)
                     {
                         m_log->qTreeIntraCnt[depth]++;
-                        if (depth == g_maxCUDepth && cu->getPartitionSize(0) == SIZE_NxN)
+                        if (depth == g_maxCUDepth && ctu->getPartitionSize(0) == SIZE_NxN)
                         {
                             m_log->cntIntraNxN++;
                         }
                         else
                         {
                             m_log->cntIntra[depth]++;
-                            if (cu->getLumaIntraDir(0) > 1)
+                            if (ctu->getLumaIntraDir(0) > 1)
                                 m_log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
                             else
-                                m_log->cuIntraDistribution[depth][cu->getLumaIntraDir(0)]++;
+                                m_log->cuIntraDistribution[depth][ctu->getLumaIntraDir(0)]++;
                         }
                     }
                 }
@@ -594,10 +595,10 @@
             if (child_cu->flags & CU::PRESENT)
             {
                 subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
-                if (0 == partUnitIdx) //initialize RD with previous depth buffer
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+                if (!partUnitIdx)
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[depth].cur);
                 else
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[nextDepth].next);
 
                 compressIntraCU(subBestPartCU, subTempPartCU, nextDepth, child_cu);
                 outTempCU->copyPartFrom(subBestPartCU, child_cu, partUnitIdx, nextDepth); // Keep best part data to current temporary data.
@@ -645,7 +646,7 @@
                 outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
         }
 
-        m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+        m_rdContexts[nextDepth].next.store(m_rdContexts[depth].temp);
         checkBestMode(outBestCU, outTempCU, depth); // RD compare current CU against split
     }
 
@@ -731,9 +732,9 @@
                 subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
 
                 if (partUnitIdx) // initialize RD with previous depth buffer
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[nextDepth].next);
                 else
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[depth].cur);
 
                 // set current best CU cost to 1 marking as non-best CU by default
                 subTempPartCU->m_totalRDCost = 1;
@@ -785,7 +786,7 @@
             else
                 outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
         }
-        m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+        m_rdContexts[nextDepth].next.store(m_rdContexts[depth].temp);
         checkBestMode(outBestCU, outTempCU, depth);
     }
     outBestCU->copyToPic(depth);
@@ -836,7 +837,7 @@
     // Encode Coefficients
     bool bCodeDQP = m_bEncodeDQP;
     m_entropyCoder.codeCoeff(outTempCU, 0, depth, bCodeDQP, tuDepthRange);
-    m_entropyCoder.store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+    m_entropyCoder.store(m_rdContexts[depth].temp);
     outTempCU->m_totalBits = m_entropyCoder.getNumberOfWrittenBits();
     outTempCU->m_coeffBits = outTempCU->m_totalBits - outTempCU->m_mvBits;
 
@@ -1028,7 +1029,7 @@
                     }
                     else
                         /* inter has best cost, store RD state as next best */
-                        m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                        m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
 
                     if (slice->m_sliceType == P_SLICE 
                     // uncomment this expression to more closely match --no-pmode outputs (throw away intra if skip found)
@@ -1044,7 +1045,7 @@
                             outBestCU = m_intraInInterCU[depth];
                             std::swap(m_bestPredYuv[depth], m_modePredYuv[PRED_INTRA][depth]);
                             std::swap(m_bestRecoYuv[depth], m_bestIntraRecoYuv[depth]);
-                            m_intraContexts.store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                            m_intraContexts.store(m_rdContexts[depth].next);
                         }
                     }
                 }
@@ -1103,7 +1104,7 @@
                             std::swap(m_bestRecoYuv[depth], m_bestMergeRecoYuv[depth]);
                         }
                         else
-                            m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                            m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
                     }
 
                     /* Check for Intra in inter frames only if it is a P-slice */
@@ -1120,7 +1121,7 @@
                             uint64_t intraInInterCost, bestCost;
                             if (m_param->rdLevel > 2)
                             {
-                                encodeIntraInInter(m_intraInInterCU[depth], cu, m_origYuv[depth], m_modePredYuv[PRED_INTRA][depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+                                encodeIntraInInter(m_intraInInterCU[depth], cu, m_origYuv[depth], m_modePredYuv[PRED_INTRA][depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], m_rdContexts[depth].temp);
                                 intraInInterCost = m_rdCost.m_psyRd ? m_intraInInterCU[depth]->m_totalPsyCost : m_intraInInterCU[depth]->m_totalRDCost;
                                 bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
                             }
@@ -1135,7 +1136,7 @@
                                 std::swap(m_bestPredYuv[depth], m_modePredYuv[PRED_INTRA][depth]);
                                 std::swap(m_bestRecoYuv[depth], m_tmpRecoYuv[depth]);
                                 if (m_param->rdLevel > 2)
-                                    m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                                    m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
                             }
                         }
                     }
@@ -1158,12 +1159,12 @@
                             }
 
                             encodeResAndCalcRdInterCU(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_bestRecoYuv[depth]);
-                            m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                            m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
                         }
                         else if (outBestCU->getPredictionMode(0) == MODE_INTRA)
                         {
-                            encodeIntraInInter(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth], m_rdEntropyCoders[depth][CI_TEMP_BEST]);
-                            m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                            encodeIntraInInter(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth], m_rdContexts[depth].temp);
+                            m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
                         }
                     }
                     else if (m_param->rdLevel == 1)
@@ -1310,9 +1311,9 @@
             if (child_cu->flags & CU::PRESENT)
             {
                 if (partUnitIdx) // initialize RD with previous depth buffer
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[nextDepth].next);
                 else
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[depth].cur);
 
                 compressInterCU_rd0_4(subBestPartCU, subTempPartCU, outTempCU, nextDepth, child_cu, cu_unsplit_flag, partUnitIdx, minDepth);
 
@@ -1383,7 +1384,7 @@
                 outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
         }
 
-        m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+        m_rdContexts[nextDepth].next.store(m_rdContexts[depth].temp);
 
         /* If Best Mode is not NULL; then compare costs. Else assign best mode to Sub-CU costs
          * Copy recon data from Temp structure to Best structure */
@@ -1663,9 +1664,9 @@
                 subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
 
                 if (partUnitIdx) // initialize RD with previous depth buffer
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[nextDepth].next);
                 else
-                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+                    m_rdContexts[nextDepth].cur.load(m_rdContexts[depth].cur);
 
                 compressInterCU_rd5_6(subBestPartCU, subTempPartCU, nextDepth, child_cu);
                 outTempCU->copyPartFrom(subBestPartCU, child_cu, partUnitIdx, nextDepth); // Keep best part data to current temporary data.
@@ -1714,7 +1715,7 @@
                 outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
         }
 
-        m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+        m_rdContexts[nextDepth].next.store(m_rdContexts[depth].temp);
         checkBestMode(outBestCU, outTempCU, depth); // RD compare current CU against split
     }
     outBestCU->copyToPic(depth); // Copy Best data to Picture for next partition prediction.
@@ -1819,7 +1820,7 @@
                 // No-residue mode
                 encodeResAndCalcRdSkipCU(m_bestMergeCU[depth], m_origYuv[depth], m_modePredYuv[PRED_MERGE][depth], m_tmpRecoYuv[depth]);
                 std::swap(m_bestMergeRecoYuv[depth], m_tmpRecoYuv[depth]);
-                m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
             }
 
             // Encode with residue
@@ -1831,7 +1832,7 @@
             {
                 std::swap(m_bestMergeCU[depth], m_mergeCU[depth]);
                 std::swap(m_bestMergeRecoYuv[depth], m_tmpRecoYuv[depth]);
-                m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
             }
         }
     }
@@ -1907,7 +1908,7 @@
                         std::swap(outBestPredYuv, m_tmpPredYuv[depth]);
                         std::swap(rpcYuvReconBest, m_tmpRecoYuv[depth]);
 
-                        m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+                        m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
                     }
                     outTempCU->setQPSubParts(origQP, 0, depth);
                     outTempCU->setSkipFlagSubParts(false, 0, depth);
@@ -2409,7 +2410,7 @@
     // Encode Coefficients
     bool bCodeDQP = m_bEncodeDQP;
     m_entropyCoder.codeCoeff(outTempCU, 0, depth, bCodeDQP, tuDepthRange);
-    m_entropyCoder.store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+    m_entropyCoder.store(m_rdContexts[depth].temp);
     outTempCU->m_totalBits = m_entropyCoder.getNumberOfWrittenBits();
     outTempCU->m_coeffBits = outTempCU->m_totalBits - outTempCU->m_mvBits;
 
@@ -2436,7 +2437,7 @@
     uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
 
     // set context models
-    m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+    m_entropyCoder.load(m_rdContexts[depth].cur);
 
     m_quant.setQPforQuant(cu);
 
@@ -2629,7 +2630,7 @@
         // Change Reconstruction data
         std::swap(m_bestRecoYuv[depth], m_tmpRecoYuv[depth]);
 
-        m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
+        m_rdContexts[depth].temp.store(m_rdContexts[depth].next);
     }
 }
 
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/analysis.h
--- a/source/encoder/analysis.h	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/analysis.h	Sun Oct 05 10:40:48 2014 -0500
@@ -113,7 +113,7 @@
     Analysis();
     bool create(uint32_t totalDepth, uint32_t maxWidth, ThreadLocalData* tld);
     void destroy();
-    void compressCU(TComDataCU* cu);
+    void compressCTU(TComDataCU* ctu, const Entropy& initialContext);
 
 protected:
 
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/frameencoder.cpp	Sun Oct 05 10:40:48 2014 -0500
@@ -214,7 +214,7 @@
         m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
         if (!m_param->bEnableSAO)
             for (uint32_t i = 0; i < numSubstreams; i++)
-                m_rows[i].rdEntropyCoders[0][CI_CURR_BEST].setBitstream(&m_outStreams[i]);
+                m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
     }
     else
         for (uint32_t i = 0; i < numSubstreams; i++)
@@ -425,7 +425,7 @@
         if (m_param->bEnableWavefront && !col && lin)
         {
             m_entropyCoder.copyState(m_initSliceContext);
-            m_entropyCoder.loadContexts(m_rows[lin - 1].bufferEntropyCoder);
+            m_entropyCoder.loadContexts(m_rows[lin - 1].bufferedEntropy);
         }
 
         if (slice->m_sps->bUseSAO)
@@ -463,7 +463,7 @@
         {
             if (col == 1)
                 // Store probabilities of second CTU in line into buffer
-                m_rows[lin].bufferEntropyCoder.loadContexts(m_entropyCoder);
+                m_rows[lin].bufferedEntropy.loadContexts(m_entropyCoder);
 
             if (col == widthInLCUs - 1)
                 m_entropyCoder.finishSlice();
@@ -610,15 +610,13 @@
 
     /* When WPP is enabled, every row has its own row coder instance. Otherwise
      * they share row 0 */
-    Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rdEntropyCoders[0][CI_CURR_BEST] :
-                                                    m_rows[0].rdEntropyCoders[0][CI_CURR_BEST];
+    Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
     // setup thread-local data
     Slice *slice = m_frame->m_picSym->m_slice;
     TComPicYuv* fenc = m_frame->getPicYuvOrg();
     tld.analysis.m_quant.m_nr = m_nr;
     tld.analysis.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
     tld.analysis.m_log = &tld.analysis.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
-    tld.analysis.m_rdEntropyCoders = curRow.rdEntropyCoders;
     tld.analysis.setQP(slice, slice->m_sliceQp);
 
     int64_t startTime = x265_mdate();
@@ -661,22 +659,16 @@
                 m_frame->m_qpaAq[row] += qp;
         }
 
-        if (m_param->bEnableWavefront)
+        if (m_param->bEnableWavefront && !col && row)
         {
-            if (!col && row)
-            {
-                // Load SBAC coder context from previous row and initialize row state.
-                rowCoder.copyState(m_initSliceContext);
-                rowCoder.loadContexts(m_rows[row - 1].bufferEntropyCoder);
-            }
+            // Load SBAC coder context from previous row and initialize row state.
+            rowCoder.copyState(m_initSliceContext);
+            rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
         }
-        else if (row)
-            // load current best state from go-on entropy coder
-            curRow.rdEntropyCoders[0][CI_CURR_BEST].load(rowCoder);
 
         cu->loadCTUData(m_param->maxCUSize);
         tld.analysis.m_quant.setQPforQuant(cu);
-        tld.analysis.compressCU(cu); // Does all the CU analysis
+        tld.analysis.compressCTU(cu, rowCoder); // Does all the CU analysis
 
         /* advance top-level row coder to include the context of this CTU.
          * if SAO is disabled, rowCoder writes the final CTU bitstream */
@@ -684,7 +676,7 @@
 
         if (m_param->bEnableWavefront && col == 1)
             // Save CABAC state for next row
-            curRow.bufferEntropyCoder.loadContexts(rowCoder);
+            curRow.bufferedEntropy.loadContexts(rowCoder);
 
         // Completed CU processing
         curRow.completed++;
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/frameencoder.h	Sun Oct 05 10:40:48 2014 -0500
@@ -49,8 +49,8 @@
  * WPP is active, several rows will be simultaneously encoded. */
 struct CTURow
 {
-    Entropy           bufferEntropyCoder;  /* store context for next row */
-    Entropy           rdEntropyCoders[NUM_FULL_DEPTH][CI_NUM];
+    Entropy           bufferedEntropy;  /* store CTU2 context for next row CTU0 */
+    Entropy           rowGoOnCoder;     /* store context between CTUs, code bitstream if !SAO */
 
     FrameStats        rowStats;
 
@@ -80,10 +80,7 @@
         busy = false;
         completed = 0;
         memset(&rowStats, 0, sizeof(rowStats));
-
-        for (uint32_t depth = 0; depth <= g_maxFullDepth; depth++)
-            for (int ciIdx = 0; ciIdx < CI_NUM; ciIdx++)
-                rdEntropyCoders[depth][ciIdx].load(initContext);
+        rowGoOnCoder.load(initContext);
     }
 };
 
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/framefilter.cpp	Sun Oct 05 10:40:48 2014 -0500
@@ -116,8 +116,8 @@
     if (m_param->bEnableSAO)
     {
         m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
-        m_sao.m_rdEntropyCoders[0][CI_NEXT_BEST].load(m_frameEncoder->m_initSliceContext);
-        m_sao.m_rdEntropyCoders[0][CI_CURR_BEST].load(m_frameEncoder->m_initSliceContext);
+        m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
+        m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
 
         m_sao.rdoSaoUnitRow(saoParam, row);
 
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/sao.cpp	Sun Oct 05 10:40:48 2014 -0500
@@ -214,8 +214,8 @@
     resetStats();
 
     m_entropyCoder.load(initState);
-    m_rdEntropyCoders[0][CI_NEXT_BEST].load(initState);
-    m_rdEntropyCoders[0][CI_CURR_BEST].load(initState);
+    m_rdContexts.next.load(initState);
+    m_rdContexts.cur.load(initState);
 
     SAOParam* saoParam = pic->getPicSym()->m_saoParam;
     if (!saoParam)
@@ -1267,12 +1267,12 @@
         compDistortion[0] = 0;
         compDistortion[1] = 0;
         compDistortion[2] = 0;
-        m_entropyCoder.load(m_rdEntropyCoders[0][CI_CURR_BEST]);
+        m_entropyCoder.load(m_rdContexts.cur);
         if (allowMergeLeft)
             m_entropyCoder.codeSaoMerge(0);
         if (allowMergeUp)
             m_entropyCoder.codeSaoMerge(0);
-        m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+        m_entropyCoder.store(m_rdContexts.temp);
         // reset stats Y, Cb, Cr
         for (compIdx = 0; compIdx < 3; compIdx++)
         {
@@ -1311,7 +1311,7 @@
         if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
         {
             // Cost of new SAO_params
-            m_entropyCoder.load(m_rdEntropyCoders[0][CI_CURR_BEST]);
+            m_entropyCoder.load(m_rdContexts.cur);
             m_entropyCoder.resetBits();
             if (allowMergeLeft)
                 m_entropyCoder.codeSaoMerge(0);
@@ -1325,14 +1325,14 @@
 
             rate = m_entropyCoder.getNumberOfWrittenBits();
             bestCost = compDistortion[0] + (double)rate;
-            m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+            m_entropyCoder.store(m_rdContexts.temp);
 
             // Cost of Merge
             for (int mergeUp = 0; mergeUp < 2; ++mergeUp)
             {
                 if ((allowMergeLeft && !mergeUp) || (allowMergeUp && mergeUp))
                 {
-                    m_entropyCoder.load(m_rdEntropyCoders[0][CI_CURR_BEST]);
+                    m_entropyCoder.load(m_rdContexts.cur);
                     m_entropyCoder.resetBits();
                     if (allowMergeLeft)
                         m_entropyCoder.codeSaoMerge(1 - mergeUp);
@@ -1344,7 +1344,7 @@
                     if (mergeCost < bestCost)
                     {
                         bestCost = mergeCost;
-                        m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+                        m_entropyCoder.store(m_rdContexts.temp);
                         for (compIdx = 0; compIdx < 3; compIdx++)
                         {
                             mergeSaoParam[compIdx][mergeUp].mergeLeftFlag = !mergeUp;
@@ -1360,8 +1360,8 @@
                 m_numNoSao[0]++;
             if (saoParam->ctuParam[1][addr].typeIdx < 0)
                 m_numNoSao[1] += 2;
-            m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
-            m_entropyCoder.store(m_rdEntropyCoders[0][CI_CURR_BEST]);
+            m_entropyCoder.load(m_rdContexts.temp);
+            m_entropyCoder.store(m_rdContexts.cur);
         }
     }
 }
@@ -1466,7 +1466,7 @@
     int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
     double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
 
-    m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+    m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.resetBits();
     m_entropyCoder.codeSaoOffset(&ctuParamRdo, 0);
     dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
@@ -1507,7 +1507,7 @@
         for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
             ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.subTypeIdx + 1];
 
-        m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+        m_entropyCoder.load(m_rdContexts.temp);
         m_entropyCoder.resetBits();
         m_entropyCoder.codeSaoOffset(&ctuParamRdo, 0);
 
@@ -1523,9 +1523,9 @@
     }
 
     compDistortion[0] += ((double)bestDist / m_lumaLambda);
-    m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+    m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.codeSaoOffset(lclCtuParam, 0);
-    m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+    m_entropyCoder.store(m_rdContexts.temp);
 
     // merge left or merge up
 
@@ -1593,7 +1593,7 @@
     int    bestClassTableBo[2] = { 0, 0 };
     int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
 
-    m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+    m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.resetBits();
     m_entropyCoder.codeSaoOffset(&ctuParamRdo[0], 1);
     m_entropyCoder.codeSaoOffset(&ctuParamRdo[1], 2);
@@ -1637,7 +1637,7 @@
             estDist[1] = estSaoTypeDist(2, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
         }
 
-        m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+        m_entropyCoder.load(m_rdContexts.temp);
         m_entropyCoder.resetBits();
 
         for (int compIdx = 0; compIdx < 2; compIdx++)
@@ -1666,10 +1666,10 @@
     }
 
     distortion[0] += ((double)bestDist / m_chromaLambda);
-    m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+    m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.codeSaoOffset(lclCtuParam[0], 1);
     m_entropyCoder.codeSaoOffset(lclCtuParam[1], 2);
-    m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
+    m_entropyCoder.store(m_rdContexts.temp);
 
     // merge left or merge up
 
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/sao.h
--- a/source/encoder/sao.h	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/sao.h	Sun Oct 05 10:40:48 2014 -0500
@@ -95,9 +95,16 @@
 
 public:
 
+    struct SAOContexts /* CABAC states saved and restored around SAO RD measurements */
+    {
+        Entropy cur;  /* reloaded before each merge candidate is costed; refreshed from temp once the CTU decision is made */
+        Entropy next; /* NOTE(review): not referenced in this patch's SAO hunks; occupies the old CI_NEXT_BEST slot - confirm it is still needed */
+        Entropy temp; /* state after coding the best SAO parameters so far; also the starting point of each offset-type trial */
+    };
+
     Frame*      m_pic;
-    Entropy     m_rdEntropyCoders[5][CI_NUM_SAO];
     Entropy     m_entropyCoder;
+    SAOContexts m_rdContexts;
 
     x265_param* m_param;
     int         m_refDepth;
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/search.cpp	Sun Oct 05 10:40:48 2014 -0500
@@ -47,7 +47,6 @@
 
     m_numLayers = 0;
     m_param = NULL;
-    m_rdEntropyCoders = NULL;
 }
 
 Search::~Search()
@@ -478,7 +477,7 @@
         if (checkTransformSkip || checkTQbypass)
         {
             // store original entropy coding status
-            m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.store(m_rdContexts[fullDepth].rqtRoot);
 
             uint32_t  singleDistYTmp = 0;
             uint32_t  singlePsyEnergyYTmp = 0;
@@ -535,10 +534,10 @@
                     bestTQbypass = singleTQbypass;
                     bestModeId   = modeId;
                     if (bestModeId == firstCheckId)
-                        m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
+                        m_entropyCoder.store(m_rdContexts[fullDepth].temp);
                 }
                 if (modeId == firstCheckId)
-                    m_entropyCoder.load(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+                    m_entropyCoder.load(m_rdContexts[fullDepth].rqtRoot);
             }
 
             cu->setTransformSkipSubParts(checkTransformSkip ? bestModeId : 0, TEXT_LUMA, absPartIdx, fullDepth);
@@ -549,7 +548,7 @@
             {
                 xLoadIntraResultQT(cu, cuData, absPartIdx, log2TrSize, reconQt, reconQtStride);
                 cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
-                m_entropyCoder.load(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
+                m_entropyCoder.load(m_rdContexts[fullDepth].temp);
             }
             else
             {
@@ -560,7 +559,7 @@
         }
         else
         {
-            m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.store(m_rdContexts[fullDepth].rqtRoot);
 
             // code luma block with given intra prediction mode and store Cbf
             cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
@@ -589,11 +588,11 @@
         // store full entropy coding status, load original entropy coding status
         if (bCheckFull)
         {
-            m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_TEST]);
-            m_entropyCoder.load(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.store(m_rdContexts[fullDepth].rqtTest);
+            m_entropyCoder.load(m_rdContexts[fullDepth].rqtRoot);
         }
         else
-            m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.store(m_rdContexts[fullDepth].rqtRoot);
 
         // code splitted block
         uint64_t splitCost     = 0;
@@ -636,7 +635,7 @@
         }
 
         // set entropy coding status
-        m_entropyCoder.load(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_TEST]);
+        m_entropyCoder.load(m_rdContexts[fullDepth].rqtTest);
 
         // set transform index and Cbf values
         cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
@@ -926,7 +925,7 @@
                 if (checkTransformSkip)
                 {
                     // use RDO to decide whether Cr/Cb takes TS
-                    m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+                    m_entropyCoder.store(m_rdContexts[fullDepth].rqtRoot);
 
                     uint64_t singleCost     = MAX_INT64;
                     int      bestModeId     = 0;
@@ -976,17 +975,17 @@
                             singleCbfC  = singleCbfCTmp;
                             singlePsyEnergy = singlePsyEnergyTmp;
                             if (bestModeId == firstCheckId)
-                                m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
+                                m_entropyCoder.store(m_rdContexts[fullDepth].temp);
                         }
                         if (chromaModeId == firstCheckId)
-                            m_entropyCoder.load(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+                            m_entropyCoder.load(m_rdContexts[fullDepth].rqtRoot);
                     }
 
                     if (bestModeId == firstCheckId)
                     {
                         xLoadIntraResultChromaQT(cu, cuData, absPartIdxC, log2TrSizeC, chromaId, reconQt, reconQtStride);
                         cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
-                        m_entropyCoder.load(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
+                        m_entropyCoder.load(m_rdContexts[fullDepth].temp);
                     }
                     else
                     {
@@ -1000,7 +999,7 @@
                     outDist += singleDistC;
 
                     if (chromaId == 1)
-                        m_entropyCoder.store(m_rdEntropyCoders[fullDepth][CI_QT_TRAFO_ROOT]);
+                        m_entropyCoder.store(m_rdContexts[fullDepth].rqtRoot);
                 }
                 else
                 {
@@ -1349,7 +1348,7 @@
         {
             if (candCostList[i] == MAX_INT64)
                 break;
-            m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+            m_entropyCoder.load(m_rdContexts[depth].cur);
             cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
             cost = bits = 0;
             uint32_t psyEnergy = 0;
@@ -1359,7 +1358,7 @@
 
         /* remeasure best mode, allowing TU splits */
         cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+        m_entropyCoder.load(m_rdContexts[depth].cur);
 
         uint32_t psyEnergy = 0;
         // update distortion (rate and r-d costs are determined later)
@@ -1389,7 +1388,7 @@
     }
 
     // reset context models
-    m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+    m_entropyCoder.load(m_rdContexts[depth].cur);
 
     x265_emms();
 }
@@ -1414,7 +1413,7 @@
         cu->setLumaIntraDirSubParts(sharedModes[pu], partOffset, depth + initTrDepth);
 
         // set context models
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+        m_entropyCoder.load(m_rdContexts[depth].cur);
 
         uint32_t psyEnergy = 0;
         // update overall distortion (rate and r-d costs are determined later)
@@ -1447,7 +1446,7 @@
     }
 
     // reset context models
-    m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+    m_entropyCoder.load(m_rdContexts[depth].cur);
 }
 
 void Search::getBestIntraModeChroma(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv)
@@ -1533,7 +1532,7 @@
         for (uint32_t mode = minMode; mode < maxMode; mode++)
         {
             // restore context models
-            m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+            m_entropyCoder.load(m_rdContexts[depth].cur);
 
             // chroma coding
             cu->setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
@@ -1542,7 +1541,7 @@
             uint32_t dist = xRecurIntraChromaCodingQT(cu, cuData, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, psyEnergy);
 
             if (cu->m_slice->m_pps->bTransformSkipEnabled)
-                m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+                m_entropyCoder.load(m_rdContexts[depth].cur);
 
             uint32_t bits = xGetIntraBitsQTChroma(cu, cuData, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep);
             uint64_t cost = 0; 
@@ -1608,7 +1607,7 @@
         }
     }
 
-    m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+    m_entropyCoder.load(m_rdContexts[depth].cur);
 }
 
 /* estimation of best merge coding */
@@ -2069,7 +2068,7 @@
     cu->m_totalDistortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(), fencYuv->getCStride(), outReconYuv->getCbAddr(), outReconYuv->getCStride()));
     cu->m_totalDistortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(), fencYuv->getCStride(), outReconYuv->getCrAddr(), outReconYuv->getCStride()));
 
-    m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+    m_entropyCoder.load(m_rdContexts[depth].cur);
     m_entropyCoder.resetBits();
     if (cu->m_slice->m_pps->bTransquantBypassEnabled)
         m_entropyCoder.codeCUTransquantBypassFlag(cu->getCUTransquantBypass(0));
@@ -2090,7 +2089,7 @@
     else
         cu->m_totalRDCost = m_rdCost.calcRdCost(cu->m_totalDistortion, cu->m_totalBits);
 
-    m_entropyCoder.store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+    m_entropyCoder.store(m_rdContexts[depth].temp);
 }
 
 /** encode residual and calculate rate-distortion for a CU block */
@@ -2133,7 +2132,7 @@
         bool bIsLosslessMode = bIsTQBypassEnable && !modeId;
 
         cu->setCUTransquantBypassSubParts(bIsLosslessMode, 0, depth);
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+        m_entropyCoder.load(m_rdContexts[depth].cur);
 
         uint64_t cost = 0;
         uint32_t zeroDistortion = 0;
@@ -2178,7 +2177,7 @@
         else
             xSetResidualQTData(cu, 0, NULL, depth, false);
 
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+        m_entropyCoder.load(m_rdContexts[depth].cur);
 
         bits = getInterSymbolBits(cu, tuDepthRange);
 
@@ -2196,7 +2195,7 @@
             bestBits = bits;
             bestCost = cost;
             bestCoeffBits = cu->m_coeffBits;
-            m_entropyCoder.store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+            m_entropyCoder.store(m_rdContexts[depth].temp);
         }
     }
 
@@ -2205,12 +2204,12 @@
     if (bIsTQBypassEnable && !bestMode)
     {
         cu->setCUTransquantBypassSubParts(true, 0, depth);
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+        m_entropyCoder.load(m_rdContexts[depth].cur);
         uint64_t cost = 0;
         uint32_t bits = 0;
         xEstimateResidualQT(cu, cuData, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, NULL, tuDepthRange);
         xSetResidualQTData(cu, 0, NULL, depth, false);
-        m_entropyCoder.store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+        m_entropyCoder.store(m_rdContexts[depth].temp);
     }
 
     if (cu->getQtRootCbf(0))
@@ -2451,7 +2450,7 @@
 
     uint32_t bestCBF[MAX_NUM_COMPONENT];
     uint32_t bestsubTUCBF[MAX_NUM_COMPONENT][2];
-    m_entropyCoder.store(m_rdEntropyCoders[depth][CI_QT_TRAFO_ROOT]);
+    m_entropyCoder.store(m_rdContexts[depth].rqtRoot);
 
     uint32_t trSize = 1 << log2TrSize;
     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
@@ -2834,7 +2833,7 @@
             ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
             ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);
 
-            m_entropyCoder.load(m_rdEntropyCoders[depth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.load(m_rdContexts[depth].rqtRoot);
 
             cu->setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
 
@@ -2897,7 +2896,7 @@
             uint64_t singleCostU = MAX_INT64;
             uint64_t singleCostV = MAX_INT64;
 
-            m_entropyCoder.load(m_rdEntropyCoders[depth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.load(m_rdContexts[depth].rqtRoot);
 
             TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
 
@@ -3021,7 +3020,7 @@
             while (tuIterator.isNextSection());
         }
 
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_QT_TRAFO_ROOT]);
+        m_entropyCoder.load(m_rdContexts[depth].rqtRoot);
 
         m_entropyCoder.resetBits();
 
@@ -3105,8 +3104,8 @@
     {
         if (bCheckFull)
         {
-            m_entropyCoder.store(m_rdEntropyCoders[depth][CI_QT_TRAFO_TEST]);
-            m_entropyCoder.load(m_rdEntropyCoders[depth][CI_QT_TRAFO_ROOT]);
+            m_entropyCoder.store(m_rdContexts[depth].rqtTest);
+            m_entropyCoder.load(m_rdContexts[depth].rqtRoot);
         }
         uint32_t subdivDist = 0;
         uint32_t subdivBits = 0;
@@ -3152,7 +3151,7 @@
             cu->getCbf(TEXT_CHROMA_V)[absPartIdx + i] |= vcbf << trMode;
         }
 
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_QT_TRAFO_ROOT]);
+        m_entropyCoder.load(m_rdContexts[depth].rqtRoot);
         m_entropyCoder.resetBits();
 
         xEncodeResidualQT(cu, absPartIdx, depth, true,  TEXT_LUMA, depthRange);
@@ -3196,7 +3195,7 @@
             }
         }
         X265_CHECK(bCheckFull, "check-full must be set\n");
-        m_entropyCoder.load(m_rdEntropyCoders[depth][CI_QT_TRAFO_TEST]);
+        m_entropyCoder.load(m_rdContexts[depth].rqtTest);
     }
 
     rdCost += singleCost;
@@ -3411,7 +3410,7 @@
 uint32_t Search::getIntraModeBits(TComDataCU* cu, uint32_t mode, uint32_t partOffset, uint32_t depth)
 {
     // Reload only contexts required for coding intra mode information
-    m_entropyCoder.loadIntraDirModeLuma(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+    m_entropyCoder.loadIntraDirModeLuma(m_rdContexts[depth].cur);
 
     cu->getLumaIntraDir()[partOffset] = (uint8_t)mode;
 
diff -r 6f5e14b8c57b -r 592988125077 source/encoder/search.h
--- a/source/encoder/search.h	Sun Oct 05 10:28:25 2014 -0500
+++ b/source/encoder/search.h	Sun Oct 05 10:40:48 2014 -0500
@@ -45,6 +45,19 @@
 
 class Entropy;
 
+/* All the CABAC contexts that Analysis needs to keep track of at each depth (Search holds one set per depth) */
+struct RDContexts
+{
+    /* Note: once Analysis uses Mode structs, next and temp go away:
+     * temp becomes mode->context and next becomes bestMode->context */
+
+    Entropy cur;     /* input context for current CU; reloaded before each candidate mode is measured */
+    Entropy next;    /* current best output context for current CU */
+    Entropy temp;    /* output context of the most recently measured mode */
+    Entropy rqtRoot; /* residual quad-tree start context; reloaded between TU candidates */
+    Entropy rqtTest; /* residual quad-tree test context: state after the unsplit TU, held while the split is tried */
+};
+
 inline int getTUBits(int idx, int numIdx)
 {
     return idx + (idx < numIdx - 1);
@@ -62,7 +75,7 @@
     x265_param*     m_param;
 
     Entropy         m_entropyCoder;
-    Entropy       (*m_rdEntropyCoders)[CI_NUM];
+    RDContexts      m_rdContexts[NUM_FULL_DEPTH];
 
     TComYuv         m_predTempYuv;
     TComYuv         m_bidirPredYuv[2];