[x265] [PATCH 1 of 2] TComDataCU: replace getZorderIdxInCU() with encodeIdx of CU structure

santhoshini at multicorewareinc.com santhoshini at multicorewareinc.com
Mon Sep 29 12:02:59 CEST 2014


# HG changeset patch
# User Santhoshini Sekar <santhoshini at multicorewareinc.com>
# Date 1411972892 -19800
#      Mon Sep 29 12:11:32 2014 +0530
# Node ID ed887d8ae5cd24b0c2317fb83b3c908be27e037a
# Parent  32f50df7fa7672f4c1818ddf3165b4bd243e0b10
TComDataCU: replace getZorderIdxInCU() with encodeIdx of CU structure

diff -r 32f50df7fa76 -r ed887d8ae5cd source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/Lib/TLibCommon/TComDataCU.cpp	Mon Sep 29 12:11:32 2014 +0530
@@ -387,16 +387,15 @@
 }
 
 // initialize Sub partition
-void TComDataCU::initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp)
+void TComDataCU::initSubCU(TComDataCU* cu, CU* cuData, uint32_t partUnitIdx, uint32_t depth, int qp)
 {
     X265_CHECK(partUnitIdx < 4, "part unit should be less than 4\n");
     uint8_t log2CUSize = g_maxLog2CUSize - depth;
-    uint32_t partOffset = (cu->getTotalNumPart() >> 2) * partUnitIdx;
 
     m_pic              = cu->m_pic;
     m_slice            = cu->m_slice;
     m_cuAddr           = cu->getAddr();
-    m_absIdxInLCU      = cu->getZorderIdxInCU() + partOffset;
+    m_absIdxInLCU      = cuData->encodeIdx;
 
     m_cuPelX           = cu->getCUPelX() + ((partUnitIdx &  1) << log2CUSize);
     m_cuPelY           = cu->getCUPelY() + ((partUnitIdx >> 1) << log2CUSize);
@@ -453,7 +452,7 @@
     m_cuAboveRight  = cu->getCUAboveRight();
 }
 
-void TComDataCU::copyToSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth)
+void TComDataCU::copyToSubCU(TComDataCU* cu, CU* cuData, uint32_t partUnitIdx, uint32_t depth)
 {
     X265_CHECK(partUnitIdx < 4, "part unit should be less than 4\n");
 
@@ -462,7 +461,7 @@
     m_pic              = cu->m_pic;
     m_slice            = cu->m_slice;
     m_cuAddr           = cu->getAddr();
-    m_absIdxInLCU      = cu->getZorderIdxInCU() + partOffset;
+    m_absIdxInLCU      = cuData->encodeIdx + partOffset;
 
     m_cuPelX           = cu->getCUPelX() + ((partUnitIdx &  1) << (g_maxLog2CUSize - depth));
     m_cuPelY           = cu->getCUPelY() + ((partUnitIdx >> 1) << (g_maxLog2CUSize - depth));
@@ -1067,9 +1066,9 @@
     }
     else
     {
-        if (getZorderIdxInCU() > 0)
+        if (m_pic->getCU(m_cuAddr)->m_CULocalData->encodeIdx > 0)
         {
-            return m_pic->getCU(getAddr())->getLastCodedQP(getZorderIdxInCU());
+            return m_pic->getCU(getAddr())->getLastCodedQP(m_pic->getCU(m_cuAddr)->m_CULocalData->encodeIdx);
         }
         else if (getAddr() > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled &&
                                     getAddr() % m_pic->getFrameWidthInCU() == 0))
diff -r 32f50df7fa76 -r ed887d8ae5cd source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.h	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/Lib/TLibCommon/TComDataCU.h	Mon Sep 29 12:11:32 2014 +0530
@@ -273,9 +273,9 @@
 
     void          initCU(Frame* pic, uint32_t cuAddr);
     void          initEstData();
-    void          initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp);
+    void          initSubCU(TComDataCU* cu, CU* cuData, uint32_t partUnitIdx, uint32_t depth, int qp);
 
-    void          copyToSubCU(TComDataCU* lcu, uint32_t partUnitIdx, uint32_t depth);
+    void          copyToSubCU(TComDataCU* lcu, CU* cuData, uint32_t partUnitIdx, uint32_t depth);
     void          copyPartFrom(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, bool isRDObasedAnalysis = true);
 
     void          copyToPic(uint32_t depth);
@@ -288,8 +288,6 @@
 
     uint32_t&     getAddr()                        { return m_cuAddr; }
 
-    uint32_t&     getZorderIdxInCU()               { return m_absIdxInLCU; }
-
     uint32_t      getSCUAddr() const               { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInLCU; }
 
 
diff -r 32f50df7fa76 -r ed887d8ae5cd source/Lib/TLibCommon/TComPattern.cpp
--- a/source/Lib/TLibCommon/TComPattern.cpp	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/Lib/TLibCommon/TComPattern.cpp	Mon Sep 29 12:11:32 2014 +0530
@@ -49,7 +49,7 @@
 // Public member functions (TComPattern)
 // ====================================================================================================================
 
-void TComPattern::initAdiPattern(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
+void TComPattern::initAdiPattern(TComDataCU* cu, CU* cuData, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
                                  pixel* refAbove, pixel* refLeft, pixel* refAboveFlt, pixel* refLeftFlt, int dirMode)
 {
     IntraNeighbors intraNeighbors;
@@ -58,7 +58,7 @@
     uint32_t tuSize = intraNeighbors.tuSize;
     uint32_t tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
+    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), cuData->encodeIdx + zOrderIdxInPart);
     int picStride = cu->m_pic->getStride();
 
     fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
@@ -130,14 +130,14 @@
     }
 }
 
-void TComPattern::initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf, uint32_t chromaId)
+void TComPattern::initAdiPatternChroma(TComDataCU* cu, CU* cuData, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf, uint32_t chromaId)
 {
     IntraNeighbors intraNeighbors;
 
     initIntraNeighbors(cu, zOrderIdxInPart, partDepth, false, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
 
-    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
+    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cuData->encodeIdx + zOrderIdxInPart);
     int picStride = cu->m_pic->getCStride();
     pixel* adiRef = getAdiChromaBuf(chromaId, tuSize, adiBuf);
 
diff -r 32f50df7fa76 -r ed887d8ae5cd source/Lib/TLibCommon/TComPattern.h
--- a/source/Lib/TLibCommon/TComPattern.h	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/Lib/TLibCommon/TComPattern.h	Mon Sep 29 12:11:32 2014 +0530
@@ -53,6 +53,7 @@
 
 class TComDataCU;
 
+struct CU;
 struct IntraNeighbors
 {
     int  numIntraNeighbor;
@@ -82,12 +83,12 @@
     // -------------------------------------------------------------------------------------------------------------------
 
     /// set parameters from pixel buffers for accessing neighboring pixels
-    static void initAdiPattern(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
+    static void initAdiPattern(TComDataCU* cu, CU* cuData, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
                                pixel* refAbove, pixel* refLeft,
                                pixel* refAboveFlt, pixel* refLeftFlt, int dirMode);
 
     /// set chroma parameters from CU data for accessing ADI data
-    static void initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth,
+    static void initAdiPatternChroma(TComDataCU* cu, CU* cuData, uint32_t zOrderIdxInPart, uint32_t partDepth,
                                      pixel* adiBuf, uint32_t chromaId);
 
     static void initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
diff -r 32f50df7fa76 -r ed887d8ae5cd source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/encoder/analysis.cpp	Mon Sep 29 12:11:32 2014 +0530
@@ -282,7 +282,7 @@
                 cu->childIdx = child_idx;
                 cu->offset[0] = sb_x * blockSize;
                 cu->offset[1] = sb_y * blockSize;
-                cu->encodeIdx = getDepthScanIdx(cu->offset[0] >> 3, cu->offset[1] >> 3, b8Width);
+                cu->encodeIdx = getDepthScanIdx(cu->offset[0] >> 3, cu->offset[1] >> 3, b8Width) * 4;
                 cu->flags = 0;
 
                 CU_SET_FLAG(cu->flags, CU::PRESENT, present_flag);
@@ -421,7 +421,7 @@
     //PPAScopeEvent(CompressIntraCU + depth);
     Frame* pic = outBestCU->m_pic;
     uint32_t cuAddr = outBestCU->getAddr();
-    uint32_t absPartIdx = outBestCU->getZorderIdxInCU();
+    uint32_t absPartIdx = cu->encodeIdx;
 
     if (depth == 0)
         // get original YUV data from picture
@@ -471,10 +471,10 @@
         {
             CU *child_cu = pic->getCU(cuAddr)->m_CULocalData + cu->childIdx + partUnitIdx;
             int qp = outTempCU->getQP(0);
-            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+            subBestPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
             if (child_cu->flags & CU::PRESENT)
             {
-                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+                subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
                 if (0 == partUnitIdx) //initialize RD with previous depth buffer
                     m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
                 else
@@ -563,15 +563,15 @@
     int32_t ctuToDepthIndex = g_maxCUDepth - 1;
 
     if (depth)
-        m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
+        m_origYuv[0]->copyPartToYuv(m_origYuv[depth], cu->encodeIdx);
     else
-        m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
+        m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), cu->encodeIdx);
 
     Slice* slice = outTempCU->m_slice;
     int32_t cu_split_flag = !(cu->flags & CU::LEAF);
     int32_t cu_unsplit_flag = !(cu->flags & CU::SPLIT_MANDATORY);
 
-    if (cu_unsplit_flag && ((zOrder == outBestCU->getZorderIdxInCU()) && (depth == sharedDepth[zOrder])))
+    if (cu_unsplit_flag && ((zOrder == cu->encodeIdx) && (depth == sharedDepth[zOrder])))
     {
         m_quant.setQPforQuant(outTempCU);
         checkIntra(outTempCU, (PartSize)sharedPartSizes[zOrder], cu, &sharedModes[zOrder]);
@@ -606,10 +606,10 @@
         {
             CU *child_cu = pic->getCU(outTempCU->getAddr())->m_CULocalData + cu->childIdx + partUnitIdx;
             int qp = outTempCU->getQP(0);
-            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+            subBestPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
             if (child_cu->flags & CU::PRESENT)
             {
-                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+                subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
 
                 if (partUnitIdx) // initialize RD with previous depth buffer
                     m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
@@ -672,7 +672,7 @@
     outBestCU->copyToPic(depth);
     if (!cu_unsplit_flag)
         return;
-    m_bestRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
+    m_bestRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), outBestCU->getAddr(), cu->encodeIdx);
 
 #if CHECKED_BUILD || _DEBUG
     X265_CHECK(outBestCU->getPartitionSize(0) != SIZE_NONE, "no best partition size\n");
@@ -700,11 +700,11 @@
     outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
 
     if (sharedModes)
-        sharedEstIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes);
+        sharedEstIntraPredQT(outTempCU, cu, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes);
     else
-        estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
+        estIntraPredQT(outTempCU, cu, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
 
-    estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
+    estIntraPredChromaQT(outTempCU, cu, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
 
     m_entropyCoder->resetBits();
     if (outTempCU->m_slice->m_pps->bTransquantBypassEnabled)
@@ -740,7 +740,7 @@
 {
     Frame* pic = outTempCU->m_pic;
     uint32_t cuAddr = outTempCU->getAddr();
-    uint32_t absPartIdx = outTempCU->getZorderIdxInCU();
+    uint32_t absPartIdx = cu->encodeIdx;
 
     if (depth)
         // copy partition YUV from depth 0 CTU cache
@@ -801,12 +801,12 @@
             /* Initialise all Mode-CUs based on parentCU */
             if (depth)
             {
-                m_interCU_2Nx2N[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
-                m_interCU_2NxN[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
-                m_interCU_Nx2N[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
-                m_intraInInterCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
-                m_mergeCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
-                m_bestMergeCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
+                m_interCU_2Nx2N[depth]->initSubCU(parentCU, cu, PartitionIndex, depth, qp);
+                m_interCU_2NxN[depth]->initSubCU(parentCU, cu, PartitionIndex, depth, qp);
+                m_interCU_Nx2N[depth]->initSubCU(parentCU, cu, PartitionIndex, depth, qp);
+                m_intraInInterCU[depth]->initSubCU(parentCU, cu, PartitionIndex, depth, qp);
+                m_mergeCU[depth]->initSubCU(parentCU, cu, PartitionIndex, depth, qp);
+                m_bestMergeCU[depth]->initSubCU(parentCU, cu, PartitionIndex, depth, qp);
             }
             else
             {
@@ -819,7 +819,7 @@
             }
 
             /* Compute Merge Cost */
-            checkMerge2Nx2N_rd0_4(m_bestMergeCU[depth], m_mergeCU[depth], m_modePredYuv[3][depth], m_bestMergeRecoYuv[depth]);
+            checkMerge2Nx2N_rd0_4(m_bestMergeCU[depth], m_mergeCU[depth], cu, m_modePredYuv[3][depth], m_bestMergeRecoYuv[depth]);
             bool earlyskip = false;
             if (m_param->rdLevel >= 1)
                 earlyskip = (m_param->bEnableEarlySkip && m_bestMergeCU[depth]->isSkipped(0));
@@ -827,7 +827,7 @@
             if (!earlyskip)
             {
                 /* Compute 2Nx2N mode costs */
-                checkInter_rd0_4(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N);
+                checkInter_rd0_4(m_interCU_2Nx2N[depth], cu, m_modePredYuv[0][depth], SIZE_2Nx2N);
 
                 /* initialise outBestCU to 2Nx2N */
                 outBestCU = m_interCU_2Nx2N[depth];
@@ -836,8 +836,8 @@
                 /* Compute Rect costs */
                 if (m_param->bEnableRectInter)
                 {
-                    checkInter_rd0_4(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
-                    checkInter_rd0_4(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
+                    checkInter_rd0_4(m_interCU_Nx2N[depth], cu, m_modePredYuv[1][depth], SIZE_Nx2N);
+                    checkInter_rd0_4(m_interCU_2NxN[depth], cu, m_modePredYuv[2][depth], SIZE_2NxN);
                     if (m_interCU_Nx2N[depth]->m_sa8dCost < outBestCU->m_sa8dCost)
                     {
                         outBestCU = m_interCU_Nx2N[depth];
@@ -856,11 +856,11 @@
                     int numPart = outBestCU->getNumPartInter();
                     for (int partIdx = 0; partIdx < numPart; partIdx++)
                     {
-                        prepMotionCompensation(outBestCU, partIdx);
+                        prepMotionCompensation(outBestCU, cu, partIdx);
                         motionCompensation(m_bestPredYuv[depth], false, true);
                     }
 
-                    encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],
+                    encodeResAndCalcRdInterCU(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],
                                               m_bestResiYuv[depth], m_bestRecoYuv[depth]);
                     uint64_t bestMergeCost = m_rdCost.m_psyRd ? m_bestMergeCU[depth]->m_totalPsyCost : m_bestMergeCU[depth]->m_totalRDCost;
                     uint64_t bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
@@ -885,11 +885,11 @@
 
                     if (bdoIntra)
                     {
-                        checkIntraInInter_rd0_4(m_intraInInterCU[depth], SIZE_2Nx2N);
+                        checkIntraInInter_rd0_4(m_intraInInterCU[depth], cu, SIZE_2Nx2N);
                         uint64_t intraInInterCost, bestCost;
                         if (m_param->rdLevel > 2)
                         {
-                            encodeIntraInInter(m_intraInInterCU[depth], m_origYuv[depth], m_modePredYuv[5][depth],
+                            encodeIntraInInter(m_intraInInterCU[depth], cu, m_origYuv[depth], m_modePredYuv[5][depth],
                                                m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
                             intraInInterCost = m_rdCost.m_psyRd ? m_intraInInterCU[depth]->m_totalPsyCost : m_intraInInterCU[depth]->m_totalRDCost;
                             bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
@@ -922,17 +922,17 @@
                         int numPart = outBestCU->getNumPartInter();
                         for (int partIdx = 0; partIdx < numPart; partIdx++)
                         {
-                            prepMotionCompensation(outBestCU, partIdx);
+                            prepMotionCompensation(outBestCU, cu, partIdx);
                             motionCompensation(m_bestPredYuv[depth], false, true);
                         }
 
-                        encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],
+                        encodeResAndCalcRdInterCU(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],
                                                   m_bestResiYuv[depth], m_bestRecoYuv[depth]);
                         m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
                     }
                     else if (outBestCU->getPredictionMode(0) == MODE_INTRA)
                     {
-                        encodeIntraInInter(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],  m_bestRecoYuv[depth]);
+                        encodeIntraInInter(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],  m_bestRecoYuv[depth]);
                         m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
                     }
                 }
@@ -949,15 +949,15 @@
                         int numPart = outBestCU->getNumPartInter();
                         for (int partIdx = 0; partIdx < numPart; partIdx++)
                         {
-                            prepMotionCompensation(outBestCU, partIdx);
+                            prepMotionCompensation(outBestCU, cu, partIdx);
                             motionCompensation(m_bestPredYuv[depth], false, true);
                         }
 
                         m_tmpResiYuv[depth]->subtract(m_origYuv[depth], m_bestPredYuv[depth], outBestCU->getLog2CUSize(0));
-                        generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]);
+                        generateCoeffRecon(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]);
                     }
                     else
-                        generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]);
+                        generateCoeffRecon(outBestCU, cu, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]);
                 }
                 else if (!m_param->rdLevel)
                 {
@@ -966,7 +966,7 @@
                         int numPart = outBestCU->getNumPartInter();
                         for (int partIdx = 0; partIdx < numPart; partIdx++)
                         {
-                            prepMotionCompensation(outBestCU, partIdx);
+                            prepMotionCompensation(outBestCU, cu, partIdx);
                             motionCompensation(m_bestPredYuv[depth], false, true);
                         }
                     }
@@ -1074,7 +1074,7 @@
             CU *child_cu = pic->getCU(cuAddr)->m_CULocalData + cu->childIdx + partUnitIdx;
 
             TComDataCU* subBestPartCU = NULL;
-            subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+            subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
 
             if (child_cu->flags & CU::PRESENT)
             {
@@ -1188,7 +1188,7 @@
     outBestCU->copyToPic(depth);
 
     if (!m_param->rdLevel && !depth)
-        encodeResidue(outBestCU, outBestCU, 0, 0);
+        encodeResidue(outBestCU, outBestCU, cu, 0, 0);
     else if (m_param->rdLevel)
     {
         /* Copy Yuv data to picture Yuv */
@@ -1230,7 +1230,7 @@
 
     Frame* pic = outBestCU->m_pic;
     uint32_t cuAddr = outBestCU->getAddr();
-    uint32_t absPartIdx = outBestCU->getZorderIdxInCU();
+    uint32_t absPartIdx = cu->encodeIdx;
 
     if (depth)
         // copy partition YUV from depth 0 CTU cache
@@ -1256,14 +1256,14 @@
         if (slice->m_sliceType != I_SLICE)
         {
             // by Merge for inter_2Nx2N
-            checkMerge2Nx2N_rd5_6(outBestCU, outTempCU, &earlyDetectionSkipMode, m_bestPredYuv[depth], m_bestRecoYuv[depth]);
+            checkMerge2Nx2N_rd5_6(outBestCU, outTempCU, cu, &earlyDetectionSkipMode, m_bestPredYuv[depth], m_bestRecoYuv[depth]);
 
             outTempCU->initEstData();
 
             if (!m_param->bEnableEarlySkip)
             {
                 // 2Nx2N, NxN
-                checkInter_rd5_6(outBestCU, outTempCU, SIZE_2Nx2N);
+                checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2Nx2N);
                 outTempCU->initEstData();
                 if (m_param->bEnableCbfFastMode)
                     doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
@@ -1280,7 +1280,7 @@
                 // 2Nx2N, NxN
                 if (cu->log2CUSize != 3 && depth == g_maxCUDepth && doNotBlockPu)
                 {
-                    checkInter_rd5_6(outBestCU, outTempCU, SIZE_NxN);
+                    checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_NxN);
                     outTempCU->initEstData();
                 }
 
@@ -1289,14 +1289,14 @@
                     // 2NxN, Nx2N
                     if (doNotBlockPu)
                     {
-                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_Nx2N);
+                        checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_Nx2N);
                         outTempCU->initEstData();
                         if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_Nx2N)
                             doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
                     }
                     if (doNotBlockPu)
                     {
-                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxN);
+                        checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2NxN);
                         outTempCU->initEstData();
                         if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxN)
                             doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
@@ -1316,14 +1316,14 @@
                     {
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnU);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2NxnU);
                             outTempCU->initEstData();
                             if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
                                 doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
                         }
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnD);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2NxnD);
                             outTempCU->initEstData();
                             if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
                                 doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
@@ -1333,14 +1333,14 @@
                     {
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnU, true);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2NxnU, true);
                             outTempCU->initEstData();
                             if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
                                 doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
                         }
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnD, true);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2NxnD, true);
                             outTempCU->initEstData();
                             if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
                                 doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
@@ -1352,14 +1352,14 @@
                     {
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nLx2N);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_nLx2N);
                             outTempCU->initEstData();
                             if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
                                 doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
                         }
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nRx2N);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_nRx2N);
                             outTempCU->initEstData();
                         }
                     }
@@ -1367,14 +1367,14 @@
                     {
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nLx2N, true);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_nLx2N, true);
                             outTempCU->initEstData();
                             if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
                                 doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
                         }
                         if (doNotBlockPu)
                         {
-                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nRx2N, true);
+                            checkInter_rd5_6(outBestCU, outTempCU, cu, SIZE_nRx2N, true);
                             outTempCU->initEstData();
                         }
                     }
@@ -1387,12 +1387,12 @@
                  outBestCU->getCbf(0, TEXT_CHROMA_U) != 0   ||
                  outBestCU->getCbf(0, TEXT_CHROMA_V) != 0)  && doIntra)
             {
-                checkIntraInInter_rd5_6(outBestCU, outTempCU, SIZE_2Nx2N);
+                checkIntraInInter_rd5_6(outBestCU, outTempCU, cu, SIZE_2Nx2N);
                 outTempCU->initEstData();
 
                 if (depth == g_maxCUDepth && cu->log2CUSize > slice->m_sps->quadtreeTULog2MinSize)
                 {
-                    checkIntraInInter_rd5_6(outBestCU, outTempCU, SIZE_NxN);
+                    checkIntraInInter_rd5_6(outBestCU, outTempCU, cu, SIZE_NxN);
                     outTempCU->initEstData();
                 }
             }
@@ -1425,11 +1425,11 @@
             CU *child_cu = pic->getCU(cuAddr)->m_CULocalData + cu->childIdx + partUnitIdx;
 
             int qp = outTempCU->getQP(0);
-            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+            subBestPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
 
             if (child_cu->flags & CU::PRESENT)
             {
-                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
+                subTempPartCU->initSubCU(outTempCU, child_cu, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
 
                 if (partUnitIdx) // initialize RD with previous depth buffer
                     m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
@@ -1505,7 +1505,7 @@
 #endif
 }
 
-void Analysis::checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
+void Analysis::checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cuData, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
 {
     X265_CHECK(outTempCU->m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
     TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
@@ -1541,7 +1541,7 @@
 
             // do MC only for Luma part
             /* Set CU parameters for motion compensation */
-            prepMotionCompensation(outTempCU, 0);
+            prepMotionCompensation(outTempCU, cuData, 0);
             motionCompensation(m_tmpPredYuv[depth], true, false);
             uint32_t bitsCand = getTUBits(mergeCand, maxNumMergeCand);
             outTempCU->m_totalBits = bitsCand;
@@ -1580,7 +1580,7 @@
             int numPart = outBestCU->getNumPartInter();
             for (int partIdx = 0; partIdx < numPart; partIdx++)
             {
-                prepMotionCompensation(outBestCU, partIdx);
+                prepMotionCompensation(outBestCU, cuData, partIdx);
                 motionCompensation(bestPredYuv, false, true);
             }
 
@@ -1595,7 +1595,7 @@
             }
 
             // Encode with residue
-            encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth]);
+            encodeResAndCalcRdInterCU(outTempCU, cuData, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth]);
 
             uint64_t tempCost = m_rdCost.m_psyRd ? outTempCU->m_totalPsyCost : outTempCU->m_totalRDCost;
             uint64_t bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
@@ -1609,7 +1609,7 @@
     }
 }
 
-void Analysis::checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool *earlyDetectionSkipMode, TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest)
+void Analysis::checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cuData, bool *earlyDetectionSkipMode, TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest)
 {
     X265_CHECK(outTempCU->m_slice->m_sliceType != I_SLICE, "I slice not expected\n");
     TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
@@ -1654,14 +1654,14 @@
                     outTempCU->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[mergeCand][1], SIZE_2Nx2N, 0, 0); // interprets depth relative to outTempCU level
 
                     // do MC
-                    prepMotionCompensation(outTempCU, 0);
+                    prepMotionCompensation(outTempCU, cuData, 0);
                     motionCompensation(m_tmpPredYuv[depth], true, true);
 
                     // estimate residual and encode everything
                     if (noResidual)
                         encodeResAndCalcRdSkipCU(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpRecoYuv[depth]);
                     else
-                        encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth],
+                        encodeResAndCalcRdInterCU(outTempCU, cuData, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth],
                                                   m_bestResiYuv[depth], m_tmpRecoYuv[depth]); 
 
                     /* TODO: Fix the satd cost estimates. Why is merge being chosen in high motion areas: estimated distortion is too low? */
@@ -1706,7 +1706,7 @@
     }
 }
 
-void Analysis::checkInter_rd0_4(TComDataCU* outTempCU, TComYuv* outPredYuv, PartSize partSize, bool bUseMRG)
+void Analysis::checkInter_rd0_4(TComDataCU* outTempCU, CU* cuData, TComYuv* outPredYuv, PartSize partSize, bool bUseMRG)
 {
     uint32_t depth = outTempCU->getDepth(0);
 
@@ -1716,7 +1716,7 @@
 
     // do motion compensation only for Luma since luma cost alone is calculated
     outTempCU->m_totalBits = 0;
-    if (predInterSearch(outTempCU, outPredYuv, bUseMRG, false))
+    if (predInterSearch(outTempCU, cuData, outPredYuv, bUseMRG, false))
     {
         int sizeIdx = outTempCU->getLog2CUSize(0) - 2;
         uint32_t distortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
@@ -1731,7 +1731,7 @@
     }
 }
 
-void Analysis::checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, bool bUseMRG)
+void Analysis::checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cuData, PartSize partSize, bool bUseMRG)
 {
     uint32_t depth = outTempCU->getDepth(0);
 
@@ -1740,15 +1740,15 @@
     outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
     outTempCU->setCUTransquantBypassSubParts(!!m_param->bLossless, 0, depth);
 
-    if (predInterSearch(outTempCU, m_tmpPredYuv[depth], bUseMRG, true))
+    if (predInterSearch(outTempCU, cuData, m_tmpPredYuv[depth], bUseMRG, true))
     {
-        encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth]);
+        encodeResAndCalcRdInterCU(outTempCU, cuData, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth]);
         checkDQP(outTempCU);
         checkBestMode(outBestCU, outTempCU, depth);
     }
 }
 
-void Analysis::checkIntraInInter_rd0_4(TComDataCU* cu, PartSize partSize)
+void Analysis::checkIntraInInter_rd0_4(TComDataCU* cu, CU* cuData, PartSize partSize)
 {
     uint32_t depth = cu->getDepth(0);
 
@@ -1762,7 +1762,7 @@
     const uint32_t partOffset  = 0;
 
     // Reference sample smoothing
-    TComPattern::initAdiPattern(cu, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX);
+    TComPattern::initAdiPattern(cu, cuData, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX);
 
     pixel* fenc     = m_origYuv[depth]->getLumaAddr();
     uint32_t stride = m_modePredYuv[5][depth]->getStride();
@@ -1914,7 +1914,7 @@
     cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
 }
 
-void Analysis::checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize)
+void Analysis::checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cuData, PartSize partSize)
 {
     uint32_t depth = outTempCU->getDepth(0);
 
@@ -1929,9 +1929,9 @@
     uint32_t tuDepthRange[2];
     outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
 
-    estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
+    estIntraPredQT(outTempCU, cuData, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
 
-    estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
+    estIntraPredChromaQT(outTempCU, cuData, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
 
     m_entropyCoder->resetBits();
     if (outTempCU->m_slice->m_pps->bTransquantBypassEnabled)
@@ -1967,7 +1967,7 @@
     checkBestMode(outBestCU, outTempCU, depth);
 }
 
-void Analysis::encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv)
+void Analysis::encodeIntraInInter(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv)
 {
     uint64_t puCost = 0;
     uint32_t puBits = 0;
@@ -1983,7 +1983,7 @@
     uint32_t tuDepthRange[2];
     cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
 
-    uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv, predYuv, outResiYuv, false, puCost, puBits, psyEnergy, tuDepthRange);
+    uint32_t puDistY = xRecurIntraCodingQT(cu, cuData, initTrDepth, 0, fencYuv, predYuv, outResiYuv, false, puCost, puBits, psyEnergy, tuDepthRange);
     xSetIntraResultQT(cu, initTrDepth, 0, outReconYuv);
 
     // update PU data
@@ -1992,7 +1992,7 @@
     // set distortion (rate and r-d costs are determined later)
     cu->m_totalDistortion = puDistY;
 
-    estIntraPredChromaQT(cu, fencYuv, predYuv, outResiYuv, outReconYuv);
+    estIntraPredChromaQT(cu, cuData, fencYuv, predYuv, outResiYuv, outReconYuv);
     m_entropyCoder->resetBits();
     if (cu->m_slice->m_pps->bTransquantBypassEnabled)
         m_entropyCoder->codeCUTransquantBypassFlag(cu->getCUTransquantBypass(0));
@@ -2024,7 +2024,7 @@
         cu->m_totalRDCost = m_rdCost.calcRdCost(cu->m_totalDistortion, cu->m_totalBits);
 }
 
-void Analysis::encodeResidue(TComDataCU* ctu, TComDataCU* cu, uint32_t absPartIdx, uint32_t depth)
+void Analysis::encodeResidue(TComDataCU* ctu, TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t depth)
 {
     Frame* pic = cu->m_pic;
 
@@ -2038,10 +2038,11 @@
         uint32_t ymax = slice->m_sps->picHeightInLumaSamples - ctu->getCUPelY();
         for (uint32_t partUnitIdx = 0; partUnitIdx < 4; partUnitIdx++, absPartIdx += qNumParts)
         {
+            CU *child_cu = cu->m_CULocalData + cuData->childIdx + partUnitIdx;
             if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
             {
-                subTempPartCU->copyToSubCU(cu, partUnitIdx, nextDepth);
-                encodeResidue(ctu, subTempPartCU, absPartIdx, nextDepth);
+                subTempPartCU->copyToSubCU(cu, child_cu, partUnitIdx, nextDepth);
+                encodeResidue(ctu, subTempPartCU, child_cu, absPartIdx, nextDepth);
             }
         }
 
@@ -2083,7 +2084,7 @@
             uint32_t tuDepthRange[2];
             cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
             // Residual encoding
-            residualTransformQuantInter(cu, 0, m_origYuv[0], m_tmpResiYuv[depth], cu->getDepth(0), tuDepthRange);
+            residualTransformQuantInter(cu, cuData, 0, m_origYuv[0], m_tmpResiYuv[depth], cu->getDepth(0), tuDepthRange);
             checkDQP(cu);
 
             if (ctu->getMergeFlag(absPartIdx) && cu->getPartitionSize(0) == SIZE_2Nx2N && !cu->getQtRootCbf(0))
@@ -2143,7 +2144,7 @@
     else
     {
         m_origYuv[0]->copyPartToYuv(m_origYuv[depth], absPartIdx);
-        generateCoeffRecon(cu, m_origYuv[depth], m_modePredYuv[5][depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
+        generateCoeffRecon(cu, cuData, m_origYuv[depth], m_modePredYuv[5][depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
         checkDQP(cu);
         m_tmpRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), cuAddr, absPartIdx);
         cu->copyCodedToPic(depth);
diff -r 32f50df7fa76 -r ed887d8ae5cd source/encoder/analysis.h
--- a/source/encoder/analysis.h	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/encoder/analysis.h	Mon Sep 29 12:11:32 2014 +0530
@@ -117,17 +117,17 @@
                                int bInsidePicture, uint32_t partitionIndex, uint32_t minDepth);
     void compressInterCU_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, CU *cu,
                                PartSize parentSize = SIZE_NONE);
-    void checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& tmpPredYuv);
-    void checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool *earlyDetectionSkipMode,
+    void checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cu, TComYuv*& bestPredYuv, TComYuv*& tmpPredYuv);
+    void checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cu, bool *earlyDetectionSkipMode,
                                TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest);
-    void checkInter_rd0_4(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
-    void checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, bool bUseMRG = false);
-    void checkIntraInInter_rd0_4(TComDataCU* cu, PartSize partSize);
-    void checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize);
+    void checkInter_rd0_4(TComDataCU* outTempCU, CU* cu, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
+    void checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cu, PartSize partSize, bool bUseMRG = false);
+    void checkIntraInInter_rd0_4(TComDataCU* cu, CU* cuData, PartSize partSize);
+    void checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, CU* cu, PartSize partSize);
 
     void checkBestMode(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth);
-    void encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, TComYuv* outReconYuv);
-    void encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, uint32_t depth);
+    void encodeIntraInInter(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, TComYuv* outReconYuv);
+    void encodeResidue(TComDataCU* lcu, TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t depth);
     void checkDQP(TComDataCU* cu);
     void deriveTestModeAMP(TComDataCU* bestCU, PartSize parentSize, bool &bTestAMP_Hor, bool &bTestAMP_Ver,
                            bool &bTestMergeAMP_Hor, bool &bTestMergeAMP_Ver);
diff -r 32f50df7fa76 -r ed887d8ae5cd source/encoder/predict.cpp
--- a/source/encoder/predict.cpp	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/encoder/predict.cpp	Mon Sep 29 12:11:32 2014 +0530
@@ -142,12 +142,12 @@
     primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
 }
 
-void Predict::prepMotionCompensation(TComDataCU* cu, int partIdx)
+void Predict::prepMotionCompensation(TComDataCU* cu, CU* cuData, int partIdx)
 {
     m_slice = cu->m_slice;
     cu->getPartIndexAndSize(partIdx, m_partAddr, m_width, m_height);
     m_cuAddr = cu->getAddr();
-    m_zOrderIdxinCU = cu->getZorderIdxInCU();
+    m_zOrderIdxinCU = cuData->encodeIdx;
 
     m_mvField[0] = cu->getCUMvField(REF_PIC_LIST_0);
     m_mvField[1] = cu->getCUMvField(REF_PIC_LIST_1);
diff -r 32f50df7fa76 -r ed887d8ae5cd source/encoder/predict.h
--- a/source/encoder/predict.h	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/encoder/predict.h	Mon Sep 29 12:11:32 2014 +0530
@@ -89,7 +89,7 @@
     bool allocBuffers(int csp);
 
     /* prepMotionCompensation needs to be called to prepare MC with CU-relevant data */
-    void prepMotionCompensation(TComDataCU* cu, int partIdx);
+    void prepMotionCompensation(TComDataCU* cu, CU* cuData, int partIdx);
     void motionCompensation(TComYuv* predYuv, bool bLuma, bool bChroma);
 
     /* Angular Intra */
diff -r 32f50df7fa76 -r ed887d8ae5cd source/encoder/search.cpp
--- a/source/encoder/search.cpp	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/encoder/search.cpp	Mon Sep 29 12:11:32 2014 +0530
@@ -287,15 +287,14 @@
 }
 
 /* returns distortion */
-uint32_t Search::xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
+uint32_t Search::xIntraCodingLumaBlk(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
                                      ShortYuv* resiYuv, int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf)
 {
     uint32_t stride       = fencYuv->getStride();
     pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
     pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
-
-    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
+    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
     pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
     bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
@@ -337,7 +336,7 @@
     }
 }
 
-uint32_t Search::xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, int16_t* reconQt,
+uint32_t Search::xIntraCodingChromaBlk(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, int16_t* reconQt,
                                        uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, uint32_t chromaId, uint32_t log2TrSizeC)
 {
     TextType ttype        = (TextType)chromaId;
@@ -346,7 +345,7 @@
     pixel*   pred         = predYuv->getChromaAddr(chromaId, absPartIdx);
     int16_t* residual     = resiYuv->getChromaAddr(chromaId, absPartIdx);
 
-    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
+    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
     pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
     bool     useTransformSkipC = !!cu->getTransformSkip(absPartIdx, ttype);
@@ -393,7 +392,7 @@
 }
 
 /* returns distortion. TODO reorder params */
-uint32_t Search::xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, 
+uint32_t Search::xRecurIntraCodingQT(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, 
                                      bool bAllowRQTSplit, uint64_t& rdCost, uint32_t& rdBits, uint32_t& psyEnergy, uint32_t depthRange[2])
 {
     uint32_t fullDepth   = cu->getDepth(0) + trDepth;
@@ -454,7 +453,7 @@
 
         // init availability pattern
         uint32_t lumaPredMode = cu->getLumaIntraDir(absPartIdx);
-        TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
+        TComPattern::initAdiPattern(cu, cuData, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
 
         // get prediction signal
         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
@@ -496,11 +495,11 @@
                     cu->setCUTransquantBypassSubParts(bIsLossLess, absPartIdx, fullDepth);
 
                 // code luma block with given intra prediction mode and store Cbf
-                singleDistYTmp = xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfYTmp);
+                singleDistYTmp = xIntraCodingLumaBlk(cu, cuData, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfYTmp);
                 singlePsyEnergyYTmp = 0;
                 if (m_rdCost.m_psyRd)
                 {
-                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                    uint32_t zorder = cuData->encodeIdx + absPartIdx;
                     singlePsyEnergyYTmp = m_rdCost.psyCost(log2TrSize - 2, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
                         cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getStride());
                 }
@@ -540,7 +539,7 @@
 
             if (bestModeId == firstCheckId)
             {
-                xLoadIntraResultQT(cu, absPartIdx, log2TrSize, reconQt, reconQtStride);
+                xLoadIntraResultQT(cu, cuData, absPartIdx, log2TrSize, reconQt, reconQtStride);
                 cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
                 m_entropyCoder->load(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
             }
@@ -557,10 +556,10 @@
 
             // code luma block with given intra prediction mode and store Cbf
             cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
-            singleDistY = xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffY, singleCbfY);
+            singleDistY = xIntraCodingLumaBlk(cu, cuData, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffY, singleCbfY);
             if (m_rdCost.m_psyRd)
             {
-                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                uint32_t zorder = cuData->encodeIdx + absPartIdx;
                 singlePsyEnergyY = m_rdCost.psyCost(log2TrSize - 2, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
                     cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getStride());
             }
@@ -599,7 +598,7 @@
 
         for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
         {
-            splitDistY += xRecurIntraCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, bAllowRQTSplit, splitCost, splitBits, 
+            splitDistY += xRecurIntraCodingQT(cu, cuData, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, bAllowRQTSplit, splitCost, splitBits, 
                                               splitPsyEnergyY, depthRange);
             splitCbfY |= cu->getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
         }
@@ -638,7 +637,7 @@
 
         // set reconstruction for next intra prediction blocks
         uint32_t qtLayer   = log2TrSize - 2;
-        uint32_t zorder    = cu->getZorderIdxInCU() + absPartIdx;
+        uint32_t zorder    = cuData->encodeIdx + absPartIdx;
         int16_t* reconQt   = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
         X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
         const uint32_t reconQtStride = MAX_CU_SIZE;
@@ -655,7 +654,7 @@
     return outDist + singleDistY;
 }
 
-void Search::residualTransformQuantIntra(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
+void Search::residualTransformQuantIntra(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
                                          ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2])
 {
     uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
@@ -686,14 +685,14 @@
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
         coeff_t* coeff        = cu->getCoeffY() + coeffOffsetY;
 
-        uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
+        uint32_t zorder           = cuData->encodeIdx + absPartIdx;
         pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
         uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
 
         bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
 
         // init availability pattern
-        TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
+        TComPattern::initAdiPattern(cu, cuData, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
         // get prediction signal
         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 
@@ -742,7 +741,7 @@
 
         for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
         {
-            residualTransformQuantIntra(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv, depthRange);
+            residualTransformQuantIntra(cu, cuData, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv, depthRange);
             splitCbfY |= cu->getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
         }
 
@@ -778,24 +777,24 @@
     }
 }
 
-void Search::xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride)
+void Search::xLoadIntraResultQT(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride)
 {
     // copy reconstruction
     int sizeIdx = log2TrSize - 2;
-    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
+    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
     pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
     primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride, reconQt, reconQtStride);
 }
 
-void Search::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId,
+void Search::xLoadIntraResultChromaQT(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId,
                                       int16_t* reconQt, uint32_t reconQtStride)
 {
     X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
 
     // copy reconstruction
     int sizeIdxC = log2TrSizeC - 2;
-    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
+    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
     pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
     primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride, reconQt, reconQtStride);
@@ -838,7 +837,7 @@
 }
 
 /* returns distortion */
-uint32_t Search::xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+uint32_t Search::xRecurIntraChromaCodingQT(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
                                            uint32_t& psyEnergy)
 {
     uint32_t fullDepth = cu->getDepth(0) + trDepth;
@@ -897,7 +896,7 @@
                 pixel*   pred        = predYuv->getChromaAddr(chromaId, absPartIdxC);
 
                 // init availability pattern
-                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId);
+                TComPattern::initAdiPatternChroma(cu, cuData, absPartIdxC, trDepthC, m_predBuf, chromaId);
                 pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
 
                 uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdxC);
@@ -941,7 +940,7 @@
 
                         cu->setTransformSkipPartRange(chromaModeId, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
-                        singleDistCTmp = xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfCTmp, chromaId, log2TrSizeC);
+                        singleDistCTmp = xIntraCodingChromaBlk(cu, cuData, absPartIdxC, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfCTmp, chromaId, log2TrSizeC);
                         cu->setCbfPartRange(singleCbfCTmp << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
                         if (chromaModeId == 1 && !singleCbfCTmp)
@@ -952,7 +951,7 @@
                             uint32_t bitsTmp = singleCbfCTmp ? xGetIntraBitsChroma(cu, absPartIdxC, log2TrSizeC, chromaId, coeff) : 0;
                             if (m_rdCost.m_psyRd)
                             {
-                                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                                uint32_t zorder = cuData->encodeIdx + absPartIdxC;
                                 singlePsyEnergyTmp = m_rdCost.psyCost(log2TrSizeC - 2, fencYuv->getChromaAddr(chromaId, absPartIdxC), fencYuv->getCStride(),
                                     cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getCStride());
                                 singleCostTmp = m_rdCost.calcPsyRdCost(singleDistCTmp, bitsTmp, singlePsyEnergyTmp);
@@ -977,7 +976,7 @@
 
                     if (bestModeId == firstCheckId)
                     {
-                        xLoadIntraResultChromaQT(cu, absPartIdxC, log2TrSizeC, chromaId, reconQt, reconQtStride);
+                        xLoadIntraResultChromaQT(cu, cuData, absPartIdxC, log2TrSizeC, chromaId, reconQt, reconQtStride);
                         cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                         m_entropyCoder->load(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
                     }
@@ -998,10 +997,10 @@
                 else
                 {
                     cu->setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
-                    outDist += xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffC, singleCbfC, chromaId, log2TrSizeC);
+                    outDist += xIntraCodingChromaBlk(cu, cuData, absPartIdxC, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffC, singleCbfC, chromaId, log2TrSizeC);
                     if (m_rdCost.m_psyRd)
                     {
-                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
                         singlePsyEnergyTmp = m_rdCost.psyCost(log2TrSizeC - 2, fencYuv->getChromaAddr(chromaId, absPartIdxC), fencYuv->getCStride(),
                             cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getCStride());
                     }
@@ -1026,7 +1025,7 @@
         for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
         {
             uint32_t psyEnergyTemp = 0;
-            outDist += xRecurIntraChromaCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, psyEnergyTemp);
+            outDist += xRecurIntraChromaCodingQT(cu, cuData, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, psyEnergyTemp);
             splitPsyEnergy += psyEnergyTemp;
             splitCbfU |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
             splitCbfV |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
@@ -1089,7 +1088,7 @@
     }
 }
 
-void Search::residualQTIntraChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx,
+void Search::residualQTIntraChroma(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx,
                                    TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
 {
     uint32_t fullDepth = cu->getDepth(0) + trDepth;
@@ -1134,7 +1133,7 @@
                 pixel*   recon          = reconYuv->getChromaAddr(chromaId, absPartIdxC);
                 uint32_t coeffOffsetC   = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
                 coeff_t* coeff          = cu->getCoeff(ttype) + coeffOffsetC;
-                uint32_t zorder         = cu->getZorderIdxInCU() + absPartIdxC;
+                uint32_t zorder         = cuData->encodeIdx + absPartIdxC;
                 pixel*   reconIPred     = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
                 uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
 
@@ -1148,7 +1147,7 @@
                     chromaPredMode = cu->getLumaIntraDir((m_csp == X265_CSP_I444) ? absPartIdxC : 0);
                 chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
                 // init availability pattern
-                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId);
+                TComPattern::initAdiPatternChroma(cu, cuData, absPartIdxC, trDepthC, m_predBuf, chromaId);
                 pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
 
                 // get prediction signal
@@ -1196,7 +1195,7 @@
         uint32_t absPartIdxSub = absPartIdx;
         for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
         {
-            residualQTIntraChroma(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv);
+            residualQTIntraChroma(cu, cuData, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv);
             splitCbfU |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
             splitCbfV |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
         }
@@ -1209,7 +1208,7 @@
     }
 }
 
-void Search::estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2])
+void Search::estIntraPredQT(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2])
 {
     uint32_t depth        = cu->getDepth(0);
     uint32_t initTrDepth  = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
@@ -1226,7 +1225,7 @@
     for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
     {
         // Reference sample smoothing
-        TComPattern::initAdiPattern(cu, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX);
+        TComPattern::initAdiPattern(cu, cuData, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX);
 
         // determine set of modes to be tested (using prediction signal only)
         pixel*   fenc   = fencYuv->getLumaAddr(partOffset);
@@ -1346,7 +1345,7 @@
             cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
             cost = bits = 0;
             uint32_t psyEnergy = 0;
-            xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, bits, psyEnergy, depthRange);
+            xRecurIntraCodingQT(cu, cuData, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, bits, psyEnergy, depthRange);
             COPY2_IF_LT(bcost, cost, bmode, rdModeList[i]);
         }
 
@@ -1356,14 +1355,14 @@
 
         uint32_t psyEnergy = 0;
         // update distortion (rate and r-d costs are determined later)
-        cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, bits, psyEnergy, depthRange);
+        cu->m_totalDistortion += xRecurIntraCodingQT(cu, cuData, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, bits, psyEnergy, depthRange);
 
         xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
 
         // set reconstruction for next intra prediction blocks
         if (pu != numPU - 1)
         {
-            uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
+            uint32_t zorder      = cuData->encodeIdx + partOffset;
             pixel*   dst         = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
             pixel*   src         = reconYuv->getLumaAddr(partOffset);
             primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
@@ -1387,7 +1386,7 @@
     x265_emms();
 }
 
-void Search::sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes)
+void Search::sharedEstIntraPredQT(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes)
 {
     uint32_t depth       = cu->getDepth(0);
     uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
@@ -1411,12 +1410,12 @@
 
         uint32_t psyEnergy = 0;
         // update overall distortion (rate and r-d costs are determined later)
-        cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, bits, psyEnergy, depthRange);
+        cu->m_totalDistortion += xRecurIntraCodingQT(cu, cuData, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, bits, psyEnergy, depthRange);
         xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
 
         if (pu != numPU - 1)
         {
-            uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
+            uint32_t zorder      = cuData->encodeIdx + partOffset;
             pixel*   dst         = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
             pixel*   src         = reconYuv->getLumaAddr(partOffset);
             primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
@@ -1443,7 +1442,7 @@
     m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
 }
 
-void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv)
+void Search::getBestIntraModeChroma(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv)
 {
     uint32_t bestMode  = 0;
     uint64_t bestCost  = MAX_INT64;
@@ -1463,8 +1462,8 @@
     int32_t sizeIdx = log2TrSizeC - 2;
     pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
 
-    TComPattern::initAdiPatternChroma(cu, 0, 0, m_predBuf, 1);
-    TComPattern::initAdiPatternChroma(cu, 0, 0, m_predBuf, 2);
+    TComPattern::initAdiPatternChroma(cu, cuData, 0, 0, m_predBuf, 1);
+    TComPattern::initAdiPatternChroma(cu, cuData, 0, 0, m_predBuf, 2);
     cu->getAllowedChromaDir(0, modeList);
 
     // check chroma modes
@@ -1496,7 +1495,7 @@
     cu->setChromIntraDirSubParts(bestMode, 0, cu->getDepth(0));
 }
 
-void Search::estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
+void Search::estIntraPredChromaQT(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
 {
     uint32_t depth       = cu->getDepth(0);
     uint32_t initTrDepth = (cu->getPartitionSize(0) != SIZE_2Nx2N) && (cu->getChromaFormat() == X265_CSP_I444 ? 1 : 0);
@@ -1532,7 +1531,7 @@
             cu->setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
 
             uint32_t psyEnergy = 0;
-            uint32_t dist = xRecurIntraChromaCodingQT(cu, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, psyEnergy);
+            uint32_t dist = xRecurIntraChromaCodingQT(cu, cuData, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, psyEnergy);
 
             if (cu->m_slice->m_pps->bTransformSkipEnabled)
                 m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
@@ -1559,7 +1558,7 @@
 
         if (!tuIterator.isLastSection())
         {
-            uint32_t zorder      = cu->getZorderIdxInCU() + absPartIdxC;
+            uint32_t zorder      = cuData->encodeIdx + absPartIdxC;
             uint32_t dststride   = cu->m_pic->getPicYuvRec()->getCStride();
             uint32_t srcstride   = reconYuv->getCStride();
             pixel *src, *dst;
@@ -1605,7 +1604,7 @@
 }
 
 /* estimation of best merge coding */
-uint32_t Search::mergeEstimation(TComDataCU* cu, int puIdx, MergeData& m)
+uint32_t Search::mergeEstimation(TComDataCU* cu, CU* cuData, int puIdx, MergeData& m)
 {
     X265_CHECK(cu->getPartitionSize(0) != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n");
 
@@ -1639,7 +1638,7 @@
         cu->getCUMvField(REF_PIC_LIST_1)->m_mv[m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
         cu->getCUMvField(REF_PIC_LIST_1)->m_refIdx[m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
 
-        prepMotionCompensation(cu, puIdx);
+        prepMotionCompensation(cu, cuData, puIdx);
         motionCompensation(&m_predTempYuv, true, false);
         uint32_t costCand = m_me.bufSATD(m_predTempYuv.getLumaAddr(m.absPartIdx), m_predTempYuv.getStride());
         uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
@@ -1661,7 +1660,7 @@
 
 /* search of the best candidate for inter prediction
  * returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(TComDataCU* cu, CU* cuData, TComYuv* predYuv, bool bMergeOnly, bool bChroma)
 {
     MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
@@ -1690,7 +1689,7 @@
         int      roiWidth, roiHeight;
         cu->getPartIndexAndSize(partIdx, partAddr, roiWidth, roiHeight);
 
-        pixel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
+        pixel* pu = fenc->getLumaAddr(cu->getAddr(), cuData->encodeIdx + partAddr);
         m_me.setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
 
         uint32_t mrgCost = MAX_UINT;
@@ -1701,7 +1700,7 @@
             merge.absPartIdx = partAddr;
             merge.width = roiWidth;
             merge.height = roiHeight;
-            mrgCost = mergeEstimation(cu, partIdx, merge);
+            mrgCost = mergeEstimation(cu, cuData, partIdx, merge);
 
             if (bMergeOnly && cu->getLog2CUSize(0) > 3)
             {
@@ -1719,7 +1718,7 @@
                 cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(merge.mvField[1], partSize, partAddr, 0, partIdx);
                 totalmebits += merge.bits;
 
-                prepMotionCompensation(cu, partIdx);
+                prepMotionCompensation(cu, cuData, partIdx);
                 motionCompensation(predYuv, true, bChroma);
                 continue;
             }
@@ -1761,7 +1760,7 @@
 
                     cu->clipMv(mvCand);
 
-                    prepMotionCompensation(cu, partIdx);
+                    prepMotionCompensation(cu, cuData, partIdx);
                     predInterLumaBlk(slice->m_refPicList[l][ref]->getPicYuvRec(), &m_predTempYuv, &mvCand);
                     uint32_t cost = m_me.bufSAD(m_predTempYuv.getLumaAddr(partAddr), m_predTempYuv.getStride());
                     cost = (uint32_t)m_rdCost.calcRdSADCost(cost, MVP_IDX_BITS);
@@ -1809,7 +1808,7 @@
             TComPicYuv *refPic0 = slice->m_refPicList[0][list[0].ref]->getPicYuvRec();
             TComPicYuv *refPic1 = slice->m_refPicList[1][list[1].ref]->getPicYuvRec();
             
-            prepMotionCompensation(cu, partIdx);
+            prepMotionCompensation(cu, cuData, partIdx);
             predInterLumaBlk(refPic0, &m_predYuv[0], &list[0].mv);
             predInterLumaBlk(refPic1, &m_predYuv[1], &list[1].mv);
 
@@ -1935,7 +1934,7 @@
 
             totalmebits += list[1].bits;
         }
-        prepMotionCompensation(cu, partIdx);
+        prepMotionCompensation(cu, cuData, partIdx);
         motionCompensation(predYuv, true, bChroma);
     }
 
@@ -2094,7 +2093,7 @@
 }
 
 /** encode residual and calculate rate-distortion for a CU block */
-void Search::encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv,
+void Search::encodeResAndCalcRdInterCU(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv,
                                        ShortYuv* outBestResiYuv, TComYuv* outReconYuv)
 {
     X265_CHECK(!cu->isIntra(0), "intra CU not expected\n");
@@ -2138,7 +2137,7 @@
         uint64_t cost = 0;
         uint32_t zeroDistortion = 0;
         uint32_t bits = 0;
-        uint32_t distortion = xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, &zeroDistortion, tuDepthRange);
+        uint32_t distortion = xEstimateResidualQT(cu, cuData, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, &zeroDistortion, tuDepthRange);
 
         m_entropyCoder->resetBits();
         m_entropyCoder->codeQtRootCbfZero();
@@ -2208,7 +2207,7 @@
         m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
         uint64_t cost = 0;
         uint32_t bits = 0;
-        xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, NULL, tuDepthRange);
+        xEstimateResidualQT(cu, cuData, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, NULL, tuDepthRange);
         xSetResidualQTData(cu, 0, NULL, depth, false);
         m_entropyCoder->store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
     }
@@ -2243,7 +2242,7 @@
         cu->clearCbf(0, depth);
 }
 
-void Search::generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
+void Search::generateCoeffRecon(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
 {
     m_quant.setQPforQuant(cu);
 
@@ -2252,7 +2251,7 @@
 
     if (cu->getPredictionMode(0) == MODE_INTER)
     {
-        residualTransformQuantInter(cu, 0, fencYuv, resiYuv, cu->getDepth(0), tuDepthRange);
+        residualTransformQuantInter(cu, cuData, 0, fencYuv, resiYuv, cu->getDepth(0), tuDepthRange);
         if (cu->getQtRootCbf(0))
             reconYuv->addClip(predYuv, resiYuv, cu->getLog2CUSize(0));
         else
@@ -2265,13 +2264,13 @@
     else if (cu->getPredictionMode(0) == MODE_INTRA)
     {
         uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
-        residualTransformQuantIntra(cu, initTrDepth, 0, fencYuv, predYuv, resiYuv, reconYuv, tuDepthRange);
-        getBestIntraModeChroma(cu, fencYuv, predYuv);
-        residualQTIntraChroma(cu, 0, 0, fencYuv, predYuv, resiYuv, reconYuv);
+        residualTransformQuantIntra(cu, cuData, initTrDepth, 0, fencYuv, predYuv, resiYuv, reconYuv, tuDepthRange);
+        getBestIntraModeChroma(cu, cuData, fencYuv, predYuv);
+        residualQTIntraChroma(cu, cuData, 0, 0, fencYuv, predYuv, resiYuv, reconYuv);
     }
 }
 
-void Search::residualTransformQuantInter(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv,
+void Search::residualTransformQuantInter(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv,
                                          const uint32_t depth, uint32_t depthRange[2])
 {
     X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "invalid depth\n");
@@ -2382,7 +2381,9 @@
     {
         const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
         for (uint32_t i = 0; i < 4; ++i)
-            residualTransformQuantInter(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, resiYuv, depth + 1, depthRange);
+        {
+            residualTransformQuantInter(cu, cuData, absPartIdx + i * qPartNumSubdiv, fencYuv, resiYuv, depth + 1, depthRange);
+        }
 
         uint32_t ycbf = 0;
         uint32_t ucbf = 0;
@@ -2403,7 +2404,7 @@
     }
 }
 
-uint32_t Search::xEstimateResidualQT(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+uint32_t Search::xEstimateResidualQT(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
                                      uint32_t depth, uint64_t& rdCost, uint32_t& outBits, uint32_t* outZeroDist, uint32_t depthRange[2])
 {
     X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "depth not matching\n");
@@ -2571,7 +2572,7 @@
             if (m_rdCost.m_psyRd)
             {
                 pixel*   pred = predYuv->getLumaAddr(absPartIdx);
-                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                uint32_t zorder = cuData->encodeIdx + absPartIdx;
                 pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
                 uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
                 uint32_t stride = fencYuv->getStride();
@@ -2670,7 +2671,7 @@
                     if (m_rdCost.m_psyRd)
                     {
                         pixel*   pred = predYuv->getCbAddr(absPartIdxC);
-                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
                         pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
                         uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
                         uint32_t stride = fencYuv->getCStride();
@@ -2752,7 +2753,7 @@
                     if (m_rdCost.m_psyRd)
                     {
                         pixel*   pred = predYuv->getCrAddr(absPartIdxC);
-                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
                         pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
                         uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
                         uint32_t stride = fencYuv->getCStride();
@@ -2858,7 +2859,7 @@
                 if (m_rdCost.m_psyRd)
                 {
                     pixel*   pred = predYuv->getLumaAddr(absPartIdx);
-                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                    uint32_t zorder = cuData->encodeIdx + absPartIdx;
                     pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
                     uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
                     uint32_t stride = fencYuv->getStride();
@@ -2946,7 +2947,7 @@
                     if (m_rdCost.m_psyRd)
                     {
                         pixel*   pred = predYuv->getCbAddr(absPartIdxC);
-                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
                         pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
                         uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
                         uint32_t stride = fencYuv->getCStride();
@@ -2987,7 +2988,7 @@
                     if (m_rdCost.m_psyRd)
                     {
                         pixel*   pred = predYuv->getCrAddr(absPartIdxC);
-                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
                         pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
                         uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
                         uint32_t stride = fencYuv->getCStride();
@@ -3130,7 +3131,7 @@
         for (uint32_t i = 0; i < 4; ++i)
         {
             cu->m_psyEnergy = 0;
-            subdivDist += xEstimateResidualQT(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, predYuv, resiYuv, depth + 1, subDivCost, subdivBits, bCheckFull ? NULL : outZeroDist, depthRange);
+            subdivDist += xEstimateResidualQT(cu, cuData, absPartIdx + i * qPartNumSubdiv, fencYuv, predYuv, resiYuv, depth + 1, subDivCost, subdivBits, bCheckFull ? NULL : outZeroDist, depthRange);
             subDivPsyEnergy += cu->m_psyEnergy;
         }
 
diff -r 32f50df7fa76 -r ed887d8ae5cd source/encoder/search.h
--- a/source/encoder/search.h	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/encoder/search.h	Mon Sep 29 12:11:32 2014 +0530
@@ -79,19 +79,19 @@
 
     bool     initSearch(x265_param *param, ScalingList& scalingList);
 
-    void     estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
-    void     sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes);
-    void     estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
+    void     estIntraPredQT(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
+    void     sharedEstIntraPredQT(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes);
+    void     estIntraPredChromaQT(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
 
     // estimation inter prediction (non-skip)
-    bool     predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bMergeOnly, bool bChroma);
+    bool     predInterSearch(TComDataCU* cu, CU* cuData, TComYuv* predYuv, bool bMergeOnly, bool bChroma);
 
     // encode residual and compute rd-cost for inter mode
-    void     encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, ShortYuv* bestResiYuv, TComYuv* reconYuv);
+    void     encodeResAndCalcRdInterCU(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, ShortYuv* bestResiYuv, TComYuv* reconYuv);
     void     encodeResAndCalcRdSkipCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, TComYuv* reconYuv);
 
-    void     generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
-    void     residualTransformQuantInter(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv, uint32_t depth, uint32_t depthRange[2]);
+    void     generateCoeffRecon(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
+    void     residualTransformQuantInter(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv, uint32_t depth, uint32_t depthRange[2]);
 
     uint32_t getIntraModeBits(TComDataCU* cu, uint32_t mode, uint32_t partOffset, uint32_t depth);
     uint32_t getIntraRemModeBits(TComDataCU * cu, uint32_t partOffset, uint32_t depth, uint32_t preds[3], uint64_t& mpms);
@@ -109,32 +109,32 @@
     uint32_t xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep);
     uint32_t xGetIntraBitsLuma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t log2TrSize, coeff_t* coeff, uint32_t depthRange[2]);
     uint32_t xGetIntraBitsChroma(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, coeff_t* coeff);
-    uint32_t xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+    uint32_t xIntraCodingLumaBlk(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
                                  int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf);
 
-    uint32_t xEstimateResidualQT(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, uint32_t depth,
+    uint32_t xEstimateResidualQT(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, uint32_t depth,
                                  uint64_t &rdCost, uint32_t &outBits, uint32_t *zeroDist, uint32_t tuDepthRange[2]);
 
-    uint32_t xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
+    uint32_t xRecurIntraCodingQT(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
                                  ShortYuv* resiYuv, bool bAllowRQTSplit, uint64_t& dRDCost, uint32_t& puBits, uint32_t& psyEnergy, uint32_t depthRange[2]);
 
-    uint32_t xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+    uint32_t xRecurIntraChromaCodingQT(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
                                        uint32_t& psyEnergy);
 
-    uint32_t xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+    uint32_t xIntraCodingChromaBlk(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
                                    int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, uint32_t chromaId, uint32_t log2TrSizeC);
 
-    void     residualTransformQuantIntra(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
+    void     residualTransformQuantIntra(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
                                          TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
 
-    void     residualQTIntraChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
+    void     residualQTIntraChroma(TComDataCU* cu, CU* cuData, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
                                    TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
 
     void     xEncodeResidualQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]);
     void     xSetIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* reconYuv);
 
-    void     xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride);
-    void     xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, int16_t* reconQt, uint32_t reconQtStride);
+    void     xLoadIntraResultQT(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride);
+    void     xLoadIntraResultChromaQT(TComDataCU* cu, CU* cuData, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, int16_t* reconQt, uint32_t reconQtStride);
 
     void     offsetSubTUCBFs(TComDataCU* cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx);
 
@@ -171,13 +171,13 @@
     void     checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost);
     void     getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
     uint32_t getInterSymbolBits(TComDataCU* cu, uint32_t depthRange[2]);
-    uint32_t mergeEstimation(TComDataCU* cu, int partIdx, MergeData& m);
+    uint32_t mergeEstimation(TComDataCU* cu, CU* cuData, int partIdx, MergeData& m);
     void     setSearchRange(TComDataCU* cu, MV mvp, int merange, MV& mvmin, MV& mvmax);
 
     /* intra helper functions */
     enum { MAX_RD_INTRA_MODES = 16 };
     void     updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
-    void     getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv);
+    void     getBestIntraModeChroma(TComDataCU* cu, CU* cuData, TComYuv* fencYuv, TComYuv* predYuv);
 };
 }
 



More information about the x265-devel mailing list