[x265] [PATCH] TComDataCU: replace getZorderIdxInCU() with encodeIdx of CU structure

dave dtyx265 at gmail.com
Fri Sep 26 16:52:41 CEST 2014


Great timing.  Yesterday I noticed that encodeIdx isn't used for anything, so 
I was going to submit a patch to remove it.  Right now the CU.offset[2] array 
(offset[0] and offset[1]) contains values that are only used to calculate 
encodeIdx.  Unless someone sees any other potential use of CU.offset, I will 
remove the array from CU and use local variables in loadCTUData to calculate 
encodeIdx.

On 09/26/2014 02:14 AM, santhoshini at multicorewareinc.com wrote:
> # HG changeset patch
> # User Santhoshini Sekar <santhoshini at multicorewareinc.com>
> # Date 1411720561 -19800
> #      Fri Sep 26 14:06:01 2014 +0530
> # Node ID 2048b3e3c064d3518f4521a8d4cdd7bf9fd5331a
> # Parent  7dccbbed034970de161b361cd6e17ed4efca7226
> TComDataCU: replace getZorderIdxInCU() with encodeIdx of CU structure
>
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/Lib/TLibCommon/TComDataCU.cpp
> --- a/source/Lib/TLibCommon/TComDataCU.cpp	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/Lib/TLibCommon/TComDataCU.cpp	Fri Sep 26 14:06:01 2014 +0530
> @@ -387,16 +387,15 @@
>   }
>   
>   // initialize Sub partition
> -void TComDataCU::initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp)
> +void TComDataCU::initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp, CU* cuData)
>   {
>       X265_CHECK(partUnitIdx < 4, "part unit should be less than 4\n");
>       uint8_t log2CUSize = g_maxLog2CUSize - depth;
> -    uint32_t partOffset = (cu->getTotalNumPart() >> 2) * partUnitIdx;
>   
>       m_pic              = cu->m_pic;
>       m_slice            = cu->m_slice;
>       m_cuAddr           = cu->getAddr();
> -    m_absIdxInLCU      = cu->getZorderIdxInCU() + partOffset;
> +    m_absIdxInLCU      = cuData->encodeIdx;
>   
>       m_cuPelX           = cu->getCUPelX() + ((partUnitIdx &  1) << log2CUSize);
>       m_cuPelY           = cu->getCUPelY() + ((partUnitIdx >> 1) << log2CUSize);
> @@ -453,7 +452,7 @@
>       m_cuAboveRight  = cu->getCUAboveRight();
>   }
>   
> -void TComDataCU::copyToSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth)
> +void TComDataCU::copyToSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, CU* cuData)
>   {
>       X265_CHECK(partUnitIdx < 4, "part unit should be less than 4\n");
>   
> @@ -462,7 +461,7 @@
>       m_pic              = cu->m_pic;
>       m_slice            = cu->m_slice;
>       m_cuAddr           = cu->getAddr();
> -    m_absIdxInLCU      = cu->getZorderIdxInCU() + partOffset;
> +    m_absIdxInLCU      = cuData->encodeIdx + partOffset;
>   
>       m_cuPelX           = cu->getCUPelX() + ((partUnitIdx &  1) << (g_maxLog2CUSize - depth));
>       m_cuPelY           = cu->getCUPelY() + ((partUnitIdx >> 1) << (g_maxLog2CUSize - depth));
> @@ -1067,9 +1066,9 @@
>       }
>       else
>       {
> -        if (getZorderIdxInCU() > 0)
> +        if (m_pic->getCU(m_cuAddr)->m_CULocalData->encodeIdx > 0)
>           {
> -            return m_pic->getCU(getAddr())->getLastCodedQP(getZorderIdxInCU());
> +            return m_pic->getCU(getAddr())->getLastCodedQP(m_pic->getCU(m_cuAddr)->m_CULocalData->encodeIdx);
>           }
>           else if (getAddr() > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled &&
>                                       getAddr() % m_pic->getFrameWidthInCU() == 0))
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/Lib/TLibCommon/TComDataCU.h
> --- a/source/Lib/TLibCommon/TComDataCU.h	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/Lib/TLibCommon/TComDataCU.h	Fri Sep 26 14:06:01 2014 +0530
> @@ -273,9 +273,9 @@
>   
>       void          initCU(Frame* pic, uint32_t cuAddr);
>       void          initEstData();
> -    void          initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp);
> +    void          initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp, CU* cuData);
>   
> -    void          copyToSubCU(TComDataCU* lcu, uint32_t partUnitIdx, uint32_t depth);
> +    void          copyToSubCU(TComDataCU* lcu, uint32_t partUnitIdx, uint32_t depth, CU* cuData);
>       void          copyPartFrom(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, bool isRDObasedAnalysis = true);
>   
>       void          copyToPic(uint32_t depth);
> @@ -288,8 +288,6 @@
>   
>       uint32_t&     getAddr()                        { return m_cuAddr; }
>   
> -    uint32_t&     getZorderIdxInCU()               { return m_absIdxInLCU; }
> -
>       uint32_t      getSCUAddr() const               { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInLCU; }
>   
>   
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/Lib/TLibCommon/TComPattern.cpp
> --- a/source/Lib/TLibCommon/TComPattern.cpp	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/Lib/TLibCommon/TComPattern.cpp	Fri Sep 26 14:06:01 2014 +0530
> @@ -50,7 +50,7 @@
>   // ====================================================================================================================
>   
>   void TComPattern::initAdiPattern(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
> -                                 pixel* refAbove, pixel* refLeft, pixel* refAboveFlt, pixel* refLeftFlt, int dirMode)
> +                                 pixel* refAbove, pixel* refLeft, pixel* refAboveFlt, pixel* refLeftFlt, int dirMode, CU* cuData)
>   {
>       IntraNeighbors intraNeighbors;
>   
> @@ -58,7 +58,7 @@
>       uint32_t tuSize = intraNeighbors.tuSize;
>       uint32_t tuSize2 = tuSize << 1;
>   
> -    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
> +    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), cuData->encodeIdx + zOrderIdxInPart);
>       int picStride = cu->m_pic->getStride();
>   
>       fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
> @@ -130,14 +130,14 @@
>       }
>   }
>   
> -void TComPattern::initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf, uint32_t chromaId)
> +void TComPattern::initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf, uint32_t chromaId, CU* cuData)
>   {
>       IntraNeighbors intraNeighbors;
>   
>       initIntraNeighbors(cu, zOrderIdxInPart, partDepth, false, &intraNeighbors);
>       uint32_t tuSize = intraNeighbors.tuSize;
>   
> -    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
> +    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cuData->encodeIdx + zOrderIdxInPart);
>       int picStride = cu->m_pic->getCStride();
>       pixel* adiRef = getAdiChromaBuf(chromaId, tuSize, adiBuf);
>   
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/Lib/TLibCommon/TComPattern.h
> --- a/source/Lib/TLibCommon/TComPattern.h	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/Lib/TLibCommon/TComPattern.h	Fri Sep 26 14:06:01 2014 +0530
> @@ -53,6 +53,7 @@
>   
>   class TComDataCU;
>   
> +struct CU;
>   struct IntraNeighbors
>   {
>       int  numIntraNeighbor;
> @@ -84,11 +85,12 @@
>       /// set parameters from pixel buffers for accessing neighboring pixels
>       static void initAdiPattern(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
>                                  pixel* refAbove, pixel* refLeft,
> -                               pixel* refAboveFlt, pixel* refLeftFlt, int dirMode);
> +                               pixel* refAboveFlt, pixel* refLeftFlt, int dirMode,
> +                               CU* cuData);
>   
>       /// set chroma parameters from CU data for accessing ADI data
>       static void initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth,
> -                                     pixel* adiBuf, uint32_t chromaId);
> +                                     pixel* adiBuf, uint32_t chromaId, CU* cuData);
>   
>       static void initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
>   
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/encoder/analysis.cpp	Fri Sep 26 14:06:01 2014 +0530
> @@ -285,7 +285,7 @@
>                   cu->childIdx = child_idx;
>                   cu->offset[0] = sb_x * blockSize;
>                   cu->offset[1] = sb_y * blockSize;
> -                cu->encodeIdx = getDepthScanIdx(cu->offset[0] >> 3, cu->offset[1] >> 3, b8Width);
> +                cu->encodeIdx = getDepthScanIdx(cu->offset[0] >> 3, cu->offset[1] >> 3, b8Width) * 4;
>                   cu->flags = 0;
>   
>                   CU_SET_FLAG(cu->flags, CU::PRESENT, present_flag);
> @@ -424,7 +424,7 @@
>       //PPAScopeEvent(CompressIntraCU + depth);
>       Frame* pic = outBestCU->m_pic;
>       uint32_t cuAddr = outBestCU->getAddr();
> -    uint32_t absPartIdx = outBestCU->getZorderIdxInCU();
> +    uint32_t absPartIdx = cu->encodeIdx;
>   
>       if (depth == 0)
>           // get original YUV data from picture
> @@ -474,10 +474,10 @@
>           {
>               CU *child_cu = pic->getCU(cuAddr)->m_CULocalData + cu->childIdx + partUnitIdx;
>               int qp = outTempCU->getQP(0);
> -            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>               if (child_cu->flags & CU::PRESENT)
>               {
> -                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>                   if (0 == partUnitIdx) //initialize RD with previous depth buffer
>                       m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>                   else
> @@ -566,15 +566,15 @@
>       int32_t ctuToDepthIndex = g_maxCUDepth - 1;
>   
>       if (depth)
> -        m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
> +        m_origYuv[0]->copyPartToYuv(m_origYuv[depth], cu->encodeIdx);
>       else
> -        m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
> +        m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), cu->encodeIdx);
>   
>       Slice* slice = outTempCU->m_slice;
>       int32_t cu_split_flag = !(cu->flags & CU::LEAF);
>       int32_t cu_unsplit_flag = !(cu->flags & CU::SPLIT_MANDATORY);
>   
> -    if (cu_unsplit_flag && ((zOrder == outBestCU->getZorderIdxInCU()) && (depth == sharedDepth[zOrder])))
> +    if (cu_unsplit_flag && ((zOrder == cu->encodeIdx) && (depth == sharedDepth[zOrder])))
>       {
>           m_quant.setQPforQuant(outTempCU);
>           checkIntra(outTempCU, (PartSize)sharedPartSizes[zOrder], cu, &sharedModes[zOrder]);
> @@ -609,10 +609,10 @@
>           {
>               CU *child_cu = pic->getCU(outTempCU->getAddr())->m_CULocalData + cu->childIdx + partUnitIdx;
>               int qp = outTempCU->getQP(0);
> -            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>               if (child_cu->flags & CU::PRESENT)
>               {
> -                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>   
>                   if (partUnitIdx) // initialize RD with previous depth buffer
>                       m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
> @@ -675,7 +675,7 @@
>       outBestCU->copyToPic(depth);
>       if (!cu_unsplit_flag)
>           return;
> -    m_bestRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
> +    m_bestRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), outBestCU->getAddr(), cu->encodeIdx);
>   
>   #if CHECKED_BUILD || _DEBUG
>       X265_CHECK(outBestCU->getPartitionSize(0) != SIZE_NONE, "no best partition size\n");
> @@ -703,11 +703,11 @@
>       outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
>   
>       if (sharedModes)
> -        sharedEstIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes);
> +        sharedEstIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes, cu);
>       else
> -        estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
> +        estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, cu);
>   
> -    estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
> +    estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], cu);
>   
>       m_entropyCoder->resetBits();
>       if (outTempCU->m_slice->m_pps->bTransquantBypassEnabled)
> @@ -743,7 +743,7 @@
>   {
>       Frame* pic = outTempCU->m_pic;
>       uint32_t cuAddr = outTempCU->getAddr();
> -    uint32_t absPartIdx = outTempCU->getZorderIdxInCU();
> +    uint32_t absPartIdx = cu->encodeIdx;
>   
>       if (depth == 0)
>           // get original YUV data from picture
> @@ -818,16 +818,16 @@
>               }
>               else
>               {
> -                m_interCU_2Nx2N[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
> -                m_interCU_2NxN[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
> -                m_interCU_Nx2N[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
> -                m_intraInInterCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
> -                m_mergeCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
> -                m_bestMergeCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp);
> +                m_interCU_2Nx2N[depth]->initSubCU(parentCU, PartitionIndex, depth, qp, cu);
> +                m_interCU_2NxN[depth]->initSubCU(parentCU, PartitionIndex, depth, qp, cu);
> +                m_interCU_Nx2N[depth]->initSubCU(parentCU, PartitionIndex, depth, qp, cu);
> +                m_intraInInterCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp, cu);
> +                m_mergeCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp, cu);
> +                m_bestMergeCU[depth]->initSubCU(parentCU, PartitionIndex, depth, qp, cu);
>               }
>   
>               /* Compute  Merge Cost */
> -            checkMerge2Nx2N_rd0_4(m_bestMergeCU[depth], m_mergeCU[depth], m_modePredYuv[3][depth], m_bestMergeRecoYuv[depth]);
> +            checkMerge2Nx2N_rd0_4(m_bestMergeCU[depth], m_mergeCU[depth], m_modePredYuv[3][depth], m_bestMergeRecoYuv[depth], cu);
>               bool earlyskip = false;
>               if (m_param->rdLevel >= 1)
>                   earlyskip = (m_param->bEnableEarlySkip && m_bestMergeCU[depth]->isSkipped(0));
> @@ -836,7 +836,7 @@
>               {
>                   /* Compute 2Nx2N mode costs */
>                   {
> -                    checkInter_rd0_4(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N);
> +                    checkInter_rd0_4(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N, cu);
>                       /* Choose best mode; initialise outBestCU to 2Nx2N */
>                       outBestCU = m_interCU_2Nx2N[depth];
>                       std::swap(m_bestPredYuv[depth], m_modePredYuv[0][depth]);
> @@ -845,8 +845,8 @@
>                   /* Compute Rect costs */
>                   if (m_param->bEnableRectInter)
>                   {
> -                    checkInter_rd0_4(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
> -                    checkInter_rd0_4(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
> +                    checkInter_rd0_4(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N, cu);
> +                    checkInter_rd0_4(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN, cu);
>                       if (m_interCU_Nx2N[depth]->m_sa8dCost < outBestCU->m_sa8dCost)
>                       {
>                           outBestCU = m_interCU_Nx2N[depth];
> @@ -865,12 +865,12 @@
>                       int numPart = outBestCU->getNumPartInter();
>                       for (int partIdx = 0; partIdx < numPart; partIdx++)
>                       {
> -                        prepMotionCompensation(outBestCU, partIdx);
> +                        prepMotionCompensation(outBestCU, partIdx, cu);
>                           motionCompensation(m_bestPredYuv[depth], false, true);
>                       }
>   
>                       encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],
> -                                              m_bestResiYuv[depth], m_bestRecoYuv[depth]);
> +                                              m_bestResiYuv[depth], m_bestRecoYuv[depth], cu);
>                       uint64_t bestMergeCost = m_rdCost.m_psyRd ? m_bestMergeCU[depth]->m_totalPsyCost : m_bestMergeCU[depth]->m_totalRDCost;
>                       uint64_t bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
>                       if (bestMergeCost < bestCost)
> @@ -898,12 +898,12 @@
>                       }
>                       if (bdoIntra)
>                       {
> -                        checkIntraInInter_rd0_4(m_intraInInterCU[depth], SIZE_2Nx2N);
> +                        checkIntraInInter_rd0_4(m_intraInInterCU[depth], SIZE_2Nx2N, cu);
>                           uint64_t intraInInterCost, bestCost;
>                           if (m_param->rdLevel > 2)
>                           {
>                               encodeIntraInInter(m_intraInInterCU[depth], m_origYuv[depth], m_modePredYuv[5][depth],
> -                                               m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
> +                                               m_tmpResiYuv[depth], m_tmpRecoYuv[depth], cu);
>                               intraInInterCost = m_rdCost.m_psyRd ? m_intraInInterCU[depth]->m_totalPsyCost : m_intraInInterCU[depth]->m_totalRDCost;
>                               bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
>                           }
> @@ -935,17 +935,17 @@
>                           int numPart = outBestCU->getNumPartInter();
>                           for (int partIdx = 0; partIdx < numPart; partIdx++)
>                           {
> -                            prepMotionCompensation(outBestCU, partIdx);
> +                            prepMotionCompensation(outBestCU, partIdx, cu);
>                               motionCompensation(m_bestPredYuv[depth], false, true);
>                           }
>   
>                           encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],
> -                                                  m_bestResiYuv[depth], m_bestRecoYuv[depth]);
> +                                                  m_bestResiYuv[depth], m_bestRecoYuv[depth], cu);
>                           m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
>                       }
>                       else if (outBestCU->getPredictionMode(0) == MODE_INTRA)
>                       {
> -                        encodeIntraInInter(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],  m_bestRecoYuv[depth]);
> +                        encodeIntraInInter(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth],  m_bestRecoYuv[depth], cu);
>                           m_rdEntropyCoders[depth][CI_TEMP_BEST].store(m_rdEntropyCoders[depth][CI_NEXT_BEST]);
>                       }
>                   }
> @@ -962,15 +962,15 @@
>                           int numPart = outBestCU->getNumPartInter();
>                           for (int partIdx = 0; partIdx < numPart; partIdx++)
>                           {
> -                            prepMotionCompensation(outBestCU, partIdx);
> +                            prepMotionCompensation(outBestCU, partIdx, cu);
>                               motionCompensation(m_bestPredYuv[depth], false, true);
>                           }
>   
>                           m_tmpResiYuv[depth]->subtract(m_origYuv[depth], m_bestPredYuv[depth], outBestCU->getLog2CUSize(0));
> -                        generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]);
> +                        generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth], cu);
>                       }
>                       else
> -                        generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]);
> +                        generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth], cu);
>                   }
>                   else if (m_param->rdLevel == 0)
>                   {
> @@ -979,7 +979,7 @@
>                           int numPart = outBestCU->getNumPartInter();
>                           for (int partIdx = 0; partIdx < numPart; partIdx++)
>                           {
> -                            prepMotionCompensation(outBestCU, partIdx);
> +                            prepMotionCompensation(outBestCU, partIdx, cu);
>                               motionCompensation(m_bestPredYuv[depth], false, true);
>                           }
>                       }
> @@ -1092,7 +1092,7 @@
>               CU *child_cu = pic->getCU(cuAddr)->m_CULocalData + cu->childIdx + partUnitIdx;
>   
>               TComDataCU* subBestPartCU = NULL;
> -            subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +            subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>   
>               if (child_cu->flags & CU::PRESENT)
>               {
> @@ -1210,7 +1210,7 @@
>       outBestCU->copyToPic(depth);
>   
>       if (m_param->rdLevel == 0 && depth == 0)
> -        encodeResidue(outBestCU, outBestCU, 0, 0);
> +        encodeResidue(outBestCU, outBestCU, 0, 0, cu);
>       else if (m_param->rdLevel != 0)
>       {
>           /* Copy Yuv data to picture Yuv */
> @@ -1252,7 +1252,7 @@
>   
>       Frame* pic = outBestCU->m_pic;
>       uint32_t cuAddr = outBestCU->getAddr();
> -    uint32_t absPartIdx = outBestCU->getZorderIdxInCU();
> +    uint32_t absPartIdx = cu->encodeIdx;
>   
>       if (depth == 0)
>           // get original YUV data from picture
> @@ -1278,14 +1278,14 @@
>           if (slice->m_sliceType != I_SLICE)
>           {
>               // by Merge for inter_2Nx2N
> -            checkMerge2Nx2N_rd5_6(outBestCU, outTempCU, &earlyDetectionSkipMode, m_bestPredYuv[depth], m_bestRecoYuv[depth]);
> +            checkMerge2Nx2N_rd5_6(outBestCU, outTempCU, &earlyDetectionSkipMode, m_bestPredYuv[depth], m_bestRecoYuv[depth], cu);
>   
>               outTempCU->initEstData();
>   
>               if (!m_param->bEnableEarlySkip)
>               {
>                   // 2Nx2N, NxN
> -                checkInter_rd5_6(outBestCU, outTempCU, SIZE_2Nx2N);
> +                checkInter_rd5_6(outBestCU, outTempCU, SIZE_2Nx2N, cu);
>                   outTempCU->initEstData();
>                   if (m_param->bEnableCbfFastMode)
>                       doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> @@ -1304,7 +1304,7 @@
>                   {
>                       if (depth == g_maxCUDepth && doNotBlockPu)
>                       {
> -                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_NxN);
> +                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_NxN, cu);
>                           outTempCU->initEstData();
>                       }
>                   }
> @@ -1314,14 +1314,14 @@
>                       // 2NxN, Nx2N
>                       if (doNotBlockPu)
>                       {
> -                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_Nx2N);
> +                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_Nx2N, cu);
>                           outTempCU->initEstData();
>                           if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_Nx2N)
>                               doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                       }
>                       if (doNotBlockPu)
>                       {
> -                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxN);
> +                        checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxN, cu);
>                           outTempCU->initEstData();
>                           if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxN)
>                               doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> @@ -1341,14 +1341,14 @@
>                       {
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnU);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnU, cu);
>                               outTempCU->initEstData();
>                               if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
>                                   doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                           }
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnD);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnD, cu);
>                               outTempCU->initEstData();
>                               if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
>                                   doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> @@ -1358,14 +1358,14 @@
>                       {
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnU, true);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnU, cu, true);
>                               outTempCU->initEstData();
>                               if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
>                                   doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                           }
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnD, true);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_2NxnD, cu, true);
>                               outTempCU->initEstData();
>                               if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
>                                   doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> @@ -1377,14 +1377,14 @@
>                       {
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nLx2N);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nLx2N, cu);
>                               outTempCU->initEstData();
>                               if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
>                                   doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                           }
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nRx2N);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nRx2N, cu);
>                               outTempCU->initEstData();
>                           }
>                       }
> @@ -1392,14 +1392,14 @@
>                       {
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nLx2N, true);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nLx2N, cu, true);
>                               outTempCU->initEstData();
>                               if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
>                                   doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                           }
>                           if (doNotBlockPu)
>                           {
> -                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nRx2N, true);
> +                            checkInter_rd5_6(outBestCU, outTempCU, SIZE_nRx2N, cu, true);
>                               outTempCU->initEstData();
>                           }
>                       }
> @@ -1412,14 +1412,14 @@
>                    outBestCU->getCbf(0, TEXT_CHROMA_U) != 0   ||
>                    outBestCU->getCbf(0, TEXT_CHROMA_V) != 0)  && doIntra)
>               {
> -                checkIntraInInter_rd5_6(outBestCU, outTempCU, SIZE_2Nx2N);
> +                checkIntraInInter_rd5_6(outBestCU, outTempCU, SIZE_2Nx2N, cu);
>                   outTempCU->initEstData();
>   
>                   if (depth == g_maxCUDepth)
>                   {
>                       if (cu->log2CUSize > slice->m_sps->quadtreeTULog2MinSize)
>                       {
> -                        checkIntraInInter_rd5_6(outBestCU, outTempCU, SIZE_NxN);
> +                        checkIntraInInter_rd5_6(outBestCU, outTempCU, SIZE_NxN, cu);
>                           outTempCU->initEstData();
>                       }
>                   }
> @@ -1453,11 +1453,11 @@
>               CU *child_cu = pic->getCU(cuAddr)->m_CULocalData + cu->childIdx + partUnitIdx;
>   
>               int qp = outTempCU->getQP(0);
> -            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>   
>               if (child_cu->flags & CU::PRESENT)
>               {
> -                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> +                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp, child_cu); // clear sub partition datas or init.
>   
>                   if (0 == partUnitIdx) //initialize RD with previous depth buffer
>                       m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> @@ -1533,7 +1533,7 @@
>   #endif
>   }
>   
> -void Analysis::checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
> +void Analysis::checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest, CU* cuData)
>   {
>       X265_CHECK(outTempCU->m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
>       TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
> @@ -1569,7 +1569,7 @@
>   
>               // do MC only for Luma part
>               /* Set CU parameters for motion compensation */
> -            prepMotionCompensation(outTempCU, 0);
> +            prepMotionCompensation(outTempCU, 0, cuData);
>               motionCompensation(m_tmpPredYuv[depth], true, false);
>               uint32_t bitsCand = getTUBits(mergeCand, maxNumMergeCand);
>               outTempCU->m_totalBits = bitsCand;
> @@ -1608,7 +1608,7 @@
>               int numPart = outBestCU->getNumPartInter();
>               for (int partIdx = 0; partIdx < numPart; partIdx++)
>               {
> -                prepMotionCompensation(outBestCU, partIdx);
> +                prepMotionCompensation(outBestCU, partIdx, cuData);
>                   motionCompensation(bestPredYuv, false, true);
>               }
>   
> @@ -1623,7 +1623,7 @@
>               }
>   
>               // Encode with residue
> -            encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth]);
> +            encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], cuData);
>   
>               uint64_t tempCost = m_rdCost.m_psyRd ? outTempCU->m_totalPsyCost : outTempCU->m_totalRDCost;
>               uint64_t bestCost = m_rdCost.m_psyRd ? outBestCU->m_totalPsyCost : outBestCU->m_totalRDCost;
> @@ -1637,7 +1637,7 @@
>       }
>   }
>   
> -void Analysis::checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool *earlyDetectionSkipMode, TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest)
> +void Analysis::checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool *earlyDetectionSkipMode, TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest, CU* cuData)
>   {
>       X265_CHECK(outTempCU->m_slice->m_sliceType != I_SLICE, "I slice not expected\n");
>       TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
> @@ -1682,7 +1682,7 @@
>                       outTempCU->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[mergeCand][1], SIZE_2Nx2N, 0, 0); // interprets depth relative to outTempCU level
>   
>                       // do MC
> -                    prepMotionCompensation(outTempCU, 0);
> +                    prepMotionCompensation(outTempCU, 0, cuData);
>                       motionCompensation(m_tmpPredYuv[depth], true, true);
>   
>                       // estimate residual and encode everything
> @@ -1697,7 +1697,8 @@
>                                                     m_tmpPredYuv[depth],
>                                                     m_tmpResiYuv[depth],
>                                                     m_bestResiYuv[depth],
> -                                                  m_tmpRecoYuv[depth]);
> +                                                  m_tmpRecoYuv[depth],
> +                                                  cuData);
>   
>   
>                       /* Todo: Fix the satd cost estimates. Why is merge being chosen in high motion areas: estimated distortion is too low? */
> @@ -1742,7 +1743,7 @@
>       }
>   }
>   
> -void Analysis::checkInter_rd0_4(TComDataCU* outTempCU, TComYuv* outPredYuv, PartSize partSize, bool bUseMRG)
> +void Analysis::checkInter_rd0_4(TComDataCU* outTempCU, TComYuv* outPredYuv, PartSize partSize, CU* cuData, bool bUseMRG)
>   {
>       uint32_t depth = outTempCU->getDepth(0);
>   
> @@ -1752,7 +1753,7 @@
>   
>       // do motion compensation only for Luma since luma cost alone is calculated
>       outTempCU->m_totalBits = 0;
> -    if (predInterSearch(outTempCU, outPredYuv, bUseMRG, false))
> +    if (predInterSearch(outTempCU, outPredYuv, bUseMRG, false, cuData))
>       {
>           int sizeIdx = outTempCU->getLog2CUSize(0) - 2;
>           uint32_t distortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> @@ -1767,7 +1768,7 @@
>       }
>   }
>   
> -void Analysis::checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, bool bUseMRG)
> +void Analysis::checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU* cuData, bool bUseMRG)
>   {
>       uint32_t depth = outTempCU->getDepth(0);
>   
> @@ -1776,15 +1777,15 @@
>       outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
>       outTempCU->setCUTransquantBypassSubParts(!!m_param->bLossless, 0, depth);
>   
> -    if (predInterSearch(outTempCU, m_tmpPredYuv[depth], bUseMRG, true))
> +    if (predInterSearch(outTempCU, m_tmpPredYuv[depth], bUseMRG, true, cuData))
>       {
> -        encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth]);
> +        encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], cuData);
>           checkDQP(outTempCU);
>           checkBestMode(outBestCU, outTempCU, depth);
>       }
>   }
>   
> -void Analysis::checkIntraInInter_rd0_4(TComDataCU* cu, PartSize partSize)
> +void Analysis::checkIntraInInter_rd0_4(TComDataCU* cu, PartSize partSize, CU* cuData)
>   {
>       uint32_t depth = cu->getDepth(0);
>   
> @@ -1798,7 +1799,7 @@
>       const uint32_t partOffset  = 0;
>   
>       // Reference sample smoothing
> -    TComPattern::initAdiPattern(cu, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX);
> +    TComPattern::initAdiPattern(cu, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX, cuData);
>   
>       pixel* fenc     = m_origYuv[depth]->getLumaAddr();
>       uint32_t stride = m_modePredYuv[5][depth]->getStride();
> @@ -1950,7 +1951,7 @@
>       cu->setLumaIntraDirSubParts(bmode, partOffset, depth + initTrDepth);
>   }
>   
> -void Analysis::checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize)
> +void Analysis::checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU* cuData)
>   {
>       uint32_t depth = outTempCU->getDepth(0);
>   
> @@ -1965,9 +1966,9 @@
>       uint32_t tuDepthRange[2];
>       outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
>   
> -    estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
> +    estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, cuData);
>   
> -    estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
> +    estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], cuData);
>   
>       m_entropyCoder->resetBits();
>       if (outTempCU->m_slice->m_pps->bTransquantBypassEnabled)
> @@ -2003,7 +2004,7 @@
>       checkBestMode(outBestCU, outTempCU, depth);
>   }
>   
> -void Analysis::encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv)
> +void Analysis::encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv, CU* cuData)
>   {
>       uint64_t puCost = 0;
>       uint32_t puBits = 0;
> @@ -2019,7 +2020,7 @@
>       uint32_t tuDepthRange[2];
>       cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
>   
> -    uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv, predYuv, outResiYuv, false, puCost, puBits, psyEnergy, tuDepthRange);
> +    uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv, predYuv, outResiYuv, false, puCost, puBits, psyEnergy, tuDepthRange, cuData);
>       xSetIntraResultQT(cu, initTrDepth, 0, outReconYuv);
>   
>       //=== update PU data ====
> @@ -2028,7 +2029,7 @@
>       //===== set distortion (rate and r-d costs are determined later) =====
>       cu->m_totalDistortion = puDistY;
>   
> -    estIntraPredChromaQT(cu, fencYuv, predYuv, outResiYuv, outReconYuv);
> +    estIntraPredChromaQT(cu, fencYuv, predYuv, outResiYuv, outReconYuv, cuData);
>       m_entropyCoder->resetBits();
>       if (cu->m_slice->m_pps->bTransquantBypassEnabled)
>           m_entropyCoder->codeCUTransquantBypassFlag(cu->getCUTransquantBypass(0));
> @@ -2060,7 +2061,7 @@
>           cu->m_totalRDCost = m_rdCost.calcRdCost(cu->m_totalDistortion, cu->m_totalBits);
>   }
>   
> -void Analysis::encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, uint32_t depth)
> +void Analysis::encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, CU* cuData)
>   {
>       Frame* pic = cu->m_pic;
>   
> @@ -2073,10 +2074,11 @@
>           uint32_t xmax = slice->m_sps->picWidthInLumaSamples  - lcu->getCUPelX();
>           uint32_t ymax = slice->m_sps->picHeightInLumaSamples - lcu->getCUPelY();        for (uint32_t partUnitIdx = 0; partUnitIdx < 4; partUnitIdx++, absPartIdx += qNumParts)
>           {
> +            CU *child_cu = cu->m_CULocalData + cuData->childIdx + partUnitIdx;
>               if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
>               {
> -                subTempPartCU->copyToSubCU(cu, partUnitIdx, nextDepth);
> -                encodeResidue(lcu, subTempPartCU, absPartIdx, nextDepth);
> +                subTempPartCU->copyToSubCU(cu, partUnitIdx, nextDepth, child_cu);
> +                encodeResidue(lcu, subTempPartCU, absPartIdx, nextDepth, child_cu);
>               }
>           }
>   
> @@ -2118,7 +2120,7 @@
>               uint32_t tuDepthRange[2];
>               cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
>               // Residual encoding
> -            residualTransformQuantInter(cu, 0, m_origYuv[0], m_tmpResiYuv[depth], cu->getDepth(0), tuDepthRange);
> +            residualTransformQuantInter(cu, 0, m_origYuv[0], m_tmpResiYuv[depth], cu->getDepth(0), tuDepthRange, cuData);
>               checkDQP(cu);
>   
>               if (lcu->getMergeFlag(absPartIdx) && cu->getPartitionSize(0) == SIZE_2Nx2N && !cu->getQtRootCbf(0))
> @@ -2178,7 +2180,7 @@
>       else
>       {
>           m_origYuv[0]->copyPartToYuv(m_origYuv[depth], absPartIdx);
> -        generateCoeffRecon(cu, m_origYuv[depth], m_modePredYuv[5][depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
> +        generateCoeffRecon(cu, m_origYuv[depth], m_modePredYuv[5][depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], cuData);
>           checkDQP(cu);
>           m_tmpRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), cuAddr, absPartIdx);
>           cu->copyCodedToPic(depth);
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/encoder/analysis.h
> --- a/source/encoder/analysis.h	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/encoder/analysis.h	Fri Sep 26 14:06:01 2014 +0530
> @@ -117,17 +117,17 @@
>                                  int bInsidePicture, uint32_t partitionIndex, uint32_t minDepth);
>       void compressInterCU_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, CU *cu,
>                                  PartSize parentSize = SIZE_NONE);
> -    void checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& tmpPredYuv);
> +    void checkMerge2Nx2N_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& tmpPredYuv, CU* cu);
>       void checkMerge2Nx2N_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool *earlyDetectionSkipMode,
> -                               TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest);
> -    void checkInter_rd0_4(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
> -    void checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, bool bUseMRG = false);
> -    void checkIntraInInter_rd0_4(TComDataCU* cu, PartSize partSize);
> -    void checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize);
> +                               TComYuv*& outBestPredYuv, TComYuv*& rpcYuvReconBest, CU* cu);
> +    void checkInter_rd0_4(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, CU* cu, bool bUseMRG = false);
> +    void checkInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU* cu, bool bUseMRG = false);
> +    void checkIntraInInter_rd0_4(TComDataCU* cu, PartSize partSize, CU* cuData);
> +    void checkIntraInInter_rd5_6(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU* cu);
>   
>       void checkBestMode(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth);
> -    void encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, TComYuv* outReconYuv);
> -    void encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, uint32_t depth);
> +    void encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, TComYuv* outReconYuv, CU* cuData);
> +    void encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, CU* cuData);
>       void checkDQP(TComDataCU* cu);
>       void deriveTestModeAMP(TComDataCU* bestCU, PartSize parentSize, bool &bTestAMP_Hor, bool &bTestAMP_Ver,
>                              bool &bTestMergeAMP_Hor, bool &bTestMergeAMP_Ver);
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/encoder/predict.cpp
> --- a/source/encoder/predict.cpp	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/encoder/predict.cpp	Fri Sep 26 14:06:01 2014 +0530
> @@ -142,12 +142,12 @@
>       primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
>   }
>   
> -void Predict::prepMotionCompensation(TComDataCU* cu, int partIdx)
> +void Predict::prepMotionCompensation(TComDataCU* cu, int partIdx, CU* cuData)
>   {
>       m_slice = cu->m_slice;
>       cu->getPartIndexAndSize(partIdx, m_partAddr, m_width, m_height);
>       m_cuAddr = cu->getAddr();
> -    m_zOrderIdxinCU = cu->getZorderIdxInCU();
> +    m_zOrderIdxinCU = cuData->encodeIdx;
>   
>       m_mvField[0] = cu->getCUMvField(REF_PIC_LIST_0);
>       m_mvField[1] = cu->getCUMvField(REF_PIC_LIST_1);
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/encoder/predict.h
> --- a/source/encoder/predict.h	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/encoder/predict.h	Fri Sep 26 14:06:01 2014 +0530
> @@ -89,7 +89,7 @@
>       bool allocBuffers(int csp);
>   
>       /* prepMotionCompensation needs to be called to prepare MC with CU-relevant data */
> -    void prepMotionCompensation(TComDataCU* cu, int partIdx);
> +    void prepMotionCompensation(TComDataCU* cu, int partIdx, CU* cuData);
>       void motionCompensation(TComYuv* predYuv, bool bLuma, bool bChroma);
>   
>       /* Angular Intra */
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/encoder/search.cpp	Fri Sep 26 14:06:01 2014 +0530
> @@ -288,14 +288,13 @@
>   
>   /* returns distortion */
>   uint32_t Search::xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
> -                                     ShortYuv* resiYuv, int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf)
> +                                     ShortYuv* resiYuv, int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, CU* cuData)
>   {
>       uint32_t stride       = fencYuv->getStride();
>       pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
>       pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
>       int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
> -
> -    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
>       pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>       uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
>       bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
> @@ -338,7 +337,7 @@
>   }
>   
>   uint32_t Search::xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, int16_t* reconQt,
> -                                       uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, uint32_t chromaId, uint32_t log2TrSizeC)
> +                                       uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, uint32_t chromaId, uint32_t log2TrSizeC, CU* cuData)
>   {
>       TextType ttype        = (TextType)chromaId;
>       uint32_t stride       = fencYuv->getCStride();
> @@ -346,7 +345,7 @@
>       pixel*   pred         = predYuv->getChromaAddr(chromaId, absPartIdx);
>       int16_t* residual     = resiYuv->getChromaAddr(chromaId, absPartIdx);
>   
> -    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
>       pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
>       uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>       bool     useTransformSkipC = !!cu->getTransformSkip(absPartIdx, ttype);
> @@ -394,7 +393,7 @@
>   
>   /* returns distortion. TODO reorder params */
>   uint32_t Search::xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
> -                                     bool bAllowRQTSplit, uint64_t& rdCost, uint32_t& rdBits, uint32_t& psyEnergy, uint32_t depthRange[2])
> +                                     bool bAllowRQTSplit, uint64_t& rdCost, uint32_t& rdBits, uint32_t& psyEnergy, uint32_t depthRange[2],CU* cuData)
>   {
>       uint32_t fullDepth   = cu->getDepth(0) + trDepth;
>       uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
> @@ -454,7 +453,7 @@
>   
>           // init availability pattern
>           uint32_t lumaPredMode = cu->getLumaIntraDir(absPartIdx);
> -        TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
> +        TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode, cuData);
>   
>           // get prediction signal
>           predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
> @@ -496,11 +495,11 @@
>                       cu->setCUTransquantBypassSubParts(bIsLossLess, absPartIdx, fullDepth);
>   
>                   // code luma block with given intra prediction mode and store Cbf
> -                singleDistYTmp = xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfYTmp);
> +                singleDistYTmp = xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfYTmp, cuData);
>                   singlePsyEnergyYTmp = 0;
>                   if (m_rdCost.m_psyRd)
>                   {
> -                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> +                    uint32_t zorder = cuData->encodeIdx + absPartIdx;
>                       singlePsyEnergyYTmp = m_rdCost.psyCost(log2TrSize - 2, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
>                           cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getStride());
>                   }
> @@ -540,7 +539,7 @@
>   
>               if (bestModeId == firstCheckId)
>               {
> -                xLoadIntraResultQT(cu, absPartIdx, log2TrSize, reconQt, reconQtStride);
> +                xLoadIntraResultQT(cu, absPartIdx, log2TrSize, reconQt, reconQtStride, cuData);
>                   cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>                   m_entropyCoder->load(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
>               }
> @@ -557,10 +556,10 @@
>   
>               // code luma block with given intra prediction mode and store Cbf
>               cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
> -            singleDistY = xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffY, singleCbfY);
> +            singleDistY = xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffY, singleCbfY, cuData);
>               if (m_rdCost.m_psyRd)
>               {
> -                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> +                uint32_t zorder = cuData->encodeIdx + absPartIdx;
>                   singlePsyEnergyY = m_rdCost.psyCost(log2TrSize - 2, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
>                       cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getStride());
>               }
> @@ -600,7 +599,7 @@
>           for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
>           {
>               splitDistY += xRecurIntraCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, bAllowRQTSplit, splitCost, splitBits,
> -                                              splitPsyEnergyY, depthRange);
> +                                              splitPsyEnergyY, depthRange, cuData);
>               splitCbfY |= cu->getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
>           }
>   
> @@ -638,7 +637,7 @@
>   
>           // set reconstruction for next intra prediction blocks
>           uint32_t qtLayer   = log2TrSize - 2;
> -        uint32_t zorder    = cu->getZorderIdxInCU() + absPartIdx;
> +        uint32_t zorder    = cuData->encodeIdx + absPartIdx;
>           int16_t* reconQt   = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
>           X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
>           const uint32_t reconQtStride = MAX_CU_SIZE;
> @@ -656,7 +655,7 @@
>   }
>   
>   void Search::residualTransformQuantIntra(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
> -                                         ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2])
> +                                         ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], CU* cuData)
>   {
>       uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
>       uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
> @@ -686,14 +685,14 @@
>           uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
>           coeff_t* coeff        = cu->getCoeffY() + coeffOffsetY;
>   
> -        uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +        uint32_t zorder           = cuData->encodeIdx + absPartIdx;
>           pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>           uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
>   
>           bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
>   
>           // init availability pattern
> -        TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
> +        TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode, cuData);
>           // get prediction signal
>           predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
>   
> @@ -742,7 +741,7 @@
>   
>           for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
>           {
> -            residualTransformQuantIntra(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv, depthRange);
> +            residualTransformQuantIntra(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv, depthRange, cuData);
>               splitCbfY |= cu->getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
>           }
>   
> @@ -778,24 +777,24 @@
>       }
>   }
>   
> -void Search::xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride)
> +void Search::xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride, CU* cuData)
>   {
>       // copy reconstruction
>       int sizeIdx = log2TrSize - 2;
> -    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
>       pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>       uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
>       primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride, reconQt, reconQtStride);
>   }
>   
>   void Search::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId,
> -                                      int16_t* reconQt, uint32_t reconQtStride)
> +                                      int16_t* reconQt, uint32_t reconQtStride, CU* cuData)
>   {
>       X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
>   
>       // copy reconstruction
>       int sizeIdxC = log2TrSizeC - 2;
> -    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +    uint32_t zorder           = cuData->encodeIdx + absPartIdx;
>       pixel*   reconIPred       = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
>       uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>       primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride, reconQt, reconQtStride);
> @@ -839,7 +838,7 @@
>   
>   /* returns distortion */
>   uint32_t Search::xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
> -                                           uint32_t& psyEnergy)
> +                                           uint32_t& psyEnergy,CU* cuData)
>   {
>       uint32_t fullDepth = cu->getDepth(0) + trDepth;
>       uint32_t trMode    = cu->getTransformIdx(absPartIdx);
> @@ -897,7 +896,7 @@
>                   pixel*   pred        = predYuv->getChromaAddr(chromaId, absPartIdxC);
>   
>                   // init availability pattern
> -                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId);
> +                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId, cuData);
>                   pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
>   
>                   uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdxC);
> @@ -941,7 +940,7 @@
>   
>                           cu->setTransformSkipPartRange(chromaModeId, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
>   
> -                        singleDistCTmp = xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfCTmp, chromaId, log2TrSizeC);
> +                        singleDistCTmp = xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfCTmp, chromaId, log2TrSizeC, cuData);
>                           cu->setCbfPartRange(singleCbfCTmp << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
>   
>                           if (chromaModeId == 1 && !singleCbfCTmp)
> @@ -952,7 +951,7 @@
>                               uint32_t bitsTmp = singleCbfCTmp ? xGetIntraBitsChroma(cu, absPartIdxC, log2TrSizeC, chromaId, coeff) : 0;
>                               if (m_rdCost.m_psyRd)
>                               {
> -                                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> +                                uint32_t zorder = cuData->encodeIdx + absPartIdxC;
>                                   singlePsyEnergyTmp = m_rdCost.psyCost(log2TrSizeC - 2, fencYuv->getChromaAddr(chromaId, absPartIdxC), fencYuv->getCStride(),
>                                       cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getCStride());
>                                   singleCostTmp = m_rdCost.calcPsyRdCost(singleDistCTmp, bitsTmp, singlePsyEnergyTmp);
> @@ -977,7 +976,7 @@
>   
>                       if (bestModeId == firstCheckId)
>                       {
> -                        xLoadIntraResultChromaQT(cu, absPartIdxC, log2TrSizeC, chromaId, reconQt, reconQtStride);
> +                        xLoadIntraResultChromaQT(cu, absPartIdxC, log2TrSizeC, chromaId, reconQt, reconQtStride, cuData);
>                           cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
>                           m_entropyCoder->load(m_rdEntropyCoders[fullDepth][CI_TEMP_BEST]);
>                       }
> @@ -998,10 +997,10 @@
>                   else
>                   {
>                       cu->setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
> -                    outDist += xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffC, singleCbfC, chromaId, log2TrSizeC);
> +                    outDist += xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffC, singleCbfC, chromaId, log2TrSizeC, cuData);
>                       if (m_rdCost.m_psyRd)
>                       {
> -                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> +                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
>                           singlePsyEnergyTmp = m_rdCost.psyCost(log2TrSizeC - 2, fencYuv->getChromaAddr(chromaId, absPartIdxC), fencYuv->getCStride(),
>                               cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getCStride());
>                       }
> @@ -1026,7 +1025,7 @@
>           for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
>           {
>               uint32_t psyEnergyTemp = 0;
> -            outDist += xRecurIntraChromaCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, psyEnergyTemp);
> +            outDist += xRecurIntraChromaCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, psyEnergyTemp, cuData);
>               splitPsyEnergy += psyEnergyTemp;
>               splitCbfU |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
>               splitCbfV |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
> @@ -1090,7 +1089,7 @@
>   }
>   
>   void Search::residualQTIntraChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx,
> -                                   TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
> +                                   TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, CU* cuData)
>   {
>       uint32_t fullDepth = cu->getDepth(0) + trDepth;
>       uint32_t trMode    = cu->getTransformIdx(absPartIdx);
> @@ -1134,7 +1133,7 @@
>                   pixel*   recon          = reconYuv->getChromaAddr(chromaId, absPartIdxC);
>                   uint32_t coeffOffsetC   = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift));
>                   coeff_t* coeff          = cu->getCoeff(ttype) + coeffOffsetC;
> -                uint32_t zorder         = cu->getZorderIdxInCU() + absPartIdxC;
> +                uint32_t zorder         = cuData->encodeIdx + absPartIdxC;
>                   pixel*   reconIPred     = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
>                   uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>   
> @@ -1148,7 +1147,7 @@
>                       chromaPredMode = cu->getLumaIntraDir((m_csp == X265_CSP_I444) ? absPartIdxC : 0);
>                   chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
>                   // init availability pattern
> -                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId);
> +                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId, cuData);
>                   pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
>   
>                   // get prediction signal
> @@ -1196,7 +1195,7 @@
>           uint32_t absPartIdxSub = absPartIdx;
>           for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
>           {
> -            residualQTIntraChroma(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv);
> +            residualQTIntraChroma(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, reconYuv, cuData);
>               splitCbfU |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
>               splitCbfV |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
>           }
> @@ -1209,7 +1208,7 @@
>       }
>   }
>   
> -void Search::estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2])
> +void Search::estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], CU* cuData)
>   {
>       uint32_t depth        = cu->getDepth(0);
>       uint32_t initTrDepth  = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
> @@ -1226,7 +1225,7 @@
>       for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
>       {
>           // Reference sample smoothing
> -        TComPattern::initAdiPattern(cu, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX);
> +        TComPattern::initAdiPattern(cu, partOffset, initTrDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, ALL_IDX, cuData);
>   
>           // determine set of modes to be tested (using prediction signal only)
>           pixel*   fenc   = fencYuv->getLumaAddr(partOffset);
> @@ -1346,7 +1345,7 @@
>               cu->setLumaIntraDirSubParts(rdModeList[i], partOffset, depth + initTrDepth);
>               cost = bits = 0;
>               uint32_t psyEnergy = 0;
> -            xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, bits, psyEnergy, depthRange);
> +            xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, false, cost, bits, psyEnergy, depthRange, cuData);
>               COPY2_IF_LT(bcost, cost, bmode, rdModeList[i]);
>           }
>   
> @@ -1356,14 +1355,14 @@
>   
>           uint32_t psyEnergy = 0;
>           // update distortion (rate and r-d costs are determined later)
> -        cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, bits, psyEnergy, depthRange);
> +        cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, cost, bits, psyEnergy, depthRange, cuData);
>   
>           xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
>   
>           // set reconstruction for next intra prediction blocks
>           if (pu != numPU - 1)
>           {
> -            uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
> +            uint32_t zorder      = cuData->encodeIdx + partOffset;
>               pixel*   dst         = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>               pixel*   src         = reconYuv->getLumaAddr(partOffset);
>               primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
> @@ -1387,7 +1386,7 @@
>       x265_emms();
>   }
>   
> -void Search::sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes)
> +void Search::sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes, CU* cuData)
>   {
>       uint32_t depth       = cu->getDepth(0);
>       uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
> @@ -1411,12 +1410,12 @@
>   
>           uint32_t psyEnergy = 0;
>           // update overall distortion (rate and r-d costs are determined later)
> -        cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, bits, psyEnergy, depthRange);
> +        cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, bits, psyEnergy, depthRange, cuData);
>           xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
>   
>           if (pu != numPU - 1)
>           {
> -            uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
> +            uint32_t zorder      = cuData->encodeIdx + partOffset;
>               pixel*   dst         = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>               pixel*   src         = reconYuv->getLumaAddr(partOffset);
>               primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
> @@ -1443,7 +1442,7 @@
>       m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>   }
>   
> -void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv)
> +void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, CU* cuData)
>   {
>       uint32_t bestMode  = 0;
>       uint64_t bestCost  = MAX_INT64;
> @@ -1463,8 +1462,8 @@
>       int32_t sizeIdx = log2TrSizeC - 2;
>       pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>   
> -    TComPattern::initAdiPatternChroma(cu, 0, 0, m_predBuf, 1);
> -    TComPattern::initAdiPatternChroma(cu, 0, 0, m_predBuf, 2);
> +    TComPattern::initAdiPatternChroma(cu, 0, 0, m_predBuf, 1, cuData);
> +    TComPattern::initAdiPatternChroma(cu, 0, 0, m_predBuf, 2, cuData);
>       cu->getAllowedChromaDir(0, modeList);
>   
>       // check chroma modes
> @@ -1496,7 +1495,7 @@
>       cu->setChromIntraDirSubParts(bestMode, 0, cu->getDepth(0));
>   }
>   
> -void Search::estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
> +void Search::estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv,CU* cuData)
>   {
>       uint32_t depth       = cu->getDepth(0);
>       uint32_t initTrDepth = (cu->getPartitionSize(0) != SIZE_2Nx2N) && (cu->getChromaFormat() == X265_CSP_I444 ? 1 : 0);
> @@ -1532,7 +1531,7 @@
>               cu->setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
>   
>               uint32_t psyEnergy = 0;
> -            uint32_t dist = xRecurIntraChromaCodingQT(cu, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, psyEnergy);
> +            uint32_t dist = xRecurIntraChromaCodingQT(cu, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, psyEnergy, cuData);
>   
>               if (cu->m_slice->m_pps->bTransformSkipEnabled)
>                   m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> @@ -1559,7 +1558,7 @@
>   
>           if (!tuIterator.isLastSection())
>           {
> -            uint32_t zorder      = cu->getZorderIdxInCU() + absPartIdxC;
> +            uint32_t zorder      = cuData->encodeIdx + absPartIdxC;
>               uint32_t dststride   = cu->m_pic->getPicYuvRec()->getCStride();
>               uint32_t srcstride   = reconYuv->getCStride();
>               pixel *src, *dst;
> @@ -1605,7 +1604,7 @@
>   }
>   
>   /* estimation of best merge coding */
> -uint32_t Search::mergeEstimation(TComDataCU* cu, int puIdx, MergeData& m)
> +uint32_t Search::mergeEstimation(TComDataCU* cu, int puIdx, MergeData& m, CU* cuData)
>   {
>       X265_CHECK(cu->getPartitionSize(0) != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n");
>   
> @@ -1639,7 +1638,7 @@
>           cu->getCUMvField(REF_PIC_LIST_1)->m_mv[m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
>           cu->getCUMvField(REF_PIC_LIST_1)->m_refIdx[m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
>   
> -        prepMotionCompensation(cu, puIdx);
> +        prepMotionCompensation(cu, puIdx, cuData);
>           motionCompensation(&m_predTempYuv, true, false);
>           uint32_t costCand = m_me.bufSATD(m_predTempYuv.getLumaAddr(m.absPartIdx), m_predTempYuv.getStride());
>           uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
> @@ -1661,7 +1660,7 @@
>   
>   /* search of the best candidate for inter prediction
>    * returns true if predYuv was filled with a motion compensated prediction */
> -bool Search::predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bMergeOnly, bool bChroma)
> +bool Search::predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bMergeOnly, bool bChroma, CU* cuData)
>   {
>       MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
>       MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
> @@ -1690,7 +1689,7 @@
>           int      roiWidth, roiHeight;
>           cu->getPartIndexAndSize(partIdx, partAddr, roiWidth, roiHeight);
>   
> -        pixel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
> +        pixel* pu = fenc->getLumaAddr(cu->getAddr(), cuData->encodeIdx + partAddr);
>           m_me.setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
>   
>           uint32_t mrgCost = MAX_UINT;
> @@ -1701,7 +1700,7 @@
>               merge.absPartIdx = partAddr;
>               merge.width = roiWidth;
>               merge.height = roiHeight;
> -            mrgCost = mergeEstimation(cu, partIdx, merge);
> +            mrgCost = mergeEstimation(cu, partIdx, merge, cuData);
>   
>               if (bMergeOnly && cu->getLog2CUSize(0) > 3)
>               {
> @@ -1719,7 +1718,7 @@
>                   cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(merge.mvField[1], partSize, partAddr, 0, partIdx);
>                   totalmebits += merge.bits;
>   
> -                prepMotionCompensation(cu, partIdx);
> +                prepMotionCompensation(cu, partIdx, cuData);
>                   motionCompensation(predYuv, true, bChroma);
>                   continue;
>               }
> @@ -1761,7 +1760,7 @@
>   
>                       cu->clipMv(mvCand);
>   
> -                    prepMotionCompensation(cu, partIdx);
> +                    prepMotionCompensation(cu, partIdx, cuData);
>                       predInterLumaBlk(slice->m_refPicList[l][ref]->getPicYuvRec(), &m_predTempYuv, &mvCand);
>                       uint32_t cost = m_me.bufSAD(m_predTempYuv.getLumaAddr(partAddr), m_predTempYuv.getStride());
>                       cost = (uint32_t)m_rdCost.calcRdSADCost(cost, MVP_IDX_BITS);
> @@ -1809,7 +1808,7 @@
>               TComPicYuv *refPic0 = slice->m_refPicList[0][list[0].ref]->getPicYuvRec();
>               TComPicYuv *refPic1 = slice->m_refPicList[1][list[1].ref]->getPicYuvRec();
>               
> -            prepMotionCompensation(cu, partIdx);
> +            prepMotionCompensation(cu, partIdx, cuData);
>               predInterLumaBlk(refPic0, &m_predYuv[0], &list[0].mv);
>               predInterLumaBlk(refPic1, &m_predYuv[1], &list[1].mv);
>   
> @@ -1935,7 +1934,7 @@
>   
>               totalmebits += list[1].bits;
>           }
> -        prepMotionCompensation(cu, partIdx);
> +        prepMotionCompensation(cu, partIdx, cuData);
>           motionCompensation(predYuv, true, bChroma);
>       }
>   
> @@ -2095,7 +2094,7 @@
>   
>   /** encode residual and calculate rate-distortion for a CU block */
>   void Search::encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv,
> -                                       ShortYuv* outBestResiYuv, TComYuv* outReconYuv)
> +                                       ShortYuv* outBestResiYuv, TComYuv* outReconYuv, CU* cuData)
>   {
>       X265_CHECK(!cu->isIntra(0), "intra CU not expected\n");
>   
> @@ -2138,7 +2137,7 @@
>           uint64_t cost = 0;
>           uint32_t zeroDistortion = 0;
>           uint32_t bits = 0;
> -        uint32_t distortion = xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, &zeroDistortion, tuDepthRange);
> +        uint32_t distortion = xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, &zeroDistortion, tuDepthRange, cuData);
>   
>           m_entropyCoder->resetBits();
>           m_entropyCoder->codeQtRootCbfZero();
> @@ -2208,7 +2207,7 @@
>           m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
>           uint64_t cost = 0;
>           uint32_t bits = 0;
> -        xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, NULL, tuDepthRange);
> +        xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, NULL, tuDepthRange, cuData);
>           xSetResidualQTData(cu, 0, NULL, depth, false);
>           m_entropyCoder->store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
>       }
> @@ -2243,7 +2242,7 @@
>           cu->clearCbf(0, depth);
>   }
>   
> -void Search::generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv)
> +void Search::generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, CU* cuData)
>   {
>       m_quant.setQPforQuant(cu);
>   
> @@ -2252,7 +2251,7 @@
>   
>       if (cu->getPredictionMode(0) == MODE_INTER)
>       {
> -        residualTransformQuantInter(cu, 0, fencYuv, resiYuv, cu->getDepth(0), tuDepthRange);
> +        residualTransformQuantInter(cu, 0, fencYuv, resiYuv, cu->getDepth(0), tuDepthRange, cuData);
>           if (cu->getQtRootCbf(0))
>               reconYuv->addClip(predYuv, resiYuv, cu->getLog2CUSize(0));
>           else
> @@ -2265,14 +2264,14 @@
>       else if (cu->getPredictionMode(0) == MODE_INTRA)
>       {
>           uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
> -        residualTransformQuantIntra(cu, initTrDepth, 0, fencYuv, predYuv, resiYuv, reconYuv, tuDepthRange);
> -        getBestIntraModeChroma(cu, fencYuv, predYuv);
> -        residualQTIntraChroma(cu, 0, 0, fencYuv, predYuv, resiYuv, reconYuv);
> +        residualTransformQuantIntra(cu, initTrDepth, 0, fencYuv, predYuv, resiYuv, reconYuv, tuDepthRange, cuData);
> +        getBestIntraModeChroma(cu, fencYuv, predYuv, cuData);
> +        residualQTIntraChroma(cu, 0, 0, fencYuv, predYuv, resiYuv, reconYuv, cuData);
>       }
>   }
>   
>   void Search::residualTransformQuantInter(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv,
> -                                         const uint32_t depth, uint32_t depthRange[2])
> +                                         const uint32_t depth, uint32_t depthRange[2], CU* cuData)
>   {
>       X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "invalid depth\n");
>       const uint32_t trMode = depth - cu->getDepth(0);
> @@ -2382,7 +2381,9 @@
>       {
>           const uint32_t qPartNumSubdiv = cu->m_pic->getNumPartInCU() >> ((depth + 1) << 1);
>           for (uint32_t i = 0; i < 4; ++i)
> -            residualTransformQuantInter(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, resiYuv, depth + 1, depthRange);
> +        {
> +            residualTransformQuantInter(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, resiYuv, depth + 1, depthRange, cuData);
> +        }
>   
>           uint32_t ycbf = 0;
>           uint32_t ucbf = 0;
> @@ -2404,7 +2405,7 @@
>   }
>   
>   uint32_t Search::xEstimateResidualQT(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
> -                                     uint32_t depth, uint64_t& rdCost, uint32_t& outBits, uint32_t* outZeroDist, uint32_t depthRange[2])
> +                                     uint32_t depth, uint64_t& rdCost, uint32_t& outBits, uint32_t* outZeroDist, uint32_t depthRange[2], CU* cuData)
>   {
>       X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "depth not matching\n");
>       const uint32_t trMode = depth - cu->getDepth(0);
> @@ -2571,7 +2572,7 @@
>               if (m_rdCost.m_psyRd)
>               {
>                   pixel*   pred = predYuv->getLumaAddr(absPartIdx);
> -                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> +                uint32_t zorder = cuData->encodeIdx + absPartIdx;
>                   pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>                   uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
>                   uint32_t stride = fencYuv->getStride();
> @@ -2670,7 +2671,7 @@
>                       if (m_rdCost.m_psyRd)
>                       {
>                           pixel*   pred = predYuv->getCbAddr(absPartIdxC);
> -                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> +                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
>                           pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
>                           uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>                           uint32_t stride = fencYuv->getCStride();
> @@ -2752,7 +2753,7 @@
>                       if (m_rdCost.m_psyRd)
>                       {
>                           pixel*   pred = predYuv->getCrAddr(absPartIdxC);
> -                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> +                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
>                           pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>                           uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>                           uint32_t stride = fencYuv->getCStride();
> @@ -2858,7 +2859,7 @@
>                   if (m_rdCost.m_psyRd)
>                   {
>                       pixel*   pred = predYuv->getLumaAddr(absPartIdx);
> -                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> +                    uint32_t zorder = cuData->encodeIdx + absPartIdx;
>                       pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>                       uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
>                       uint32_t stride = fencYuv->getStride();
> @@ -2946,7 +2947,7 @@
>                       if (m_rdCost.m_psyRd)
>                       {
>                           pixel*   pred = predYuv->getCbAddr(absPartIdxC);
> -                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> +                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
>                           pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
>                           uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>                           uint32_t stride = fencYuv->getCStride();
> @@ -2987,7 +2988,7 @@
>                       if (m_rdCost.m_psyRd)
>                       {
>                           pixel*   pred = predYuv->getCrAddr(absPartIdxC);
> -                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> +                        uint32_t zorder = cuData->encodeIdx + absPartIdxC;
>                           pixel*   reconIPred = cu->m_pic->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>                           uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
>                           uint32_t stride = fencYuv->getCStride();
> @@ -3130,7 +3131,7 @@
>           for (uint32_t i = 0; i < 4; ++i)
>           {
>               cu->m_psyEnergy = 0;
> -            subdivDist += xEstimateResidualQT(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, predYuv, resiYuv, depth + 1, subDivCost, subdivBits, bCheckFull ? NULL : outZeroDist, depthRange);
> +            subdivDist += xEstimateResidualQT(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, predYuv, resiYuv, depth + 1, subDivCost, subdivBits, bCheckFull ? NULL : outZeroDist, depthRange, cuData);
>               subDivPsyEnergy += cu->m_psyEnergy;
>           }
>   
> diff -r 7dccbbed0349 -r 2048b3e3c064 source/encoder/search.h
> --- a/source/encoder/search.h	Wed Sep 24 18:26:45 2014 -0500
> +++ b/source/encoder/search.h	Fri Sep 26 14:06:01 2014 +0530
> @@ -79,19 +79,19 @@
>   
>       bool     initSearch(x265_param *param, ScalingList& scalingList);
>   
> -    void     estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
> -    void     sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes);
> -    void     estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
> +    void     estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], CU* cuData);
> +    void     sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes, CU* cuData);
> +    void     estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, CU* cuData);
>   
>       // estimation inter prediction (non-skip)
> -    bool     predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bMergeOnly, bool bChroma);
> +    bool     predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bMergeOnly, bool bChroma, CU* cuData);
>   
>       // encode residual and compute rd-cost for inter mode
> -    void     encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, ShortYuv* bestResiYuv, TComYuv* reconYuv);
> +    void     encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, ShortYuv* bestResiYuv, TComYuv* reconYuv, CU* cuData);
>       void     encodeResAndCalcRdSkipCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, TComYuv* reconYuv);
>   
> -    void     generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
> -    void     residualTransformQuantInter(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv, uint32_t depth, uint32_t depthRange[2]);
> +    void     generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, CU* cuData);
> +    void     residualTransformQuantInter(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, ShortYuv* resiYuv, uint32_t depth, uint32_t depthRange[2], CU* cuData);
>   
>       uint32_t getIntraModeBits(TComDataCU* cu, uint32_t mode, uint32_t partOffset, uint32_t depth);
>       uint32_t getIntraRemModeBits(TComDataCU * cu, uint32_t partOffset, uint32_t depth, uint32_t preds[3], uint64_t& mpms);
> @@ -110,31 +110,31 @@
>       uint32_t xGetIntraBitsLuma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t log2TrSize, coeff_t* coeff, uint32_t depthRange[2]);
>       uint32_t xGetIntraBitsChroma(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, coeff_t* coeff);
>       uint32_t xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
> -                                 int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf);
> +                                 int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, CU* cuData);
>   
>       uint32_t xEstimateResidualQT(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, uint32_t depth,
> -                                 uint64_t &rdCost, uint32_t &outBits, uint32_t *zeroDist, uint32_t tuDepthRange[2]);
> +                                 uint64_t &rdCost, uint32_t &outBits, uint32_t *zeroDist, uint32_t tuDepthRange[2], CU* cuData);
>   
>       uint32_t xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
> -                                 ShortYuv* resiYuv, bool bAllowRQTSplit, uint64_t& dRDCost, uint32_t& puBits, uint32_t& psyEnergy, uint32_t depthRange[2]);
> +                                 ShortYuv* resiYuv, bool bAllowRQTSplit, uint64_t& dRDCost, uint32_t& puBits, uint32_t& psyEnergy, uint32_t depthRange[2], CU* cuData);
>   
>       uint32_t xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
> -                                       uint32_t& psyEnergy);
> +                                       uint32_t& psyEnergy, CU* cuData);
>   
>       uint32_t xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
> -                                   int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, uint32_t chromaId, uint32_t log2TrSizeC);
> +                                   int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff, uint32_t& cbf, uint32_t chromaId, uint32_t log2TrSizeC, CU* cuData);
>   
>       void     residualTransformQuantIntra(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
> -                                         TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
> +                                         TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], CU* cuData);
>   
>       void     residualQTIntraChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
> -                                   TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
> +                                   TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, CU* cuData);
>   
>       void     xEncodeResidualQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]);
>       void     xSetIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* reconYuv);
>   
> -    void     xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride);
> -    void     xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, int16_t* reconQt, uint32_t reconQtStride);
> +    void     xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, int16_t* reconQt, uint32_t reconQtStride, CU* cuData);
> +    void     xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, int16_t* reconQt, uint32_t reconQtStride, CU* cuData);
>   
>       void     offsetSubTUCBFs(TComDataCU* cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx);
>   
> @@ -171,13 +171,13 @@
>       void     checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost);
>       void     getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
>       uint32_t getInterSymbolBits(TComDataCU* cu, uint32_t depthRange[2]);
> -    uint32_t mergeEstimation(TComDataCU* cu, int partIdx, MergeData& m);
> +    uint32_t mergeEstimation(TComDataCU* cu, int partIdx, MergeData& m, CU* cuData);
>       void     setSearchRange(TComDataCU* cu, MV mvp, int merange, MV& mvmin, MV& mvmax);
>   
>       /* intra helper functions */
>       enum { MAX_RD_INTRA_MODES = 16 };
>       void     updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
> -    void     getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv);
> +    void     getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, CU* cuData);
>   };
>   }
>   
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel




More information about the x265-devel mailing list