[x265] refine intra neighbors

Satoshi Nakagawa nakagawa424 at oki.com
Thu Dec 25 05:26:10 CET 2014


code maintainability may be improved.

# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1419480956 -32400
#      Thu Dec 25 13:15:56 2014 +0900
# Node ID d400c836b3796e68bb08538a5c20f16f8966ee18
# Parent  5f9f7194267b76f733e9ffb0f9e8b474dfe89a71
refine intra neighbors

diff -r 5f9f7194267b -r d400c836b379 source/common/common.h
--- a/source/common/common.h	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/common/common.h	Thu Dec 25 13:15:56 2014 +0900
@@ -163,6 +163,9 @@
 template<typename T>
 inline T x265_max(T a, T b) { return a > b ? a : b; }
 
+template<typename T>
+inline T x265_clip3(T minVal, T maxVal, T a) { return x265_min(x265_max(minVal, a), maxVal); }
+
 typedef int16_t  coeff_t;      // transform coefficient
 
 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
diff -r 5f9f7194267b -r d400c836b379 source/common/cudata.cpp
--- a/source/common/cudata.cpp	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/common/cudata.cpp	Thu Dec 25 13:15:56 2014 +0900
@@ -608,7 +608,7 @@
         {
             if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1])
             {
-                uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
+                uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
                 arPartUnitIdx = g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1];
                 if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, s_numPartInCUSize))
                     return m_encData->getPicCTU(m_cuAddr);
@@ -689,8 +689,6 @@
             return NULL;
         }
         blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (1 + partUnitOffset) * s_numPartInCUSize - 1];
-        if (!m_cuLeft || !m_cuLeft->m_slice)
-            return NULL;
         return m_cuLeft;
     }
 
@@ -723,8 +721,6 @@
             return NULL;
         }
         arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset];
-        if (!m_cuAbove || !m_cuAbove->m_slice)
-            return NULL;
         return m_cuAbove;
     }
 
@@ -732,8 +728,6 @@
         return NULL;
 
     arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
-    if ((m_cuAboveRight == NULL || m_cuAboveRight->m_slice == NULL || (m_cuAboveRight->m_cuAddr) > m_cuAddr))
-        return NULL;
     return m_cuAboveRight;
 }
 
@@ -904,7 +898,7 @@
     tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
     tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
 
-    tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag), tuDepthRange[1]));
+    tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1], log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag));
 }
 
 void CUData::getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const
@@ -916,7 +910,7 @@
     tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
     tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
 
-    tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag), tuDepthRange[1]));
+    tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1], log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag));
 }
 
 uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const
@@ -1363,14 +1357,6 @@
     return outPartIdxRB;
 }
 
-void CUData::deriveLeftRightTopIdxAdi(uint32_t& outPartIdxLT, uint32_t& outPartIdxRT, uint32_t partOffset, uint32_t partDepth) const
-{
-    uint32_t numPartInWidth = 1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE - partDepth);
-
-    outPartIdxLT = m_absIdxInCTU + partOffset;
-    outPartIdxRT = g_rasterToZscan[g_zscanToRaster[outPartIdxLT] + numPartInWidth - 1];
-}
-
 bool CUData::hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const
 {
     if (m_interDir[absPartIdx] != candCU.m_interDir[candAbsPartIdx])
diff -r 5f9f7194267b -r d400c836b379 source/common/cudata.h
--- a/source/common/cudata.h	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/common/cudata.h	Thu Dec 25 13:15:56 2014 +0900
@@ -212,7 +212,6 @@
 
     void     getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const;
     int      getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const;
-    void     deriveLeftRightTopIdxAdi(uint32_t& partIdxLT, uint32_t& partIdxRT, uint32_t partOffset, uint32_t partDepth) const;
 
     uint32_t getSCUAddr() const                  { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; }
     uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
diff -r 5f9f7194267b -r d400c836b379 source/common/predict.cpp
--- a/source/common/predict.cpp	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/common/predict.cpp	Thu Dec 25 13:15:56 2014 +0900
@@ -654,11 +654,8 @@
     }
 }
 
-void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode)
+void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
 {
-    IntraNeighbors intraNeighbors;
-    initIntraNeighbors(cu, absPartIdx, partDepth, true, &intraNeighbors);
-
     pixel* adiBuf      = m_predBuf;
     pixel* refAbove    = m_refAbove;
     pixel* refLeft     = m_refLeft;
@@ -700,12 +697,12 @@
             int refTL = refAbove[0];
             int refTR = refAbove[trSize2];
             bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize]) < threshold &&
-                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
+                                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
 
             if (bStrongSmoothing)
             {
                 // bilinear interpolation
-                const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
+                const int shift = 5 + 1; // log2TrSize + 1;
                 int init = (refTL << shift) + tuSize;
                 int delta;
 
@@ -738,10 +735,8 @@
     }
 }
 
-void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId)
+void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    IntraNeighbors intraNeighbors;
-    initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
 
     const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
@@ -751,9 +746,9 @@
     fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
 }
 
-void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
+void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
 {
-    uint32_t log2TrSize = cu.m_log2CUSize[0] - partDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
     int log2UnitWidth = LOG2_UNIT_SIZE;
     int log2UnitHeight = LOG2_UNIT_SIZE;
 
@@ -764,12 +759,12 @@
         log2UnitHeight -= cu.m_vChromaShift;
     }
 
-    int   numIntraNeighbor = 0;
+    int numIntraNeighbor;
     bool* bNeighborFlags = intraNeighbors->bNeighborFlags;
 
-    uint32_t partIdxLT, partIdxRT, partIdxLB;
-
-    cu.deriveLeftRightTopIdxAdi(partIdxLT, partIdxRT, absPartIdx, partDepth);
+    uint32_t numPartInWidth = 1 << (cu.m_log2CUSize[0] - LOG2_UNIT_SIZE - tuDepth);
+    uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
+    uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + numPartInWidth - 1];
 
     uint32_t tuSize = 1 << log2TrSize;
     int  tuWidthInUnits = tuSize >> log2UnitWidth;
@@ -777,14 +772,26 @@
     int  aboveUnits = tuWidthInUnits << 1;
     int  leftUnits = tuHeightInUnits << 1;
     int  partIdxStride = cu.m_slice->m_sps->numPartInCUSize;
-    partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
+    uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
 
-    bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT);
-    numIntraNeighbor += (int)(bNeighborFlags[leftUnits]);
-    numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1));
-    numIntraNeighbor += isAboveRightAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1 + tuWidthInUnits));
-    numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1));
-    numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1 - tuHeightInUnits));
+    if (cu.m_slice->isIntra() || !cu.m_slice->m_pps->bConstrainedIntraPred)
+    {
+        bNeighborFlags[leftUnits] = isAboveLeftAvailable<false>(cu, partIdxLT);
+        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
+        numIntraNeighbor += isAboveAvailable<false>(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
+        numIntraNeighbor += isAboveRightAvailable<false>(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
+        numIntraNeighbor += isLeftAvailable<false>(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
+        numIntraNeighbor += isBelowLeftAvailable<false>(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
+    }
+    else
+    {
+        bNeighborFlags[leftUnits] = isAboveLeftAvailable<true>(cu, partIdxLT);
+        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
+        numIntraNeighbor += isAboveAvailable<true>(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
+        numIntraNeighbor += isAboveRightAvailable<true>(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
+        numIntraNeighbor += isLeftAvailable<true>(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
+        numIntraNeighbor += isBelowLeftAvailable<true>(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
+    }
 
     intraNeighbors->numIntraNeighbor = numIntraNeighbor;
     intraNeighbors->totalUnits = aboveUnits + leftUnits + 1;
@@ -793,7 +800,6 @@
     intraNeighbors->unitWidth = 1 << log2UnitWidth;
     intraNeighbors->unitHeight = 1 << log2UnitHeight;
     intraNeighbors->tuSize = tuSize;
-    intraNeighbors->log2TrSize = log2TrSize;
 }
 
 void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
@@ -948,112 +954,100 @@
     }
 }
 
+template<bool cip>
 bool Predict::isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT)
 {
     uint32_t partAboveLeft;
     const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft, partIdxLT);
 
-    if (!cu.m_slice->m_pps->bConstrainedIntraPred)
-        return cuAboveLeft ? true : false;
-    else
-        return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
+    return cuAboveLeft && (!cip || cuAboveLeft->isIntra(partAboveLeft));
 }
 
+template<bool cip>
 int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
-    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1;
+    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
     const uint32_t idxStep = 1;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
-    for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
+    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags++)
     {
         uint32_t partAbove;
         const CUData* cuAbove = cu.getPUAbove(partAbove, g_rasterToZscan[rasterPart]);
-        if (cuAbove && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAbove->isIntra(partAbove)))
+        if (cuAbove && (!cip || cuAbove->isIntra(partAbove)))
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr++;
+            *bValidFlags = false;
     }
 
     return numIntra;
 }
 
+template<bool cip>
 int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
-    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1;
+    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
     const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
-    for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
+    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
     {
         uint32_t partLeft;
         const CUData* cuLeft = cu.getPULeft(partLeft, g_rasterToZscan[rasterPart]);
-        if (cuLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuLeft->isIntra(partLeft)))
+        if (cuLeft && (!cip || cuLeft->isIntra(partLeft)))
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr--; // opposite direction
+            *bValidFlags = false;
     }
 
     return numIntra;
 }
 
-int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
+template<bool cip>
+int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits)
 {
-    const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
-    for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
+    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
     {
         uint32_t partAboveRight;
         const CUData* cuAboveRight = cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
-        if (cuAboveRight && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAboveRight->isIntra(partAboveRight)))
+        if (cuAboveRight && (!cip || cuAboveRight->isIntra(partAboveRight)))
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr++;
+            *bValidFlags = false;
     }
 
     return numIntra;
 }
 
-int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
+template<bool cip>
+int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits)
 {
-    const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;
 
-    for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
+    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags--) // opposite direction
     {
         uint32_t partBelowLeft;
         const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft, partIdxLB, offset);
-        if (cuBelowLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuBelowLeft->isIntra(partBelowLeft)))
+        if (cuBelowLeft && (!cip || cuBelowLeft->isIntra(partBelowLeft)))
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr--; // opposite direction
+            *bValidFlags = false;
     }
 
     return numIntra;
diff -r 5f9f7194267b -r d400c836b379 source/common/predict.h
--- a/source/common/predict.h	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/common/predict.h	Thu Dec 25 13:15:56 2014 +0900
@@ -57,7 +57,6 @@
         int      unitWidth;
         int      unitHeight;
         int      tuSize;
-        uint32_t log2TrSize;
         bool     bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
     };
 
@@ -105,14 +104,19 @@
     void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
 
     /* Intra prediction helper functions */
-    static void initIntraNeighbors(const CUData& cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
+    static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
     static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);
 
+    template<bool cip>
     static bool isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT);
+    template<bool cip>
     static int  isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
+    template<bool cip>
     static int  isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
-    static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
-    static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
+    template<bool cip>
+    static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
+    template<bool cip>
+    static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);
 
 public:
 
@@ -125,8 +129,8 @@
     void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
     void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
 
-    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode);
-    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId);
+    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
     pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize)
     {
         return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE * (tuSize * 2 + 1));
diff -r 5f9f7194267b -r d400c836b379 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/encoder/analysis.cpp	Thu Dec 25 13:15:56 2014 +0900
@@ -914,7 +914,7 @@
                         cu.getInterTUQtDepthRange(tuDepthRange, 0);
 
                         m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
-                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         if (cu.getQtRootCbf(0))
                             md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                         else
@@ -938,8 +938,7 @@
                         uint32_t tuDepthRange[2];
                         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
-                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
-                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+                        residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         getBestIntraModeChroma(*md.bestMode, cuGeom);
                         residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                         md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
@@ -1702,8 +1701,7 @@
         uint32_t tuDepthRange[2];
         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
-        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
-        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
         getBestIntraModeChroma(*bestMode, cuGeom);
         residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
     }
@@ -1736,7 +1734,7 @@
         uint32_t tuDepthRange[2];
         cu.getInterTUQtDepthRange(tuDepthRange, 0);
 
-        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);
 
         if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
             cu.setPredModeSubParts(MODE_SKIP);
diff -r 5f9f7194267b -r d400c836b379 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/encoder/search.cpp	Thu Dec 25 13:15:56 2014 +0900
@@ -239,7 +239,8 @@
 
 void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
 {
-    uint32_t fullDepth  = mode.cu.m_cuDepth[0] + tuDepth;
+    CUData& cu = mode.cu;
+    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t qtLayer    = log2TrSize - 2;
     uint32_t sizeIdx    = log2TrSize - 2;
@@ -253,8 +254,6 @@
         mightSplit = true;
     }
 
-    CUData& cu = mode.cu;
-
     Cost fullCost;
     uint32_t bCBF = 0;
 
@@ -273,7 +272,9 @@
 
         // init availability pattern
         uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
 
         // get prediction signal
         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
@@ -365,7 +366,7 @@
             m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
         }
 
-        // code split block
+        /* code split block */
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
 
         int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
@@ -451,11 +452,13 @@
     pixel*   pred = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
     uint32_t stride = fencYuv->m_size;
-    int      sizeIdx = log2TrSize - 2;
+    uint32_t sizeIdx = log2TrSize - 2;
 
     // init availability pattern
     uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-    initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
+    IntraNeighbors intraNeighbors;
+    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
 
     // get prediction signal
     predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
@@ -597,13 +600,12 @@
 }
 
 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
-
-    uint32_t fullDepth   = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
-    bool     bCheckFull  = log2TrSize <= depthRange[1];
+    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    bool     bCheckFull = log2TrSize <= depthRange[1];
 
     X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
 
@@ -614,28 +616,36 @@
 
     if (bCheckFull)
     {
-        const pixel* fenc  = mode.fencYuv->getLumaAddr(absPartIdx);
-        pixel*   pred      = mode.predYuv.getLumaAddr(absPartIdx);
-        int16_t* residual  = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
+        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
+        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
+        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
+        uint32_t stride   = mode.fencYuv->m_size;
+
+        // init availability pattern
+        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
+
+        // get prediction signal
+        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
+
+        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
+        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
+
+        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
+        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
+
+        uint32_t sizeIdx   = log2TrSize - 2;
+        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
+
         pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
         intptr_t picStride = m_frame->m_reconPic->m_stride;
-        uint32_t stride    = mode.fencYuv->m_size;
-        uint32_t sizeIdx   = log2TrSize - 2;
-        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
-        coeff_t* coeff        = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
-
-        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
-        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
-
-        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
-        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
-
-        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
-        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
+
+        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
         {
-            m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
+            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
             primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
             cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
         }
@@ -654,11 +664,11 @@
         uint32_t cbf = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
+            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
         for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
+            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
     }
 }
 
@@ -739,15 +749,14 @@
         }
         for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
         {
-            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
-            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
+            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
 
         return outDist;
     }
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
     uint32_t tuDepthC = tuDepth;
     if (log2TrSizeC < 2)
     {
@@ -766,46 +775,48 @@
     if (checkTransformSkip)
         return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
 
+    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t tuSize = 1 << log2TrSizeC;
+    uint32_t stride = mode.fencYuv->m_csize;
+    const uint32_t sizeIdxC = log2TrSizeC - 2;
     uint32_t outDist = 0;
 
     uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
-    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+    do
     {
-        TextType ttype = (TextType)chromaId;
-
-        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
-        do
+        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+            TextType ttype = (TextType)chromaId;
 
             const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
             pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
-            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
-            uint32_t stride   = mode.fencYuv->m_csize;
-            uint32_t sizeIdxC = log2TrSizeC - 2;
-
+            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
             coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
-
             pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
             intptr_t picStride = m_frame->m_reconPic->m_strideC;
 
-            // init availability pattern
-            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
-            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
-
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
             if (chromaPredMode == DM_CHROMA_IDX)
                 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
             if (m_csp == X265_CSP_I422)
                 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
 
+            // init availability pattern
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
+            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+
             // get prediction signal
             predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
 
@@ -833,10 +844,13 @@
 
             primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
         }
-        while (tuIterator.isNextSection());
-
-        if (splitType == VERTICAL_SPLIT)
-            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
+    }
+    while (tuIterator.isNextSection());
+
+    if (splitType == VERTICAL_SPLIT)
+    {
+        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
     }
 
     return outDist;
@@ -864,14 +878,17 @@
     uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
-    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+    do
     {
-        TextType ttype = (TextType)chromaId;
-
-        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
-        do
+        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+            TextType ttype = (TextType)chromaId;
 
             const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
             pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
@@ -885,7 +902,7 @@
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
 
             // init availability pattern
-            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
             pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
@@ -978,10 +995,13 @@
             outDist += bDist;
             psyEnergy += bEnergy;
         }
-        while (tuIterator.isNextSection());
-
-        if (splitType == VERTICAL_SPLIT)
-            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
+    }
+    while (tuIterator.isNextSection());
+
+    if (splitType == VERTICAL_SPLIT)
+    {
+        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
     }
 
     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
@@ -1020,91 +1040,18 @@
     }
 }
 
-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-    
-    if (tuDepth == cu.m_tuDepth[absPartIdx])
-    {
-        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-        uint32_t tuDepthC = tuDepth;
-        if (log2TrSizeC < 2)
-        {
-            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
-            if (absPartIdx & 3)
-                return;
-            log2TrSizeC = 2;
-            tuDepthC--;
-        }
-
-        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
-        uint32_t tuSize = 1 << log2TrSizeC;
-        uint32_t stride = mode.fencYuv->m_csize;
-        const int sizeIdxC = log2TrSizeC - 2;
-
-        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
-        const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
-
-        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
-        {
-            TextType ttype = (TextType)chromaId;
-
-            TURecurse tuIterator(splitType, curPartNum, absPartIdx);
-            do
-            {
-                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
-
-                const pixel*   fenc   = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
-                pixel*   pred         = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
-                int16_t* residual     = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                pixel*   recon        = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
-                uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
-                coeff_t* coeff        = cu.m_trCoeff[ttype] + coeffOffsetC;
-                pixel*   picReconC    = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
-                uint32_t picStride    = m_frame->m_reconPic->m_strideC;
-
-                uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
-                if (chromaPredMode == DM_CHROMA_IDX)
-                    chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
-                chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
-                initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
-                pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
-
-                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
-
-                X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
-
-                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
-                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
-                if (numSig)
-                {
-                    m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
-                    primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
-                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
-                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
-                }
-                else
-                {
-                    primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
-                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
-                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
-                }
-            }
-            while (tuIterator.isNextSection());
-
-            if (splitType == VERTICAL_SPLIT)
-                offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
-        }
-    }
-    else
+    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+
+    if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t splitCbfU = 0, splitCbfV = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
+            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
@@ -1113,12 +1060,91 @@
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
+
+        return;
+    }
+
+    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+    uint32_t tuDepthC = tuDepth;
+    if (log2TrSizeC < 2)
+    {
+        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        if (absPartIdx & 3)
+            return;
+        log2TrSizeC = 2;
+        tuDepthC--;
+    }
+
+    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
+    uint32_t tuSize = 1 << log2TrSizeC;
+    uint32_t stride = mode.fencYuv->m_csize;
+    const uint32_t sizeIdxC = log2TrSizeC - 2;
+
+    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
+
+    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+    do
+    {
+        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+        {
+            TextType ttype = (TextType)chromaId;
+
+            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
+            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
+            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
+            coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
+            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            intptr_t picStride = m_frame->m_reconPic->m_strideC;
+
+            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
+            if (chromaPredMode == DM_CHROMA_IDX)
+                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
+            if (m_csp == X265_CSP_I422)
+                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
+
+            // init availability pattern
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
+            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+
+            // get prediction signal
+            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+
+            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
+
+            primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
+            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+            if (numSig)
+            {
+                m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
+                primitives.luma_add_ps[sizeIdxC](picReconC, picStride, pred, residual, stride, stride);
+                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+            }
+            else
+            {
+                // no coded residual, recon = pred
+                primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
+                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+            }
+        }
+    }
+    while (tuIterator.isNextSection());
+
+    if (splitType == VERTICAL_SPLIT)
+    {
+        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
     }
 }
 
 void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
 {
-    uint32_t depth = cuGeom.depth;
     CUData& cu = intraMode.cu;
 
     cu.setPartSizeSubParts(partSize);
@@ -1141,7 +1167,7 @@
         m_entropyCoder.codePredMode(cu.m_predMode[0]);
     }
 
-    m_entropyCoder.codePartSize(cu, 0, depth);
+    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
     m_entropyCoder.codePredInfo(cu, 0);
     intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
 
@@ -1151,7 +1177,10 @@
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
     if (m_rdCost.m_psyRd)
-        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
+    {
+        const Yuv* fencYuv = intraMode.fencYuv;
+        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
+    }
 
     updateModeCost(intraMode);
 }
@@ -1172,7 +1201,9 @@
     const uint32_t absPartIdx = 0;
 
     // Reference sample smoothing
-    initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+    IntraNeighbors intraNeighbors;
+    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
+    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
 
     const pixel* fenc = intraMode.fencYuv->m_buf[0];
     uint32_t stride = intraMode.fencYuv->m_size;
@@ -1333,7 +1364,6 @@
 {
     CUData& cu = intraMode.cu;
     Yuv* reconYuv = &intraMode.reconYuv;
-    const Yuv* fencYuv = intraMode.fencYuv;
 
     X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
     X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
@@ -1367,7 +1397,10 @@
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
     if (m_rdCost.m_psyRd)
+    {
+        const Yuv* fencYuv = intraMode.fencYuv;
         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    }
 
     m_entropyCoder.store(intraMode.contexts);
     updateModeCost(intraMode);
@@ -1402,7 +1435,9 @@
         else
         {
             // Reference sample smoothing
-            initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+            IntraNeighbors intraNeighbors;
+            initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
+            initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
 
             // determine set of modes to be tested (using prediction signal only)
             const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
@@ -1600,8 +1635,10 @@
         log2TrSizeC = 5;
     }
 
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
+    IntraNeighbors intraNeighbors;
+    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
+    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 1); // U
+    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 2); // V
     cu.getAllowedChromaDir(0, modeList);
 
     // check chroma modes
@@ -2579,16 +2616,16 @@
     updateModeCost(interMode);
 }
 
-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
 {
+    uint32_t depth = cuGeom.depth + tuDepth;
     CUData& cu = mode.cu;
     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
 
     uint32_t log2TrSize = g_maxLog2CUSize - depth;
-    uint32_t tuDepth = depth - cu.m_cuDepth[0];
 
     bool bCheckFull = log2TrSize <= depthRange[1];
-    if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
         bCheckFull = false;
 
     if (bCheckFull)
@@ -2609,7 +2646,7 @@
         uint32_t setCbf = 1 << tuDepth;
 
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
-        coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
+        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
 
         uint32_t sizeIdx  = log2TrSize  - 2;
 
@@ -2642,8 +2679,8 @@
             uint32_t strideResiC = resiYuv.m_csize;
 
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-            coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
-            coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
+            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
+            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
             bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
 
             TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
@@ -2700,16 +2737,16 @@
         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
-            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
+            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
             ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t i = 0; i < 4 * qNumParts; i++)
+        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
         {
-            cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
-            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
-            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
+            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
+            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
+            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
         }
     }
 }
@@ -2767,7 +2804,7 @@
 
     uint32_t trSize = 1 << log2TrSize;
     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
-    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +  tuDepthC) << 1);
+    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const Yuv* fencYuv = mode.fencYuv;
 
     // code full block
@@ -3125,16 +3162,19 @@
         //Encode cbf flags
         if (bCodeChroma)
         {
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+            if (!splitIntoSubTUs)
             {
-                if (!splitIntoSubTUs)
-                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
-                else
-                {
-                    offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
-                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
-                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
-                }
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
+            }
+            else
+            {
+                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
             }
         }
 
diff -r 5f9f7194267b -r d400c836b379 source/encoder/search.h
--- a/source/encoder/search.h	Tue Dec 23 17:40:53 2014 +0900
+++ b/source/encoder/search.h	Thu Dec 25 13:15:56 2014 +0900
@@ -178,9 +178,9 @@
     void     encodeResAndCalcRdSkipCU(Mode& interMode);
 
     // encode residual without rd-cost
-    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]);
-    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]);
-    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx);
+    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth);
 
     // pick be chroma mode from available using just sa8d costs
     void     getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);


More information about the x265-devel mailing list