[x265] refine intra neighbors

Satoshi Nakagawa nakagawa424 at oki.com
Tue Dec 23 08:53:09 CET 2014


CIP is picture level flag, and typically OFF.

Separate functions are to simply non-CIP path.

Code size is small and *CIP() functions will not be loaded to cache.

 

 

From: x265-devel [mailto:x265-devel-bounces at videolan.org] On Behalf Of Ashok Kumar Mishra
Sent: Tuesday, December 23, 2014 4:23 PM
To: Development for x265
Subject: Re: [x265] refine intra neighbors

 

Hi,

 

We removed separate functions for constrained intra prediction(CIP) some time back. Because it was increasing the code size at the cost of few conditional checks.

Can you please send a separate patch for other changes not related to CIP.

 

Thanks

Ashok.

 

On Tue, Dec 23, 2014 at 11:23 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:

# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1419313799 -32400
#      Tue Dec 23 14:49:59 2014 +0900
# Node ID 6b59452a17d75c42c1750d47e2318c8da80c39fb
# Parent  8d2f418829c894c25da79daa861f16c61e5060d7
refine intra neighbors

diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/common.h
--- a/source/common/common.h    Sat Dec 20 21:27:14 2014 +0900
+++ b/source/common/common.h    Tue Dec 23 14:49:59 2014 +0900
@@ -163,6 +163,9 @@
 template<typename T>
 inline T x265_max(T a, T b) { return a > b ? a : b; }

+template<typename T>
+inline T x265_clip3(T minVal, T maxVal, T a) { return x265_min(x265_max(minVal, a), maxVal); }
+
 typedef int16_t  coeff_t;      // transform coefficient

 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/cudata.cpp
--- a/source/common/cudata.cpp  Sat Dec 20 21:27:14 2014 +0900
+++ b/source/common/cudata.cpp  Tue Dec 23 14:49:59 2014 +0900
@@ -608,7 +608,7 @@
         {
             if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1])
             {
-                uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
+                uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
                 arPartUnitIdx = g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1];
                 if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, s_numPartInCUSize))
                     return m_encData->getPicCTU(m_cuAddr);
@@ -689,8 +689,6 @@
             return NULL;
         }
         blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (1 + partUnitOffset) * s_numPartInCUSize - 1];
-        if (!m_cuLeft || !m_cuLeft->m_slice)
-            return NULL;
         return m_cuLeft;
     }

@@ -723,8 +721,6 @@
             return NULL;
         }
         arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset];
-        if (!m_cuAbove || !m_cuAbove->m_slice)
-            return NULL;
         return m_cuAbove;
     }

@@ -732,8 +728,6 @@
         return NULL;

     arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
-    if ((m_cuAboveRight == NULL || m_cuAboveRight->m_slice == NULL || (m_cuAboveRight->m_cuAddr) > m_cuAddr))
-        return NULL;
     return m_cuAboveRight;
 }

@@ -904,7 +898,7 @@
     tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
     tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;

-    tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag), tuDepthRange[1]));
+    tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1], log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag));
 }

 void CUData::getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const
@@ -916,7 +910,7 @@
     tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
     tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;

-    tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag), tuDepthRange[1]));
+    tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1], log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag));
 }

 uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const
@@ -1363,14 +1357,6 @@
     return outPartIdxRB;
 }

-void CUData::deriveLeftRightTopIdxAdi(uint32_t& outPartIdxLT, uint32_t& outPartIdxRT, uint32_t partOffset, uint32_t partDepth) const
-{
-    uint32_t numPartInWidth = 1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE - partDepth);
-
-    outPartIdxLT = m_absIdxInCTU + partOffset;
-    outPartIdxRT = g_rasterToZscan[g_zscanToRaster[outPartIdxLT] + numPartInWidth - 1];
-}
-
 bool CUData::hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const
 {
     if (m_interDir[absPartIdx] != candCU.m_interDir[candAbsPartIdx])
diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/cudata.h
--- a/source/common/cudata.h    Sat Dec 20 21:27:14 2014 +0900
+++ b/source/common/cudata.h    Tue Dec 23 14:49:59 2014 +0900
@@ -212,7 +212,6 @@

     void     getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const;
     int      getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const;
-    void     deriveLeftRightTopIdxAdi(uint32_t& partIdxLT, uint32_t& partIdxRT, uint32_t partOffset, uint32_t partDepth) const;

     uint32_t getSCUAddr() const                  { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; }
     uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/predict.cpp
--- a/source/common/predict.cpp Sat Dec 20 21:27:14 2014 +0900
+++ b/source/common/predict.cpp Tue Dec 23 14:49:59 2014 +0900
@@ -654,11 +654,8 @@
     }
 }

-void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode)
+void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
 {
-    IntraNeighbors intraNeighbors;
-    initIntraNeighbors(cu, absPartIdx, partDepth, true, &intraNeighbors);
-
     pixel* adiBuf      = m_predBuf;
     pixel* refAbove    = m_refAbove;
     pixel* refLeft     = m_refLeft;
@@ -700,12 +697,12 @@
             int refTL = refAbove[0];
             int refTR = refAbove[trSize2];
             bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize]) < threshold &&
-                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
+                                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);

             if (bStrongSmoothing)
             {
                 // bilinear interpolation
-                const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
+                const int shift = 5 + 1; // log2TrSize + 1;
                 int init = (refTL << shift) + tuSize;
                 int delta;

@@ -738,10 +735,8 @@
     }
 }

-void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId)
+void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    IntraNeighbors intraNeighbors;
-    initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;

     const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
@@ -751,9 +746,9 @@
     fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
 }

-void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
+void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
 {
-    uint32_t log2TrSize = cu.m_log2CUSize[0] - partDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
     int log2UnitWidth = LOG2_UNIT_SIZE;
     int log2UnitHeight = LOG2_UNIT_SIZE;

@@ -764,12 +759,12 @@
         log2UnitHeight -= cu.m_vChromaShift;
     }

-    int   numIntraNeighbor = 0;
+    int numIntraNeighbor;
     bool* bNeighborFlags = intraNeighbors->bNeighborFlags;

-    uint32_t partIdxLT, partIdxRT, partIdxLB;
-
-    cu.deriveLeftRightTopIdxAdi(partIdxLT, partIdxRT, absPartIdx, partDepth);
+    uint32_t numPartInWidth = 1 << (cu.m_log2CUSize[0] - LOG2_UNIT_SIZE - tuDepth);
+    uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
+    uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + numPartInWidth - 1];

     uint32_t tuSize = 1 << log2TrSize;
     int  tuWidthInUnits = tuSize >> log2UnitWidth;
@@ -777,14 +772,26 @@
     int  aboveUnits = tuWidthInUnits << 1;
     int  leftUnits = tuHeightInUnits << 1;
     int  partIdxStride = cu.m_slice->m_sps->numPartInCUSize;
-    partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
+    uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];

-    bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT);
-    numIntraNeighbor += (int)(bNeighborFlags[leftUnits]);
-    numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1));
-    numIntraNeighbor += isAboveRightAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1 + tuWidthInUnits));
-    numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1));
-    numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1 - tuHeightInUnits));
+    if (cu.m_slice->isIntra() || !cu.m_slice->m_pps->bConstrainedIntraPred)
+    {
+        bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT);
+        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
+        numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
+        numIntraNeighbor += isAboveRightAvailable(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
+        numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
+        numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
+    }
+    else
+    {
+        bNeighborFlags[leftUnits] = isAboveLeftAvailableCIP(cu, partIdxLT);
+        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
+        numIntraNeighbor += isAboveAvailableCIP(cu, partIdxLT, partIdxRT, bNeighborFlags + leftUnits + 1);
+        numIntraNeighbor += isAboveRightAvailableCIP(cu, partIdxRT, bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
+        numIntraNeighbor += isLeftAvailableCIP(cu, partIdxLT, partIdxLB, bNeighborFlags + leftUnits - 1);
+        numIntraNeighbor += isBelowLeftAvailableCIP(cu, partIdxLB, bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
+    }

     intraNeighbors->numIntraNeighbor = numIntraNeighbor;
     intraNeighbors->totalUnits = aboveUnits + leftUnits + 1;
@@ -793,7 +800,6 @@
     intraNeighbors->unitWidth = 1 << log2UnitWidth;
     intraNeighbors->unitHeight = 1 << log2UnitHeight;
     intraNeighbors->tuSize = tuSize;
-    intraNeighbors->log2TrSize = log2TrSize;
 }

 void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
@@ -953,33 +959,27 @@
     uint32_t partAboveLeft;
     const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft, partIdxLT);

-    if (!cu.m_slice->m_pps->bConstrainedIntraPred)
-        return cuAboveLeft ? true : false;
-    else
-        return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
+    return !!cuAboveLeft;
 }

 int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
-    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1;
+    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
     const uint32_t idxStep = 1;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;

-    for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
+    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags++)
     {
         uint32_t partAbove;
         const CUData* cuAbove = cu.getPUAbove(partAbove, g_rasterToZscan[rasterPart]);
-        if (cuAbove && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAbove->isIntra(partAbove)))
+        if (cuAbove)
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr++;
+            *bValidFlags = false;
     }

     return numIntra;
@@ -988,73 +988,156 @@
 int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
-    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1;
+    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
     const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;

-    for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep)
+    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
     {
         uint32_t partLeft;
         const CUData* cuLeft = cu.getPULeft(partLeft, g_rasterToZscan[rasterPart]);
-        if (cuLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuLeft->isIntra(partLeft)))
+        if (cuLeft)
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr--; // opposite direction
+            *bValidFlags = false;
     }

     return numIntra;
 }

-int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
+int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits)
 {
-    const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;

-    for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
+    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
     {
         uint32_t partAboveRight;
         const CUData* cuAboveRight = cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
-        if (cuAboveRight && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAboveRight->isIntra(partAboveRight)))
+        if (cuAboveRight)
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr++;
+            *bValidFlags = false;
     }

     return numIntra;
 }

-int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
+int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits)
 {
-    const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1;
-    bool* validFlagPtr = bValidFlags;
     int numIntra = 0;

-    for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
+    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags--) // opposite direction
     {
         uint32_t partBelowLeft;
         const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft, partIdxLB, offset);
-        if (cuBelowLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuBelowLeft->isIntra(partBelowLeft)))
+        if (cuBelowLeft)
         {
             numIntra++;
-            *validFlagPtr = true;
+            *bValidFlags = true;
         }
         else
-            *validFlagPtr = false;
-
-        validFlagPtr--; // opposite direction
+            *bValidFlags = false;
     }

     return numIntra;
 }
+
+bool Predict::isAboveLeftAvailableCIP(const CUData& cu, uint32_t partIdxLT)
+{
+    uint32_t partAboveLeft;
+    const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft, partIdxLT);
+
+    return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
+}
+
+int Predict::isAboveAvailableCIP(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags)
+{
+    const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
+    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
+    const uint32_t idxStep = 1;
+    int numIntra = 0;
+
+    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags++)
+    {
+        uint32_t partAbove;
+        const CUData* cuAbove = cu.getPUAbove(partAbove, g_rasterToZscan[rasterPart]);
+        if (cuAbove && cuAbove->isIntra(partAbove))
+        {
+            numIntra++;
+            *bValidFlags = true;
+        }
+        else
+            *bValidFlags = false;
+    }
+
+    return numIntra;
+}
+
+int Predict::isLeftAvailableCIP(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags)
+{
+    const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
+    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
+    const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
+    int numIntra = 0;
+
+    for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
+    {
+        uint32_t partLeft;
+        const CUData* cuLeft = cu.getPULeft(partLeft, g_rasterToZscan[rasterPart]);
+        if (cuLeft && cuLeft->isIntra(partLeft))
+        {
+            numIntra++;
+            *bValidFlags = true;
+        }
+        else
+            *bValidFlags = false;
+    }
+
+    return numIntra;
+}
+
+int Predict::isAboveRightAvailableCIP(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits)
+{
+    int numIntra = 0;
+
+    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
+    {
+        uint32_t partAboveRight;
+        const CUData* cuAboveRight = cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
+        if (cuAboveRight && cuAboveRight->isIntra(partAboveRight))
+        {
+            numIntra++;
+            *bValidFlags = true;
+        }
+        else
+            *bValidFlags = false;
+    }
+
+    return numIntra;
+}
+
+int Predict::isBelowLeftAvailableCIP(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits)
+{
+    int numIntra = 0;
+
+    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags--) // opposite direction
+    {
+        uint32_t partBelowLeft;
+        const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft, partIdxLB, offset);
+        if (cuBelowLeft && cuBelowLeft->isIntra(partBelowLeft))
+        {
+            numIntra++;
+            *bValidFlags = true;
+        }
+        else
+            *bValidFlags = false;
+    }
+
+    return numIntra;
+}
diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/predict.h
--- a/source/common/predict.h   Sat Dec 20 21:27:14 2014 +0900
+++ b/source/common/predict.h   Tue Dec 23 14:49:59 2014 +0900
@@ -57,7 +57,6 @@
         int      unitWidth;
         int      unitHeight;
         int      tuSize;
-        uint32_t log2TrSize;
         bool     bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
     };

@@ -105,14 +104,20 @@
     void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;

     /* Intra prediction helper functions */
-    static void initIntraNeighbors(const CUData& cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
+    static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
     static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);

     static bool isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT);
     static int  isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
     static int  isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
-    static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
-    static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
+    static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
+    static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);
+
+    static bool isAboveLeftAvailableCIP(const CUData& cu, uint32_t partIdxLT);
+    static int  isAboveAvailableCIP(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
+    static int  isLeftAvailableCIP(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
+    static int  isAboveRightAvailableCIP(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
+    static int  isBelowLeftAvailableCIP(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);

 public:

@@ -125,8 +130,8 @@
     void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
     void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);

-    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode);
-    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId);
+    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
     pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize)
     {
         return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE * (tuSize * 2 + 1));
diff -r 8d2f418829c8 -r 6b59452a17d7 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp       Sat Dec 20 21:27:14 2014 +0900
+++ b/source/encoder/analysis.cpp       Tue Dec 23 14:49:59 2014 +0900
@@ -914,7 +914,7 @@
                         cu.getInterTUQtDepthRange(tuDepthRange, 0);

                         m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
-                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         if (cu.getQtRootCbf(0))
                             md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                         else
@@ -938,8 +938,7 @@
                         uint32_t tuDepthRange[2];
                         cu.getIntraTUQtDepthRange(tuDepthRange, 0);

-                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
-                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+                        residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         getBestIntraModeChroma(*md.bestMode, cuGeom);
                         residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                         md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
@@ -1702,8 +1701,7 @@
         uint32_t tuDepthRange[2];
         cu.getIntraTUQtDepthRange(tuDepthRange, 0);

-        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
-        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
         getBestIntraModeChroma(*bestMode, cuGeom);
         residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
     }
@@ -1736,7 +1734,7 @@
         uint32_t tuDepthRange[2];
         cu.getInterTUQtDepthRange(tuDepthRange, 0);

-        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);

         if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
             cu.setPredModeSubParts(MODE_SKIP);
diff -r 8d2f418829c8 -r 6b59452a17d7 source/encoder/search.cpp
--- a/source/encoder/search.cpp Sat Dec 20 21:27:14 2014 +0900
+++ b/source/encoder/search.cpp Tue Dec 23 14:49:59 2014 +0900
@@ -239,7 +239,8 @@

 void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
 {
-    uint32_t fullDepth  = mode.cu.m_cuDepth[0] + tuDepth;
+    CUData& cu = mode.cu;
+    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t qtLayer    = log2TrSize - 2;
     uint32_t sizeIdx    = log2TrSize - 2;
@@ -253,8 +254,6 @@
         mightSplit = true;
     }

-    CUData& cu = mode.cu;
-
     Cost fullCost;
     uint32_t bCBF = 0;

@@ -273,7 +272,9 @@

         // init availability pattern
         uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

         // get prediction signal
         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
@@ -365,7 +366,7 @@
             m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
         }

-        // code split block
+        /* code split block */
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

         int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
@@ -451,11 +452,13 @@
     pixel*   pred = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
     uint32_t stride = fencYuv->m_size;
-    int      sizeIdx = log2TrSize - 2;
+    uint32_t sizeIdx = log2TrSize - 2;

     // init availability pattern
     uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-    initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
+    IntraNeighbors intraNeighbors;
+    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

     // get prediction signal
     predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
@@ -597,13 +600,12 @@
 }

 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
-
-    uint32_t fullDepth   = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
-    bool     bCheckFull  = log2TrSize <= depthRange[1];
+    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    bool     bCheckFull = log2TrSize <= depthRange[1];

     X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

@@ -614,28 +616,36 @@

     if (bCheckFull)
     {
-        const pixel* fenc  = mode.fencYuv->getLumaAddr(absPartIdx);
-        pixel*   pred      = mode.predYuv.getLumaAddr(absPartIdx);
-        int16_t* residual  = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
+        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
+        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
+        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
+        uint32_t stride   = mode.fencYuv->m_size;
+
+        // init availability pattern
+        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
+        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
+
+        // get prediction signal
+        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
+
+        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
+        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
+
+        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
+        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
+
+        uint32_t sizeIdx   = log2TrSize - 2;
+        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
+
         pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
         intptr_t picStride = m_frame->m_reconPic->m_stride;
-        uint32_t stride    = mode.fencYuv->m_size;
-        uint32_t sizeIdx   = log2TrSize - 2;
-        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
-        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
-        coeff_t* coeff        = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
-
-        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
-        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
-
-        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
-        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
-
-        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
-        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
+
+        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
         {
-            m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
+            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
             primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
             cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
         }
@@ -654,11 +664,11 @@
         uint32_t cbf = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
+            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
         for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
+            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
     }
 }

@@ -739,15 +749,14 @@
         }
         for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
         {
-            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
-            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
+            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }

         return outDist;
     }

     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
     uint32_t tuDepthC = tuDepth;
     if (log2TrSizeC < 2)
     {
@@ -766,46 +775,48 @@
     if (checkTransformSkip)
         return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);

+    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t tuSize = 1 << log2TrSizeC;
+    uint32_t stride = mode.fencYuv->m_csize;
+    const uint32_t sizeIdxC = log2TrSizeC - 2;
     uint32_t outDist = 0;

     uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

-    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+    do
     {
-        TextType ttype = (TextType)chromaId;
-
-        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
-        do
+        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+            TextType ttype = (TextType)chromaId;

             const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
             pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
-            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
-            uint32_t stride   = mode.fencYuv->m_csize;
-            uint32_t sizeIdxC = log2TrSizeC - 2;
-
+            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
             coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
-
             pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
             intptr_t picStride = m_frame->m_reconPic->m_strideC;

-            // init availability pattern
-            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
-            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
-
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
             if (chromaPredMode == DM_CHROMA_IDX)
                 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
             if (m_csp == X265_CSP_I422)
                 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

+            // init availability pattern
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
+            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+
             // get prediction signal
             predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

@@ -813,7 +824,6 @@

             primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
-            uint32_t tmpDist;
             if (numSig)
             {
                 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
@@ -827,7 +837,7 @@
                 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             }

-            tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
+            uint32_t tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
             outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);

             if (m_rdCost.m_psyRd)
@@ -835,10 +845,13 @@

             primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
         }
-        while (tuIterator.isNextSection());
-
-        if (splitType == VERTICAL_SPLIT)
-            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
+    }
+    while (tuIterator.isNextSection());
+
+    if (splitType == VERTICAL_SPLIT)
+    {
+        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
     }

     return outDist;
@@ -866,14 +879,17 @@
     uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

-    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+    do
     {
-        TextType ttype = (TextType)chromaId;
-
-        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
-        do
+        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
-            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+            TextType ttype = (TextType)chromaId;

             const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
             pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
@@ -887,7 +903,7 @@
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

             // init availability pattern
-            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
             pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
@@ -980,10 +996,13 @@
             outDist += bDist;
             psyEnergy += bEnergy;
         }
-        while (tuIterator.isNextSection());
-
-        if (splitType == VERTICAL_SPLIT)
-            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
+    }
+    while (tuIterator.isNextSection());
+
+    if (splitType == VERTICAL_SPLIT)
+    {
+        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
     }

     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
@@ -1022,91 +1041,18 @@
     }
 }

-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
-    if (tuDepth == cu.m_tuDepth[absPartIdx])
-    {
-        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-        uint32_t tuDepthC = tuDepth;
-        if (log2TrSizeC < 2)
-        {
-            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
-            if (absPartIdx & 3)
-                return;
-            log2TrSizeC = 2;
-            tuDepthC--;
-        }
-
-        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
-        uint32_t tuSize = 1 << log2TrSizeC;
-        uint32_t stride = mode.fencYuv->m_csize;
-        const int sizeIdxC = log2TrSizeC - 2;
-
-        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
-        const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
-
-        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
-        {
-            TextType ttype = (TextType)chromaId;
-
-            TURecurse tuIterator(splitType, curPartNum, absPartIdx);
-            do
-            {
-                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
-
-                const pixel*   fenc   = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
-                pixel*   pred         = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
-                int16_t* residual     = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                pixel*   recon        = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
-                uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
-                coeff_t* coeff        = cu.m_trCoeff[ttype] + coeffOffsetC;
-                pixel*   picReconC    = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
-                uint32_t picStride    = m_frame->m_reconPic->m_strideC;
-
-                uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
-                if (chromaPredMode == DM_CHROMA_IDX)
-                    chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
-                chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
-                initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
-                pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
-
-                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
-
-                X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
-
-                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
-                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
-                if (numSig)
-                {
-                    m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
-                    primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
-                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
-                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
-                }
-                else
-                {
-                    primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
-                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
-                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
-                }
-            }
-            while (tuIterator.isNextSection());
-
-            if (splitType == VERTICAL_SPLIT)
-                offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
-        }
-    }
-    else
+    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+
+    if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         uint32_t splitCbfU = 0, splitCbfV = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
+            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
@@ -1115,12 +1061,91 @@
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
+
+        return;
+    }
+
+    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+    uint32_t tuDepthC = tuDepth;
+    if (log2TrSizeC < 2)
+    {
+        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+        if (absPartIdx & 3)
+            return;
+        log2TrSizeC = 2;
+        tuDepthC--;
+    }
+
+    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
+    uint32_t tuSize = 1 << log2TrSizeC;
+    uint32_t stride = mode.fencYuv->m_csize;
+    const uint32_t sizeIdxC = log2TrSizeC - 2;
+
+    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
+
+    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
+    do
+    {
+        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+
+        IntraNeighbors intraNeighbors;
+        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
+
+        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+        {
+            TextType ttype = (TextType)chromaId;
+
+            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
+            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
+            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
+            coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
+            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            intptr_t picStride = m_frame->m_reconPic->m_strideC;
+
+            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
+            if (chromaPredMode == DM_CHROMA_IDX)
+                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
+            if (m_csp == X265_CSP_I422)
+                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
+
+            // init availability pattern
+            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
+            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
+
+            // get prediction signal
+            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+
+            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
+
+            primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
+            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+            if (numSig)
+            {
+                m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
+                primitives.luma_add_ps[sizeIdxC](picReconC, picStride, pred, residual, stride, stride);
+                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+            }
+            else
+            {
+                // no coded residual, recon = pred
+                primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
+                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+            }
+        }
+    }
+    while (tuIterator.isNextSection());
+
+    if (splitType == VERTICAL_SPLIT)
+    {
+        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
     }
 }

 void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
 {
-    uint32_t depth = cuGeom.depth;
     CUData& cu = intraMode.cu;

     cu.setPartSizeSubParts(partSize);
@@ -1143,7 +1168,7 @@
         m_entropyCoder.codePredMode(cu.m_predMode[0]);
     }

-    m_entropyCoder.codePartSize(cu, 0, depth);
+    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
     m_entropyCoder.codePredInfo(cu, 0);
     intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();

@@ -1153,7 +1178,10 @@
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
     if (m_rdCost.m_psyRd)
-        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
+    {
+        const Yuv* fencYuv = intraMode.fencYuv;
+        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
+    }

     updateModeCost(intraMode);
 }
@@ -1174,7 +1202,9 @@
     const uint32_t absPartIdx = 0;

     // Reference sample smoothing
-    initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+    IntraNeighbors intraNeighbors;
+    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
+    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

     const pixel* fenc = intraMode.fencYuv->m_buf[0];
     uint32_t stride = intraMode.fencYuv->m_size;
@@ -1335,7 +1365,6 @@
 {
     CUData& cu = intraMode.cu;
     Yuv* reconYuv = &intraMode.reconYuv;
-    const Yuv* fencYuv = intraMode.fencYuv;

     X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
     X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
@@ -1369,7 +1398,10 @@
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
     if (m_rdCost.m_psyRd)
+    {
+        const Yuv* fencYuv = intraMode.fencYuv;
         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    }

     m_entropyCoder.store(intraMode.contexts);
     updateModeCost(intraMode);
@@ -1404,7 +1436,9 @@
         else
         {
             // Reference sample smoothing
-            initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+            IntraNeighbors intraNeighbors;
+            initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
+            initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

             // determine set of modes to be tested (using prediction signal only)
             const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
@@ -1602,8 +1636,10 @@
         log2TrSizeC = 5;
     }

-    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
+    IntraNeighbors intraNeighbors;
+    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
+    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 1); // U
+    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 2); // V
     cu.getAllowedChromaDir(0, modeList);

     // check chroma modes
@@ -2581,16 +2617,16 @@
     updateModeCost(interMode);
 }

-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
 {
+    uint32_t depth = cuGeom.depth + tuDepth;
     CUData& cu = mode.cu;
     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");

     uint32_t log2TrSize = g_maxLog2CUSize - depth;
-    uint32_t tuDepth = depth - cu.m_cuDepth[0];

     bool bCheckFull = log2TrSize <= depthRange[1];
-    if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
         bCheckFull = false;

     if (bCheckFull)
@@ -2611,7 +2647,7 @@
         uint32_t setCbf = 1 << tuDepth;

         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
-        coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
+        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;

         uint32_t sizeIdx  = log2TrSize  - 2;

@@ -2644,8 +2680,8 @@
             uint32_t strideResiC = resiYuv.m_csize;

             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-            coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
-            coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
+            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
+            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
             bool splitIntoSubTUs = (m_csp == X265_CSP_I422);

             TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
@@ -2702,16 +2738,16 @@
         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
-            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
+            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
             ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t i = 0; i < 4 * qNumParts; i++)
+        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
         {
-            cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
-            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
-            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
+            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
+            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
+            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
         }
     }
 }
@@ -2769,7 +2805,7 @@

     uint32_t trSize = 1 << log2TrSize;
     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
-    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +  tuDepthC) << 1);
+    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
     const Yuv* fencYuv = mode.fencYuv;

     // code full block
@@ -3127,16 +3163,19 @@
         //Encode cbf flags
         if (bCodeChroma)
         {
-            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+            if (!splitIntoSubTUs)
             {
-                if (!splitIntoSubTUs)
-                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
-                else
-                {
-                    offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
-                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
-                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
-                }
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
+            }
+            else
+            {
+                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
+                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
+                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
             }
         }

diff -r 8d2f418829c8 -r 6b59452a17d7 source/encoder/search.h
--- a/source/encoder/search.h   Sat Dec 20 21:27:14 2014 +0900
+++ b/source/encoder/search.h   Tue Dec 23 14:49:59 2014 +0900
@@ -178,9 +178,9 @@
     void     encodeResAndCalcRdSkipCU(Mode& interMode);

     // encode residual without rd-cost
-    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]);
-    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]);
-    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx);
+    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth);

     // pick be chroma mode from available using just sa8d costs
     void     getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
_______________________________________________
x265-devel mailing list
x265-devel at videolan.org
https://mailman.videolan.org/listinfo/x265-devel

 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141223/038831cf/attachment-0001.html>


More information about the x265-devel mailing list