[x265] refine intra neighbors

Deepthi Nandakumar deepthi at multicorewareinc.com
Tue Dec 23 09:13:51 CET 2014


Thanks - code maintainability is still an issue though. For better
performance, we can store the pps->bConstrainedIntraPred as a member
variable Predict::bConstrainedIntra, so it stays in the cache.

On Tue, Dec 23, 2014 at 1:23 PM, Satoshi Nakagawa <nakagawa424 at oki.com>
wrote:

> CIP is a picture-level flag, and is typically OFF.
>
> The separate functions are there to simplify the non-CIP path.
>
> The code size is small, and the *CIP() functions will not be loaded into the cache.
>
>
>
>
>
> *From:* x265-devel [mailto:x265-devel-bounces at videolan.org] *On Behalf Of
> *Ashok Kumar Mishra
> *Sent:* Tuesday, December 23, 2014 4:23 PM
> *To:* Development for x265
> *Subject:* Re: [x265] refine intra neighbors
>
>
>
> Hi,
>
>
>
> We removed the separate functions for constrained intra prediction (CIP) some
> time back, because they were increasing the code size just to save a few
> conditional checks.
>
> Can you please send a separate patch for the other changes not related to CIP?
>
>
>
> Thanks
>
> Ashok.
>
>
>
> On Tue, Dec 23, 2014 at 11:23 AM, Satoshi Nakagawa <nakagawa424 at oki.com>
> wrote:
>
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1419313799 -32400
> #      Tue Dec 23 14:49:59 2014 +0900
> # Node ID 6b59452a17d75c42c1750d47e2318c8da80c39fb
> # Parent  8d2f418829c894c25da79daa861f16c61e5060d7
> refine intra neighbors
>
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/common.h
> --- a/source/common/common.h    Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/common/common.h    Tue Dec 23 14:49:59 2014 +0900
> @@ -163,6 +163,9 @@
>  template<typename T>
>  inline T x265_max(T a, T b) { return a > b ? a : b; }
>
> +template<typename T>
> +inline T x265_clip3(T minVal, T maxVal, T a) { return
> x265_min(x265_max(minVal, a), maxVal); }
> +
>  typedef int16_t  coeff_t;      // transform coefficient
>
>  #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/cudata.cpp
> --- a/source/common/cudata.cpp  Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/common/cudata.cpp  Tue Dec 23 14:49:59 2014 +0900
> @@ -608,7 +608,7 @@
>          {
>              if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT -
> s_numPartInCUSize + 1])
>              {
> -                uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInCTU]
> + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
> +                uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU]
> + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1;
>                  arPartUnitIdx = g_rasterToZscan[absPartIdxRT -
> s_numPartInCUSize + 1];
>                  if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx,
> s_numPartInCUSize))
>                      return m_encData->getPicCTU(m_cuAddr);
> @@ -689,8 +689,6 @@
>              return NULL;
>          }
>          blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (1 +
> partUnitOffset) * s_numPartInCUSize - 1];
> -        if (!m_cuLeft || !m_cuLeft->m_slice)
> -            return NULL;
>          return m_cuLeft;
>      }
>
> @@ -723,8 +721,6 @@
>              return NULL;
>          }
>          arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS
> - s_numPartInCUSize + partUnitOffset];
> -        if (!m_cuAbove || !m_cuAbove->m_slice)
> -            return NULL;
>          return m_cuAbove;
>      }
>
> @@ -732,8 +728,6 @@
>          return NULL;
>
>      arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize
> + partUnitOffset - 1];
> -    if ((m_cuAboveRight == NULL || m_cuAboveRight->m_slice == NULL ||
> (m_cuAboveRight->m_cuAddr) > m_cuAddr))
> -        return NULL;
>      return m_cuAboveRight;
>  }
>
> @@ -904,7 +898,7 @@
>      tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
>      tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
>
> -    tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize -
> (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag),
> tuDepthRange[1]));
> +    tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1],
> log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag));
>  }
>
>  void CUData::getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t
> absPartIdx) const
> @@ -916,7 +910,7 @@
>      tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize;
>      tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize;
>
> -    tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize -
> (quadtreeTUMaxDepth - 1 + splitFlag), tuDepthRange[1]));
> +    tuDepthRange[0] = x265_clip3(tuDepthRange[0], tuDepthRange[1],
> log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag));
>  }
>
>  uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const
> @@ -1363,14 +1357,6 @@
>      return outPartIdxRB;
>  }
>
> -void CUData::deriveLeftRightTopIdxAdi(uint32_t& outPartIdxLT, uint32_t&
> outPartIdxRT, uint32_t partOffset, uint32_t partDepth) const
> -{
> -    uint32_t numPartInWidth = 1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE -
> partDepth);
> -
> -    outPartIdxLT = m_absIdxInCTU + partOffset;
> -    outPartIdxRT = g_rasterToZscan[g_zscanToRaster[outPartIdxLT] +
> numPartInWidth - 1];
> -}
> -
>  bool CUData::hasEqualMotion(uint32_t absPartIdx, const CUData& candCU,
> uint32_t candAbsPartIdx) const
>  {
>      if (m_interDir[absPartIdx] != candCU.m_interDir[candAbsPartIdx])
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/cudata.h
> --- a/source/common/cudata.h    Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/common/cudata.h    Tue Dec 23 14:49:59 2014 +0900
> @@ -212,7 +212,6 @@
>
>      void     getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList)
> const;
>      int      getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t*
> intraDirPred) const;
> -    void     deriveLeftRightTopIdxAdi(uint32_t& partIdxLT, uint32_t&
> partIdxRT, uint32_t partOffset, uint32_t partDepth) const;
>
>      uint32_t getSCUAddr() const                  { return (m_cuAddr <<
> g_maxFullDepth * 2) + m_absIdxInCTU; }
>      uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/predict.cpp
> --- a/source/common/predict.cpp Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/common/predict.cpp Tue Dec 23 14:49:59 2014 +0900
> @@ -654,11 +654,8 @@
>      }
>  }
>
> -void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom,
> uint32_t absPartIdx, uint32_t partDepth, int dirMode)
> +void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom,
> uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
>  {
> -    IntraNeighbors intraNeighbors;
> -    initIntraNeighbors(cu, absPartIdx, partDepth, true, &intraNeighbors);
> -
>      pixel* adiBuf      = m_predBuf;
>      pixel* refAbove    = m_refAbove;
>      pixel* refLeft     = m_refLeft;
> @@ -700,12 +697,12 @@
>              int refTL = refAbove[0];
>              int refTR = refAbove[trSize2];
>              bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize])
> < threshold &&
> -                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
> +                                abs(refTL + refTR - 2 * refAbove[trSize])
> < threshold);
>
>              if (bStrongSmoothing)
>              {
>                  // bilinear interpolation
> -                const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
> +                const int shift = 5 + 1; // log2TrSize + 1;
>                  int init = (refTL << shift) + tuSize;
>                  int delta;
>
> @@ -738,10 +735,8 @@
>      }
>  }
>
> -void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId)
> +void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom&
> cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t
> chromaId)
>  {
> -    IntraNeighbors intraNeighbors;
> -    initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
>      uint32_t tuSize = intraNeighbors.tuSize;
>
>      const pixel* adiOrigin =
> cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr,
> cuGeom.encodeIdx + absPartIdx);
> @@ -751,9 +746,9 @@
>      fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
>  }
>
> -void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx,
> uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
> +void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx,
> uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
>  {
> -    uint32_t log2TrSize = cu.m_log2CUSize[0] - partDepth;
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>      int log2UnitWidth = LOG2_UNIT_SIZE;
>      int log2UnitHeight = LOG2_UNIT_SIZE;
>
> @@ -764,12 +759,12 @@
>          log2UnitHeight -= cu.m_vChromaShift;
>      }
>
> -    int   numIntraNeighbor = 0;
> +    int numIntraNeighbor;
>      bool* bNeighborFlags = intraNeighbors->bNeighborFlags;
>
> -    uint32_t partIdxLT, partIdxRT, partIdxLB;
> -
> -    cu.deriveLeftRightTopIdxAdi(partIdxLT, partIdxRT, absPartIdx,
> partDepth);
> +    uint32_t numPartInWidth = 1 << (cu.m_log2CUSize[0] - LOG2_UNIT_SIZE -
> tuDepth);
> +    uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
> +    uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] +
> numPartInWidth - 1];
>
>      uint32_t tuSize = 1 << log2TrSize;
>      int  tuWidthInUnits = tuSize >> log2UnitWidth;
> @@ -777,14 +772,26 @@
>      int  aboveUnits = tuWidthInUnits << 1;
>      int  leftUnits = tuHeightInUnits << 1;
>      int  partIdxStride = cu.m_slice->m_sps->numPartInCUSize;
> -    partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] +
> ((tuHeightInUnits - 1) * partIdxStride)];
> +    uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] +
> ((tuHeightInUnits - 1) * partIdxStride)];
>
> -    bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT);
> -    numIntraNeighbor += (int)(bNeighborFlags[leftUnits]);
> -    numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT,
> (bNeighborFlags + leftUnits + 1));
> -    numIntraNeighbor += isAboveRightAvailable(cu, partIdxLT, partIdxRT,
> (bNeighborFlags + leftUnits + 1 + tuWidthInUnits));
> -    numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB,
> (bNeighborFlags + leftUnits - 1));
> -    numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLT, partIdxLB,
> (bNeighborFlags + leftUnits - 1 - tuHeightInUnits));
> +    if (cu.m_slice->isIntra() ||
> !cu.m_slice->m_pps->bConstrainedIntraPred)
> +    {
> +        bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT);
> +        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
> +        numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT,
> bNeighborFlags + leftUnits + 1);
> +        numIntraNeighbor += isAboveRightAvailable(cu, partIdxRT,
> bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
> +        numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB,
> bNeighborFlags + leftUnits - 1);
> +        numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLB,
> bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
> +    }
> +    else
> +    {
> +        bNeighborFlags[leftUnits] = isAboveLeftAvailableCIP(cu,
> partIdxLT);
> +        numIntraNeighbor  = (int)(bNeighborFlags[leftUnits]);
> +        numIntraNeighbor += isAboveAvailableCIP(cu, partIdxLT, partIdxRT,
> bNeighborFlags + leftUnits + 1);
> +        numIntraNeighbor += isAboveRightAvailableCIP(cu, partIdxRT,
> bNeighborFlags + leftUnits + 1 + tuWidthInUnits, tuWidthInUnits);
> +        numIntraNeighbor += isLeftAvailableCIP(cu, partIdxLT, partIdxLB,
> bNeighborFlags + leftUnits - 1);
> +        numIntraNeighbor += isBelowLeftAvailableCIP(cu, partIdxLB,
> bNeighborFlags + tuHeightInUnits - 1, tuHeightInUnits);
> +    }
>
>      intraNeighbors->numIntraNeighbor = numIntraNeighbor;
>      intraNeighbors->totalUnits = aboveUnits + leftUnits + 1;
> @@ -793,7 +800,6 @@
>      intraNeighbors->unitWidth = 1 << log2UnitWidth;
>      intraNeighbors->unitHeight = 1 << log2UnitHeight;
>      intraNeighbors->tuSize = tuSize;
> -    intraNeighbors->log2TrSize = log2TrSize;
>  }
>
>  void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t
> picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
> @@ -953,33 +959,27 @@
>      uint32_t partAboveLeft;
>      const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft,
> partIdxLT);
>
> -    if (!cu.m_slice->m_pps->bConstrainedIntraPred)
> -        return cuAboveLeft ? true : false;
> -    else
> -        return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
> +    return !!cuAboveLeft;
>  }
>
>  int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxRT, bool* bValidFlags)
>  {
>      const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
> -    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1;
> +    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
>      const uint32_t idxStep = 1;
> -    bool* validFlagPtr = bValidFlags;
>      int numIntra = 0;
>
> -    for (uint32_t rasterPart = rasterPartBegin; rasterPart <
> rasterPartEnd; rasterPart += idxStep)
> +    for (uint32_t rasterPart = rasterPartBegin; rasterPart <=
> rasterPartEnd; rasterPart += idxStep, bValidFlags++)
>      {
>          uint32_t partAbove;
>          const CUData* cuAbove = cu.getPUAbove(partAbove,
> g_rasterToZscan[rasterPart]);
> -        if (cuAbove && (!cu.m_slice->m_pps->bConstrainedIntraPred ||
> cuAbove->isIntra(partAbove)))
> +        if (cuAbove)
>          {
>              numIntra++;
> -            *validFlagPtr = true;
> +            *bValidFlags = true;
>          }
>          else
> -            *validFlagPtr = false;
> -
> -        validFlagPtr++;
> +            *bValidFlags = false;
>      }
>
>      return numIntra;
> @@ -988,73 +988,156 @@
>  int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxLB, bool* bValidFlags)
>  {
>      const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
> -    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1;
> +    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
>      const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
> -    bool* validFlagPtr = bValidFlags;
>      int numIntra = 0;
>
> -    for (uint32_t rasterPart = rasterPartBegin; rasterPart <
> rasterPartEnd; rasterPart += idxStep)
> +    for (uint32_t rasterPart = rasterPartBegin; rasterPart <=
> rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
>      {
>          uint32_t partLeft;
>          const CUData* cuLeft = cu.getPULeft(partLeft,
> g_rasterToZscan[rasterPart]);
> -        if (cuLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred ||
> cuLeft->isIntra(partLeft)))
> +        if (cuLeft)
>          {
>              numIntra++;
> -            *validFlagPtr = true;
> +            *bValidFlags = true;
>          }
>          else
> -            *validFlagPtr = false;
> -
> -        validFlagPtr--; // opposite direction
> +            *bValidFlags = false;
>      }
>
>      return numIntra;
>  }
>
> -int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxRT, bool* bValidFlags)
> +int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT,
> bool* bValidFlags, uint32_t numUnits)
>  {
> -    const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] -
> g_zscanToRaster[partIdxLT] + 1;
> -    bool* validFlagPtr = bValidFlags;
>      int numIntra = 0;
>
> -    for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
> +    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
>      {
>          uint32_t partAboveRight;
>          const CUData* cuAboveRight =
> cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
> -        if (cuAboveRight && (!cu.m_slice->m_pps->bConstrainedIntraPred ||
> cuAboveRight->isIntra(partAboveRight)))
> +        if (cuAboveRight)
>          {
>              numIntra++;
> -            *validFlagPtr = true;
> +            *bValidFlags = true;
>          }
>          else
> -            *validFlagPtr = false;
> -
> -        validFlagPtr++;
> +            *bValidFlags = false;
>      }
>
>      return numIntra;
>  }
>
> -int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxLB, bool* bValidFlags)
> +int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB,
> bool* bValidFlags, uint32_t numUnits)
>  {
> -    const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] -
> g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1;
> -    bool* validFlagPtr = bValidFlags;
>      int numIntra = 0;
>
> -    for (uint32_t offset = 1; offset <= numUnitsInPU; offset++)
> +    for (uint32_t offset = 1; offset <= numUnits; offset++,
> bValidFlags--) // opposite direction
>      {
>          uint32_t partBelowLeft;
>          const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft,
> partIdxLB, offset);
> -        if (cuBelowLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred ||
> cuBelowLeft->isIntra(partBelowLeft)))
> +        if (cuBelowLeft)
>          {
>              numIntra++;
> -            *validFlagPtr = true;
> +            *bValidFlags = true;
>          }
>          else
> -            *validFlagPtr = false;
> -
> -        validFlagPtr--; // opposite direction
> +            *bValidFlags = false;
>      }
>
>      return numIntra;
>  }
> +
> +bool Predict::isAboveLeftAvailableCIP(const CUData& cu, uint32_t
> partIdxLT)
> +{
> +    uint32_t partAboveLeft;
> +    const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft,
> partIdxLT);
> +
> +    return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft);
> +}
> +
> +int Predict::isAboveAvailableCIP(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxRT, bool* bValidFlags)
> +{
> +    const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
> +    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT];
> +    const uint32_t idxStep = 1;
> +    int numIntra = 0;
> +
> +    for (uint32_t rasterPart = rasterPartBegin; rasterPart <=
> rasterPartEnd; rasterPart += idxStep, bValidFlags++)
> +    {
> +        uint32_t partAbove;
> +        const CUData* cuAbove = cu.getPUAbove(partAbove,
> g_rasterToZscan[rasterPart]);
> +        if (cuAbove && cuAbove->isIntra(partAbove))
> +        {
> +            numIntra++;
> +            *bValidFlags = true;
> +        }
> +        else
> +            *bValidFlags = false;
> +    }
> +
> +    return numIntra;
> +}
> +
> +int Predict::isLeftAvailableCIP(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxLB, bool* bValidFlags)
> +{
> +    const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
> +    const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
> +    const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
> +    int numIntra = 0;
> +
> +    for (uint32_t rasterPart = rasterPartBegin; rasterPart <=
> rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction
> +    {
> +        uint32_t partLeft;
> +        const CUData* cuLeft = cu.getPULeft(partLeft,
> g_rasterToZscan[rasterPart]);
> +        if (cuLeft && cuLeft->isIntra(partLeft))
> +        {
> +            numIntra++;
> +            *bValidFlags = true;
> +        }
> +        else
> +            *bValidFlags = false;
> +    }
> +
> +    return numIntra;
> +}
> +
> +int Predict::isAboveRightAvailableCIP(const CUData& cu, uint32_t
> partIdxRT, bool* bValidFlags, uint32_t numUnits)
> +{
> +    int numIntra = 0;
> +
> +    for (uint32_t offset = 1; offset <= numUnits; offset++, bValidFlags++)
> +    {
> +        uint32_t partAboveRight;
> +        const CUData* cuAboveRight =
> cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset);
> +        if (cuAboveRight && cuAboveRight->isIntra(partAboveRight))
> +        {
> +            numIntra++;
> +            *bValidFlags = true;
> +        }
> +        else
> +            *bValidFlags = false;
> +    }
> +
> +    return numIntra;
> +}
> +
> +int Predict::isBelowLeftAvailableCIP(const CUData& cu, uint32_t
> partIdxLB, bool* bValidFlags, uint32_t numUnits)
> +{
> +    int numIntra = 0;
> +
> +    for (uint32_t offset = 1; offset <= numUnits; offset++,
> bValidFlags--) // opposite direction
> +    {
> +        uint32_t partBelowLeft;
> +        const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft,
> partIdxLB, offset);
> +        if (cuBelowLeft && cuBelowLeft->isIntra(partBelowLeft))
> +        {
> +            numIntra++;
> +            *bValidFlags = true;
> +        }
> +        else
> +            *bValidFlags = false;
> +    }
> +
> +    return numIntra;
> +}
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/common/predict.h
> --- a/source/common/predict.h   Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/common/predict.h   Tue Dec 23 14:49:59 2014 +0900
> @@ -57,7 +57,6 @@
>          int      unitWidth;
>          int      unitHeight;
>          int      tuSize;
> -        uint32_t log2TrSize;
>          bool     bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
>      };
>
> @@ -105,14 +104,20 @@
>      void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const
> WeightValues wp[3], bool bLuma, bool bChroma) const;
>
>      /* Intra prediction helper functions */
> -    static void initIntraNeighbors(const CUData& cu, uint32_t
> zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors
> *IntraNeighbors);
> +    static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx,
> uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
>      static void fillReferenceSamples(const pixel* adiOrigin, intptr_t
> picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);
>
>      static bool isAboveLeftAvailable(const CUData& cu, uint32_t
> partIdxLT);
>      static int  isAboveAvailable(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxRT, bool* bValidFlags);
>      static int  isLeftAvailable(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxLB, bool* bValidFlags);
> -    static int  isAboveRightAvailable(const CUData& cu, uint32_t
> partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
> -    static int  isBelowLeftAvailable(const CUData& cu, uint32_t
> partIdxLT, uint32_t partIdxLB, bool* bValidFlags);
> +    static int  isAboveRightAvailable(const CUData& cu, uint32_t
> partIdxRT, bool* bValidFlags, uint32_t numUnits);
> +    static int  isBelowLeftAvailable(const CUData& cu, uint32_t
> partIdxLB, bool* bValidFlags, uint32_t numUnits);
> +
> +    static bool isAboveLeftAvailableCIP(const CUData& cu, uint32_t
> partIdxLT);
> +    static int  isAboveAvailableCIP(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxRT, bool* bValidFlags);
> +    static int  isLeftAvailableCIP(const CUData& cu, uint32_t partIdxLT,
> uint32_t partIdxLB, bool* bValidFlags);
> +    static int  isAboveRightAvailableCIP(const CUData& cu, uint32_t
> partIdxRT, bool* bValidFlags, uint32_t numUnits);
> +    static int  isBelowLeftAvailableCIP(const CUData& cu, uint32_t
> partIdxLB, bool* bValidFlags, uint32_t numUnits);
>
>  public:
>
> @@ -125,8 +130,8 @@
>      void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride,
> uint32_t log2TrSize);
>      void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred,
> intptr_t stride, uint32_t log2TrSizeC, int chFmt);
>
> -    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t
> absPartIdx, uint32_t partDepth, int dirMode);
> -    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom,
> uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId);
> +    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t
> absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
> +    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom,
> uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t
> chromaId);
>      pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize)
>      {
>          return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE *
> (tuSize * 2 + 1));
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp       Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/encoder/analysis.cpp       Tue Dec 23 14:49:59 2014 +0900
> @@ -914,7 +914,7 @@
>                          cu.getInterTUQtDepthRange(tuDepthRange, 0);
>
>
>  m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv,
> md.bestMode->predYuv, cuGeom.log2CUSize);
> -                        residualTransformQuantInter(*md.bestMode, cuGeom,
> 0, cuGeom.depth, tuDepthRange);
> +                        residualTransformQuantInter(*md.bestMode, cuGeom,
> 0, 0, tuDepthRange);
>                          if (cu.getQtRootCbf(0))
>
>  md.bestMode->reconYuv.addClip(md.bestMode->predYuv,
> m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
>                          else
> @@ -938,8 +938,7 @@
>                          uint32_t tuDepthRange[2];
>                          cu.getIntraTUQtDepthRange(tuDepthRange, 0);
>
> -                        uint32_t initTuDepth = cu.m_partSize[0] !=
> SIZE_2Nx2N;
> -                        residualTransformQuantIntra(*md.bestMode, cuGeom,
> initTuDepth, 0, tuDepthRange);
> +                        residualTransformQuantIntra(*md.bestMode, cuGeom,
> 0, 0, tuDepthRange);
>                          getBestIntraModeChroma(*md.bestMode, cuGeom);
>                          residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
>
>  md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr,
> cuGeom.encodeIdx); // TODO:
> @@ -1702,8 +1701,7 @@
>          uint32_t tuDepthRange[2];
>          cu.getIntraTUQtDepthRange(tuDepthRange, 0);
>
> -        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
> -        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0,
> tuDepthRange);
> +        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0,
> tuDepthRange);
>          getBestIntraModeChroma(*bestMode, cuGeom);
>          residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
>      }
> @@ -1736,7 +1734,7 @@
>          uint32_t tuDepthRange[2];
>          cu.getInterTUQtDepthRange(tuDepthRange, 0);
>
> -        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth,
> tuDepthRange);
> +        residualTransformQuantInter(*bestMode, cuGeom, 0, 0,
> tuDepthRange);
>
>          if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N &&
> !cu.getQtRootCbf(0))
>              cu.setPredModeSubParts(MODE_SKIP);
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/encoder/search.cpp
> --- a/source/encoder/search.cpp Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/encoder/search.cpp Tue Dec 23 14:49:59 2014 +0900
> @@ -239,7 +239,8 @@
>
>  void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t
> tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const
> uint32_t depthRange[2])
>  {
> -    uint32_t fullDepth  = mode.cu.m_cuDepth[0] + tuDepth;
> +    CUData& cu = mode.cu;
> +    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
>      uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
>      uint32_t qtLayer    = log2TrSize - 2;
>      uint32_t sizeIdx    = log2TrSize - 2;
> @@ -253,8 +254,6 @@
>          mightSplit = true;
>      }
>
> -    CUData& cu = mode.cu;
> -
>      Cost fullCost;
>      uint32_t bCBF = 0;
>
> @@ -273,7 +272,9 @@
>
>          // init availability pattern
>          uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
> -        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
> +        IntraNeighbors intraNeighbors;
> +        initIntraNeighbors(cu, absPartIdx, tuDepth, true,
> &intraNeighbors);
> +        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors,
> lumaPredMode);
>
>          // get prediction signal
>          predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
> @@ -365,7 +366,7 @@
>              m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep
> state of split encode
>          }
>
> -        // code split block
> +        /* code split block */
>          uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
>
>          int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled &&
> (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
> @@ -451,11 +452,13 @@
>      pixel*   pred = predYuv->getLumaAddr(absPartIdx);
>      int16_t* residual =
> m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
>      uint32_t stride = fencYuv->m_size;
> -    int      sizeIdx = log2TrSize - 2;
> +    uint32_t sizeIdx = log2TrSize - 2;
>
>      // init availability pattern
>      uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
> -    initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
> +    IntraNeighbors intraNeighbors;
> +    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
> +    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
>
>      // get prediction signal
>      predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
> @@ -597,13 +600,12 @@
>  }
>
>  /* fast luma intra residual generation. Only perform the minimum number
> of TU splits required by the CU size */
> -void Search::residualTransformQuantIntra(Mode& mode, const CUGeom&
> cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
> +void Search::residualTransformQuantIntra(Mode& mode, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
>  {
>      CUData& cu = mode.cu;
> -
> -    uint32_t fullDepth   = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
> -    bool     bCheckFull  = log2TrSize <= depthRange[1];
> +    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
> +    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    bool     bCheckFull = log2TrSize <= depthRange[1];
>
>      X265_CHECK(m_slice->m_sliceType != I_SLICE,
> "residualTransformQuantIntra not intended for I slices\n");
>
> @@ -614,28 +616,36 @@
>
>      if (bCheckFull)
>      {
> -        const pixel* fenc  = mode.fencYuv->getLumaAddr(absPartIdx);
> -        pixel*   pred      = mode.predYuv.getLumaAddr(absPartIdx);
> -        int16_t* residual  =
> m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
> +        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
> +        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
> +        int16_t* residual =
> m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
> +        uint32_t stride   = mode.fencYuv->m_size;
> +
> +        // init availability pattern
> +        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
> +        IntraNeighbors intraNeighbors;
> +        initIntraNeighbors(cu, absPartIdx, tuDepth, true,
> &intraNeighbors);
> +        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors,
> lumaPredMode);
> +
> +        // get prediction signal
> +        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
> +
> +        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx],
> "unexpected tskip flag in residualTransformQuantIntra\n");
> +        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
> +
> +        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
> +        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
> +
> +        uint32_t sizeIdx   = log2TrSize - 2;
> +        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
> +
>          pixel*   picReconY =
> m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx +
> absPartIdx);
>          intptr_t picStride = m_frame->m_reconPic->m_stride;
> -        uint32_t stride    = mode.fencYuv->m_size;
> -        uint32_t sizeIdx   = log2TrSize - 2;
> -        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
> -        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
> -        coeff_t* coeff        = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
> -
> -        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
> -        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
> -
> -        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx],
> "unexpected tskip flag in residualTransformQuantIntra\n");
> -        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
> -
> -        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
> -        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride,
> residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
> +
> +        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride,
> residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
>          if (numSig)
>          {
> -            m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual,
> stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
> +            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride,
> coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
>              primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred,
> residual, stride, stride);
>              cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx,
> fullDepth);
>          }
> @@ -654,11 +664,11 @@
>          uint32_t cbf = 0;
>          for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx,
> qPartIdx += qNumParts)
>          {
> -            residualTransformQuantIntra(mode, cuGeom, tuDepth + 1,
> qPartIdx, depthRange);
> +            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth +
> 1, depthRange);
>              cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
>          }
>          for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
> -            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
> +            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
>      }
>  }
>
> @@ -739,15 +749,14 @@
>          }
>          for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
>          {
> -            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU <<
> tuDepth);
> -            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV <<
> tuDepth);
> +            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
> +            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
>          }
>
>          return outDist;
>      }
>
>      uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> -
>      uint32_t tuDepthC = tuDepth;
>      if (log2TrSizeC < 2)
>      {
> @@ -766,46 +775,48 @@
>      if (checkTransformSkip)
>          return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC,
> absPartIdx, psyEnergy);
>
> +    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
>      uint32_t qtLayer = log2TrSize - 2;
>      uint32_t tuSize = 1 << log2TrSizeC;
> +    uint32_t stride = mode.fencYuv->m_csize;
> +    const uint32_t sizeIdxC = log2TrSizeC - 2;
>      uint32_t outDist = 0;
>
>      uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +
> tuDepthC) << 1);
>      const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT
> : DONT_SPLIT;
>
> -    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V;
> chromaId++)
> +    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> +    do
>      {
> -        TextType ttype = (TextType)chromaId;
> -
> -        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> -        do
> +        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
> +
> +        IntraNeighbors intraNeighbors;
> +        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false,
> &intraNeighbors);
> +
> +        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <=
> TEXT_CHROMA_V; chromaId++)
>          {
> -            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
> +            TextType ttype = (TextType)chromaId;
>
>              const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId,
> absPartIdxC);
>              pixel*   pred     = mode.predYuv.getChromaAddr(chromaId,
> absPartIdxC);
> -            int16_t* residual =
> m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
> -            uint32_t stride   = mode.fencYuv->m_csize;
> -            uint32_t sizeIdxC = log2TrSizeC - 2;
> -
> +            int16_t* residual = resiYuv.getChromaAddr(chromaId,
> absPartIdxC);
>              uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 -
> (m_hChromaShift + m_vChromaShift));
>              coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] +
> coeffOffsetC;
>              pixel*   reconQt       =
> m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
>              uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
> -
>              pixel*   picReconC =
> m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx
> + absPartIdxC);
>              intptr_t picStride = m_frame->m_reconPic->m_strideC;
>
> -            // init availability pattern
> -            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC,
> chromaId);
> -            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
> -
>              uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
>              if (chromaPredMode == DM_CHROMA_IDX)
>                  chromaPredMode = cu.m_lumaIntraDir[(m_csp ==
> X265_CSP_I444) ? absPartIdxC : 0];
>              if (m_csp == X265_CSP_I422)
>                  chromaPredMode =
> g_chroma422IntraAngleMappingTable[chromaPredMode];
>
> +            // init availability pattern
> +            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors,
> chromaId);
> +            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
> +
>              // get prediction signal
>              predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
> log2TrSizeC, m_csp);
>
> @@ -813,7 +824,6 @@
>
>              primitives.calcresidual[sizeIdxC](fenc, pred, residual,
> stride);
>              uint32_t numSig = m_quant.transformNxN(cu, fenc, stride,
> residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
> -            uint32_t tmpDist;
>              if (numSig)
>              {
>                  m_quant.invtransformNxN(cu.m_tqBypass[0], residual,
> stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
> @@ -827,7 +837,7 @@
>                  cu.setCbfPartRange(0, ttype, absPartIdxC,
> tuIterator.absPartIdxStep);
>              }
>
> -            tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride,
> fenc, stride);
> +            uint32_t tmpDist = primitives.sse_pp[sizeIdxC](reconQt,
> reconQtStride, fenc, stride);
>              outDist += (ttype == TEXT_CHROMA_U) ?
> m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
>
>              if (m_rdCost.m_psyRd)
> @@ -835,10 +845,13 @@
>
>              primitives.luma_copy_pp[sizeIdxC](picReconC, picStride,
> reconQt, reconQtStride);
>          }
> -        while (tuIterator.isNextSection());
> -
> -        if (splitType == VERTICAL_SPLIT)
> -            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
> +    }
> +    while (tuIterator.isNextSection());
> +
> +    if (splitType == VERTICAL_SPLIT)
> +    {
> +        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
> +        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
>      }
>
>      return outDist;
> @@ -866,14 +879,17 @@
>      uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +
> tuDepthC) << 1);
>      const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT
> : DONT_SPLIT;
>
> -    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V;
> chromaId++)
> +    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> +    do
>      {
> -        TextType ttype = (TextType)chromaId;
> -
> -        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> -        do
> +        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
> +
> +        IntraNeighbors intraNeighbors;
> +        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false,
> &intraNeighbors);
> +
> +        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <=
> TEXT_CHROMA_V; chromaId++)
>          {
> -            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
> +            TextType ttype = (TextType)chromaId;
>
>              const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId,
> absPartIdxC);
>              pixel*   pred = mode.predYuv.getChromaAddr(chromaId,
> absPartIdxC);
> @@ -887,7 +903,7 @@
>              uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
>
>              // init availability pattern
> -            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC,
> chromaId);
> +            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors,
> chromaId);
>              pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
>
>              uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
> @@ -980,10 +996,13 @@
>              outDist += bDist;
>              psyEnergy += bEnergy;
>          }
> -        while (tuIterator.isNextSection());
> -
> -        if (splitType == VERTICAL_SPLIT)
> -            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
> +    }
> +    while (tuIterator.isNextSection());
> +
> +    if (splitType == VERTICAL_SPLIT)
> +    {
> +        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
> +        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
>      }
>
>      m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
> @@ -1022,91 +1041,18 @@
>      }
>  }
>
> -void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom,
> uint32_t tuDepth, uint32_t absPartIdx)
> +void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom,
> uint32_t absPartIdx, uint32_t tuDepth)
>  {
>      CUData& cu = mode.cu;
> -    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> -
> -    if (tuDepth == cu.m_tuDepth[absPartIdx])
> -    {
> -        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> -        uint32_t tuDepthC = tuDepth;
> -        if (log2TrSizeC < 2)
> -        {
> -            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 &&
> tuDepth, "invalid tuDepth\n");
> -            if (absPartIdx & 3)
> -                return;
> -            log2TrSizeC = 2;
> -            tuDepthC--;
> -        }
> -
> -        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
> -        uint32_t tuSize = 1 << log2TrSizeC;
> -        uint32_t stride = mode.fencYuv->m_csize;
> -        const int sizeIdxC = log2TrSizeC - 2;
> -
> -        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +
> tuDepthC) << 1);
> -        const SplitType splitType = (m_csp == X265_CSP_I422) ?
> VERTICAL_SPLIT : DONT_SPLIT;
> -
> -        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <=
> TEXT_CHROMA_V; chromaId++)
> -        {
> -            TextType ttype = (TextType)chromaId;
> -
> -            TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> -            do
> -            {
> -                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
> -
> -                const pixel*   fenc   =
> mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
> -                pixel*   pred         =
> mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
> -                int16_t* residual     = resiYuv.getChromaAddr(chromaId,
> absPartIdxC);
> -                pixel*   recon        =
> mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
> -                uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE *
> 2 - (m_hChromaShift + m_vChromaShift));
> -                coeff_t* coeff        = cu.m_trCoeff[ttype] +
> coeffOffsetC;
> -                pixel*   picReconC    =
> m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx
> + absPartIdxC);
> -                uint32_t picStride    = m_frame->m_reconPic->m_strideC;
> -
> -                uint32_t chromaPredMode =
> cu.m_chromaIntraDir[absPartIdxC];
> -                if (chromaPredMode == DM_CHROMA_IDX)
> -                    chromaPredMode = cu.m_lumaIntraDir[(m_csp ==
> X265_CSP_I444) ? absPartIdxC : 0];
> -                chromaPredMode = (m_csp == X265_CSP_I422) ?
> g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
> -                initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC,
> chromaId);
> -                pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
> -
> -                predIntraChromaAng(chromaPred, chromaPredMode, pred,
> stride, log2TrSizeC, m_csp);
> -
> -                X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip
> not supported at low RD levels\n");
> -
> -                primitives.calcresidual[sizeIdxC](fenc, pred, residual,
> stride);
> -                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride,
> residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
> -                if (numSig)
> -                {
> -                    m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC],
> residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
> -                    primitives.luma_add_ps[sizeIdxC](recon, stride, pred,
> residual, stride, stride);
> -                    primitives.luma_copy_pp[sizeIdxC](picReconC,
> picStride, recon, stride);
> -                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC,
> tuIterator.absPartIdxStep);
> -                }
> -                else
> -                {
> -                    primitives.luma_copy_pp[sizeIdxC](recon, stride,
> pred, stride);
> -                    primitives.luma_copy_pp[sizeIdxC](picReconC,
> picStride, pred, stride);
> -                    cu.setCbfPartRange(0, ttype, absPartIdxC,
> tuIterator.absPartIdxStep);
> -                }
> -            }
> -            while (tuIterator.isNextSection());
> -
> -            if (splitType == VERTICAL_SPLIT)
> -                offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth,
> absPartIdx);
> -        }
> -    }
> -    else
> +    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
> +
> +    if (tuDepth < cu.m_tuDepth[absPartIdx])
>      {
>          uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
>          uint32_t splitCbfU = 0, splitCbfV = 0;
>          for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx,
> qPartIdx += qNumParts)
>          {
> -            residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
> +            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
>              splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
>              splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
>          }
> @@ -1115,12 +1061,91 @@
>              cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
>              cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
>          }
> +
> +        return;
> +    }
> +
> +    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> +    uint32_t tuDepthC = tuDepth;
> +    if (log2TrSizeC < 2)
> +    {
> +        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth,
> "invalid tuDepth\n");
> +        if (absPartIdx & 3)
> +            return;
> +        log2TrSizeC = 2;
> +        tuDepthC--;
> +    }
> +
> +    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
> +    uint32_t tuSize = 1 << log2TrSizeC;
> +    uint32_t stride = mode.fencYuv->m_csize;
> +    const uint32_t sizeIdxC = log2TrSizeC - 2;
> +
> +    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +
> tuDepthC) << 1);
> +    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT
> : DONT_SPLIT;
> +
> +    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> +    do
> +    {
> +        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
> +
> +        IntraNeighbors intraNeighbors;
> +        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false,
> &intraNeighbors);
> +
> +        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <=
> TEXT_CHROMA_V; chromaId++)
> +        {
> +            TextType ttype = (TextType)chromaId;
> +
> +            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId,
> absPartIdxC);
> +            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId,
> absPartIdxC);
> +            int16_t* residual = resiYuv.getChromaAddr(chromaId,
> absPartIdxC);
> +            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 -
> (m_hChromaShift + m_vChromaShift));
> +            coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
> +            pixel*   picReconC =
> m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx
> + absPartIdxC);
> +            intptr_t picStride = m_frame->m_reconPic->m_strideC;
> +
> +            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
> +            if (chromaPredMode == DM_CHROMA_IDX)
> +                chromaPredMode = cu.m_lumaIntraDir[(m_csp ==
> X265_CSP_I444) ? absPartIdxC : 0];
> +            if (m_csp == X265_CSP_I422)
> +                chromaPredMode =
> g_chroma422IntraAngleMappingTable[chromaPredMode];
> +
> +            // init availability pattern
> +            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors,
> chromaId);
> +            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
> +
> +            // get prediction signal
> +            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
> log2TrSizeC, m_csp);
> +
> +            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not
> supported at low RD levels\n");
> +
> +            primitives.calcresidual[sizeIdxC](fenc, pred, residual,
> stride);
> +            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride,
> residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
> +            if (numSig)
> +            {
> +                m_quant.invtransformNxN(cu.m_tqBypass[0], residual,
> stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
> +                primitives.luma_add_ps[sizeIdxC](picReconC, picStride,
> pred, residual, stride, stride);
> +                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC,
> tuIterator.absPartIdxStep);
> +            }
> +            else
> +            {
> +                // no coded residual, recon = pred
> +                primitives.luma_copy_pp[sizeIdxC](picReconC, picStride,
> pred, stride);
> +                cu.setCbfPartRange(0, ttype, absPartIdxC,
> tuIterator.absPartIdxStep);
> +            }
> +        }
> +    }
> +    while (tuIterator.isNextSection());
> +
> +    if (splitType == VERTICAL_SPLIT)
> +    {
> +        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
> +        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
>      }
>  }
>
>  void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize
> partSize, uint8_t* sharedModes)
>  {
> -    uint32_t depth = cuGeom.depth;
>      CUData& cu = intraMode.cu;
>
>      cu.setPartSizeSubParts(partSize);
> @@ -1143,7 +1168,7 @@
>          m_entropyCoder.codePredMode(cu.m_predMode[0]);
>      }
>
> -    m_entropyCoder.codePartSize(cu, 0, depth);
> +    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
>      m_entropyCoder.codePredInfo(cu, 0);
>      intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
>
> @@ -1153,7 +1178,10 @@
>      intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
>      intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
>      if (m_rdCost.m_psyRd)
> -        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2,
> intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size,
> intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
> +    {
> +        const Yuv* fencYuv = intraMode.fencYuv;
> +        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2,
> fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0],
> intraMode.reconYuv.m_size);
> +    }
>
>      updateModeCost(intraMode);
>  }
> @@ -1174,7 +1202,9 @@
>      const uint32_t absPartIdx = 0;
>
>      // Reference sample smoothing
> -    initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
> +    IntraNeighbors intraNeighbors;
> +    initIntraNeighbors(cu, absPartIdx, initTuDepth, true,
> &intraNeighbors);
> +    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
>
>      const pixel* fenc = intraMode.fencYuv->m_buf[0];
>      uint32_t stride = intraMode.fencYuv->m_size;
> @@ -1335,7 +1365,6 @@
>  {
>      CUData& cu = intraMode.cu;
>      Yuv* reconYuv = &intraMode.reconYuv;
> -    const Yuv* fencYuv = intraMode.fencYuv;
>
>      X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does
> not expect NxN intra\n");
>      X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect
> to be used in I slices\n");
> @@ -1369,7 +1398,10 @@
>      intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
>      intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
>      if (m_rdCost.m_psyRd)
> +    {
> +        const Yuv* fencYuv = intraMode.fencYuv;
>          intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2,
> fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
> +    }
>
>      m_entropyCoder.store(intraMode.contexts);
>      updateModeCost(intraMode);
> @@ -1404,7 +1436,9 @@
>          else
>          {
>              // Reference sample smoothing
> -            initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
> +            IntraNeighbors intraNeighbors;
> +            initIntraNeighbors(cu, absPartIdx, initTuDepth, true,
> &intraNeighbors);
> +            initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors,
> ALL_IDX);
>
>              // determine set of modes to be tested (using prediction
> signal only)
>              const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
> @@ -1602,8 +1636,10 @@
>          log2TrSizeC = 5;
>      }
>
> -    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
> -    Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
> +    IntraNeighbors intraNeighbors;
> +    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
> +    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 1); // U
> +    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 2); // V
>      cu.getAllowedChromaDir(0, modeList);
>
>      // check chroma modes
> @@ -2581,16 +2617,16 @@
>      updateModeCost(interMode);
>  }
>
> -void Search::residualTransformQuantInter(Mode& mode, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
> +void Search::residualTransformQuantInter(Mode& mode, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
>  {
> +    uint32_t depth = cuGeom.depth + tuDepth;
>      CUData& cu = mode.cu;
>      X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid
> depth\n");
>
>      uint32_t log2TrSize = g_maxLog2CUSize - depth;
> -    uint32_t tuDepth = depth - cu.m_cuDepth[0];
>
>      bool bCheckFull = log2TrSize <= depthRange[1];
> -    if (cu.m_partSize[0] != SIZE_2Nx2N && depth ==
> cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
> +    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize >
> depthRange[0])
>          bCheckFull = false;
>
>      if (bCheckFull)
> @@ -2611,7 +2647,7 @@
>          uint32_t setCbf = 1 << tuDepth;
>
>          uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
> -        coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
> +        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
>
>          uint32_t sizeIdx  = log2TrSize  - 2;
>
> @@ -2644,8 +2680,8 @@
>              uint32_t strideResiC = resiYuv.m_csize;
>
>              uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift +
> m_vChromaShift);
> -            coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
> -            coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
> +            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
> +            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
>              bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
>
>              TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT :
> DONT_SPLIT, absPartIdxStep, absPartIdx);
> @@ -2702,16 +2738,16 @@
>          uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
>          for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx,
> qPartIdx += qNumParts)
>          {
> -            residualTransformQuantInter(mode, cuGeom, qPartIdx, depth +
> 1, depthRange);
> -            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
> +            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth +
> 1, depthRange);
> +            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
>              ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
>              vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
>          }
> -        for (uint32_t i = 0; i < 4 * qNumParts; i++)
> +        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
>          {
> -            cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
> -            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
> -            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
> +            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
> +            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
> +            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
>          }
>      }
>  }
> @@ -2769,7 +2805,7 @@
>
>      uint32_t trSize = 1 << log2TrSize;
>      const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
> -    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +
> tuDepthC) << 1);
> +    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +
> tuDepthC) << 1);
>      const Yuv* fencYuv = mode.fencYuv;
>
>      // code full block
> @@ -3127,16 +3163,19 @@
>          //Encode cbf flags
>          if (bCodeChroma)
>          {
> -            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <=
> TEXT_CHROMA_V; chromaId++)
> +            if (!splitIntoSubTUs)
>              {
> -                if (!splitIntoSubTUs)
> -                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0],
> tuDepth);
> -                else
> -                {
> -                    offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth,
> absPartIdx);
> -                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0],
> tuDepth);
> -                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1],
> tuDepth);
> -                }
> +                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0],
> tuDepth);
> +                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0],
> tuDepth);
> +            }
> +            else
> +            {
> +                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
> +                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
> +                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0],
> tuDepth);
> +                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1],
> tuDepth);
> +                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0],
> tuDepth);
> +                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1],
> tuDepth);
>              }
>          }
>
> diff -r 8d2f418829c8 -r 6b59452a17d7 source/encoder/search.h
> --- a/source/encoder/search.h   Sat Dec 20 21:27:14 2014 +0900
> +++ b/source/encoder/search.h   Tue Dec 23 14:49:59 2014 +0900
> @@ -178,9 +178,9 @@
>      void     encodeResAndCalcRdSkipCU(Mode& interMode);
>
>      // encode residual without rd-cost
> -    void     residualTransformQuantInter(Mode& mode, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]);
> -    void     residualTransformQuantIntra(Mode& mode, const CUGeom&
> cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t
> depthRange[2]);
> -    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom,
> uint32_t tuDepth, uint32_t absPartIdx);
> +    void     residualTransformQuantInter(Mode& mode, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t
> depthRange[2]);
> +    void     residualTransformQuantIntra(Mode& mode, const CUGeom&
> cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t
> depthRange[2]);
> +    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom,
> uint32_t absPartIdx, uint32_t tuDepth);
>
>      // pick be chroma mode from available using just sa8d costs
>      void     getBestIntraModeChroma(Mode& intraMode, const CUGeom&
> cuGeom);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141223/fdd1a704/attachment-0001.html>


More information about the x265-devel mailing list