[x265] refine tskip related

Tue Jun 10 19:58:06 CEST 2014

On Tue, Jun 10, 2014 at 4:56 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1402394075 -32400
> #      Tue Jun 10 18:54:35 2014 +0900
> # Node ID b6302b087ea414d52fe76050acd2889e34b352c8
> # Parent  0cbc7320c9f2904bb1439dca70fd278ea42ed5aa
> refine tskip related

Rebased on top of the quant patch and queued for testing.

> diff -r 0cbc7320c9f2 -r b6302b087ea4 source/Lib/TLibEncoder/TEncEntropy.cpp
> --- a/source/Lib/TLibEncoder/TEncEntropy.cpp    Mon Jun 09 11:34:11 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncEntropy.cpp    Tue Jun 10 18:54:35 2014 +0900
> @@ -211,7 +211,7 @@
>  void TEncEntropy::xEncodeTransform(TComDataCU* cu, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t tuSize, uint32_t trIdx, bool& bCodeDQP)
>  {
>      const uint32_t subdiv = cu->getTransformIdx(absPartIdx) + cu->getDepth(absPartIdx) > depth;
> -    const uint32_t log2TrafoSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> +    const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      uint32_t hChromaShift        = cu->getHorzChromaShift();
>      uint32_t vChromaShift        = cu->getVertChromaShift();
>      uint32_t cbfY = cu->getCbf(absPartIdx, TEXT_LUMA, trIdx);
> @@ -223,7 +223,7 @@
>          m_bakAbsPartIdxCU = absPartIdx;
>      }
>
> -    if ((log2TrafoSize == 2) && !(cu->getChromaFormat() == CHROMA_444))
> +    if ((log2TrSize == 2) && !(cu->getChromaFormat() == CHROMA_444))
>      {
>          uint32_t partNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
>          if ((absPartIdx & (partNum - 1)) == 0)
> @@ -244,7 +244,7 @@
>      }
>      else if (cu->getPredictionMode(absPartIdx) == MODE_INTER && (cu->getPartitionSize(absPartIdx) != SIZE_2Nx2N) && depth == cu->getDepth(absPartIdx) &&  (cu->getSlice()->getSPS()->getQuadtreeTUMaxDepthInter() == 1))
>      {
> -        if (log2TrafoSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
> +        if (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
>          {
>              X265_CHECK(subdiv, "subdivision state failure\n");
>          }
> @@ -253,22 +253,22 @@
>              X265_CHECK(!subdiv, "subdivision state failure\n");
>          }
>      }
> -    else if (log2TrafoSize > cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize())
> +    else if (log2TrSize > cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize())
>      {
>          X265_CHECK(subdiv, "subdivision state failure\n");
>      }
> -    else if (log2TrafoSize == cu->getSlice()->getSPS()->getQuadtreeTULog2MinSize())
> +    else if (log2TrSize == cu->getSlice()->getSPS()->getQuadtreeTULog2MinSize())
>      {
>          X265_CHECK(!subdiv, "subdivision state failure\n");
>      }
> -    else if (log2TrafoSize == cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
> +    else if (log2TrSize == cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
>      {
>          X265_CHECK(!subdiv, "subdivision state failure\n");
>      }
>      else
>      {
> -        X265_CHECK(log2TrafoSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx), "transform size failure\n");
> -        m_entropyCoderIf->codeTransformSubdivFlag(subdiv, 5 - log2TrafoSize);
> +        X265_CHECK(log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx), "transform size failure\n");
> +        m_entropyCoderIf->codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
>      }
>
>      const uint32_t trDepthCurr = depth - cu->getDepth(absPartIdx);
> @@ -365,12 +365,12 @@
>          }
>
>          int chFmt = cu->getChromaFormat();
> -        if ((log2TrafoSize == 2) && !(chFmt == CHROMA_444))
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
>              uint32_t partNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
>              if ((absPartIdx & (partNum - 1)) == (partNum - 1))
>              {
> -                uint32_t trSizeC           = 1 << log2TrafoSize;
> +                uint32_t trSizeC           = 1 << log2TrSize;
>                  const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>
>                  uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> diff -r 0cbc7320c9f2 -r b6302b087ea4 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jun 09 11:34:11 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Tue Jun 10 18:54:35 2014 +0900
> @@ -160,37 +160,37 @@
>      uint32_t fullDepth  = cu->getDepth(0) + trDepth;
>      uint32_t trMode     = cu->getTransformIdx(absPartIdx);
>      uint32_t subdiv     = (trMode > trDepth ? 1 : 0);
> -    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>
>      if (cu->getPredictionMode(0) == MODE_INTRA && cu->getPartitionSize(0) == SIZE_NxN && trDepth == 0)
>      {
>          X265_CHECK(subdiv, "subdivision not present\n");
>      }
> -    else if (trSizeLog2 > cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize())
> +    else if (log2TrSize > cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize())
>      {
>          X265_CHECK(subdiv, "subdivision not present\n");
>      }
> -    else if (trSizeLog2 == cu->getSlice()->getSPS()->getQuadtreeTULog2MinSize())
> +    else if (log2TrSize == cu->getSlice()->getSPS()->getQuadtreeTULog2MinSize())
>      {
>          X265_CHECK(!subdiv, "subdivision present\n");
>      }
> -    else if (trSizeLog2 == cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
> +    else if (log2TrSize == cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
>      {
>          X265_CHECK(!subdiv, "subdivision present\n");
>      }
>      else
>      {
> -        X265_CHECK(trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx), "transform size too small\n");
> +        X265_CHECK(log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx), "transform size too small\n");
>          if (bLuma)
>          {
> -            m_entropyCoder->encodeTransformSubdivFlag(subdiv, 5 - trSizeLog2);
> +            m_entropyCoder->encodeTransformSubdivFlag(subdiv, 5 - log2TrSize);
>          }
>      }
>
>      if (bChroma)
>      {
>          int      chFmt      = cu->getChromaFormat();
> -        if ((trSizeLog2 > 2) && !(chFmt == CHROMA_444))
> +        if ((log2TrSize > 2) && !(chFmt == CHROMA_444))
>          {
>              if (trDepth == 0 || cu->getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
>                  m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, (subdiv == 0));
> @@ -245,9 +245,9 @@
>
>      uint32_t origTrDepth = trDepth;
>
> -    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      int chFmt           = cu->getChromaFormat();
> -    if ((ttype != TEXT_LUMA) && (trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +    if ((ttype != TEXT_LUMA) && (log2TrSize == 2) && !(chFmt == CHROMA_444))
>      {
>          X265_CHECK(trDepth > 0, "transform size too small\n");
>          trDepth--;
> @@ -267,7 +267,7 @@
>      uint32_t height = cu->getCUSize(0) >> (trDepth + cspy);
>      height = splitIntoSubTUs ? height >> 1 : height;
>      uint32_t coeffOffset = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (cspx + cspy));
> -    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>      coeff_t* coeff = m_qtTempCoeff[ttype][qtLayer] + coeffOffset;
>
>      if (width == height)
> @@ -386,35 +386,32 @@
>  }
>
>  void TEncSearch::xIntraCodingLumaBlk(TComDataCU* cu,
> -                                     uint32_t    trDepth,
>                                       uint32_t    absPartIdx,
> +                                     uint32_t    log2TrSize,
>                                       TComYuv*    fencYuv,
>                                       TComYuv*    predYuv,
>                                       ShortYuv*   resiYuv,
> +                                     uint32_t&   cbf,
>                                       uint32_t&   outDist)
>  {
> -    uint32_t fullDepth    = cu->getDepth(0)  + trDepth;
> -    uint32_t tuSize       = cu->getCUSize(0) >> trDepth;
> +    uint32_t tuSize       = 1 << log2TrSize;
>      uint32_t stride       = fencYuv->getStride();
>      pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
>      pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
>      int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
> -    int      part         = partitionFromSize(tuSize);
> -    int      sizeIdx      = g_convertToBit[tuSize];
> -
> -    uint32_t trSizeLog2     = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -    uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +
> +    uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>      uint32_t coeffOffsetY   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
>      coeff_t* coeff          = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
> -
>      int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
>      X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
>      const uint32_t reconQtStride = MAX_CU_SIZE;
> -
>      uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>      pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>      uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
>      bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
> +    int      part = partitionFromSize(tuSize);
> +    int      sizeIdx = log2TrSize - 2;
>
>      //===== get residual signal =====
>      X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> @@ -430,9 +427,8 @@
>      }
>
>      //--- transform and quantization ---
> -    uint32_t absSum = 0;
> +    uint32_t absSum;
>      int lastPos = -1;
> -    cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
>
>      int chFmt = cu->getChromaFormat();
>      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> @@ -441,7 +437,7 @@
>      absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
>
>      //--- set coded block flag ---
> -    cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
> +    cbf = absSum ? 1 : 0;
>
>      if (absSum)
>      {
> @@ -449,7 +445,7 @@
>          int scalingListType = 0 + TEXT_LUMA;
>          X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
>          m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
> -        X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
> +        X265_CHECK(log2TrSize <= 5, "log2TrSize is too large %d\n", log2TrSize);
>          //===== reconstruction =====
>          primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
>          //===== update distortion =====
> @@ -461,66 +457,48 @@
>          memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
>  #endif
>          //===== reconstruction =====
> -        primitives.luma_copy_ps[part](reconQt,    reconQtStride,    pred, stride);
> -        primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred, stride);
> +        primitives.square_copy_ps[sizeIdx](reconQt,    reconQtStride,    pred, stride);
> +        primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);
>          //===== update distortion =====
>          outDist += primitives.sse_pp[part](pred, stride, fenc, stride);
>      }
>  }
>
>  void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
> -                                       uint32_t    trDepth,
>                                         uint32_t    absPartIdx,
> -                                       uint32_t    absPartIdxStep,
> +                                       uint32_t    log2TrSize,
>                                         TComYuv*    fencYuv,
>                                         TComYuv*    predYuv,
>                                         ShortYuv*   resiYuv,
> +                                       uint32_t&   cbf,
>                                         uint32_t&   outDist,
> -                                       uint32_t    chromaId)
> +                                       uint32_t    chromaId,
> +                                       uint32_t    log2TrSizeC)
>  {
> -    uint32_t fullDepth   = cu->getDepth(0) + trDepth;
> -    uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -    int      chFmt       = cu->getChromaFormat();
> -
> -    uint32_t origTrDepth = trDepth;
> -
> -    if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> -    {
> -        X265_CHECK(trDepth > 0, "trDepth should be non-zero\n");
> -        trDepth--;
> -        uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -        bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> -        bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
> -        if ((!bFirstQ) && (!bSecondQ))
> -        {
> -            return;
> -        }
> -    }
> -
> -    TextType ttype          = (TextType)chromaId;
> -    uint32_t tuSize         = cu->getCUSize(0) >> (trDepth + m_hChromaShift);
> -    uint32_t stride         = fencYuv->getCStride();
> -    pixel*   fenc           = fencYuv->getChromaAddr(chromaId, absPartIdx);
> -    pixel*   pred           = predYuv->getChromaAddr(chromaId, absPartIdx);
> -    int16_t* residual       = resiYuv->getChromaAddr(chromaId, absPartIdx);
> -
> -    uint32_t qtlayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +    TextType ttype        = (TextType)chromaId;
> +    uint32_t tuSize       = 1 << log2TrSizeC;
> +    uint32_t stride       = fencYuv->getCStride();
> +    pixel*   fenc         = fencYuv->getChromaAddr(chromaId, absPartIdx);
> +    pixel*   pred         = predYuv->getChromaAddr(chromaId, absPartIdx);
> +    int16_t* residual     = resiYuv->getChromaAddr(chromaId, absPartIdx);
> +
> +    uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>      uint32_t coeffOffsetC   = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> -    coeff_t* coeff          = m_qtTempCoeff[chromaId][qtlayer] + coeffOffsetC;
> -    int16_t* reconQt        = m_qtTempShortYuv[qtlayer].getChromaAddr(chromaId, absPartIdx);
> -    uint32_t reconQtStride  = m_qtTempShortYuv[qtlayer].m_cwidth;
> +    coeff_t* coeff          = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
> +    int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdx);
> +    uint32_t reconQtStride  = m_qtTempShortYuv[qtLayer].m_cwidth;
>      uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>      pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
>      uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
> -    bool     useTransformSkipChroma = !!cu->getTransformSkip(absPartIdx, ttype);
> +    bool     useTransformSkipC = !!cu->getTransformSkip(absPartIdx, ttype);
>      int      part = partitionFromSize(tuSize);
> -    int      sizeIdx = g_convertToBit[tuSize];
> +    int      sizeIdxC = log2TrSizeC - 2;
>
>      //===== get residual signal =====
>      X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
>      X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
>      X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> -    primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
> +    primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
>
>      //===== transform and quantization =====
>      //--- init rate estimation arrays for RDOQ ---
> @@ -530,9 +508,10 @@
>      }
>
>      //--- transform and quantization ---
> -    uint32_t absSum = 0;
> +    uint32_t absSum;
>      int lastPos = -1;
>
> +    int chFmt = cu->getChromaFormat();
>      int curChromaQpOffset;
>      if (ttype == TEXT_CHROMA_U)
>      {
> @@ -545,10 +524,10 @@
>      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>      m_trQuant->selectLambda(TEXT_CHROMA);
>
> -    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipChroma);
> +    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipC);
>
>      //--- set coded block flag ---
> -    cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdx, absPartIdxStep);
> +    cbf = absSum ? 1 : 0;
>
>      uint32_t dist;
>      if (absSum)
> @@ -556,10 +535,10 @@
>          //--- inverse transform ---
>          int scalingListType = 0 + ttype;
>          X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> -        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);
> -        X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
> +        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
> +        X265_CHECK(log2TrSizeC <= 5, "log2TrSizeC is too large %d\n", log2TrSizeC);
>          //===== reconstruction =====
> -        primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> +        primitives.calcrecon[sizeIdxC](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
>          //===== update distortion =====
>          dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
>      }
> @@ -569,8 +548,8 @@
>          memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
>  #endif
>          //===== reconstruction =====
> -        primitives.square_copy_ps[sizeIdx](reconQt,    reconQtStride,    pred, stride);
> -        primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);
> +        primitives.square_copy_ps[sizeIdxC](reconQt,    reconQtStride,    pred, stride);
> +        primitives.square_copy_pp[sizeIdxC](reconIPred, reconIPredStride, pred, stride);
>          //===== update distortion =====
>          dist = primitives.sse_pp[part](pred, stride, fenc, stride);
>      }
> @@ -597,9 +576,9 @@
>                                       uint64_t&   rdCost)
>  {
>      uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
> -    uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -    bool     bCheckFull  = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> -    bool     bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
> +    uint32_t log2TrSize  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +    bool     bCheckFull  = (log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> +    bool     bCheckSplit = (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>
>      int maxTuSize = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize();
>      int isIntraSlice = (cu->getSlice()->getSliceType() == I_SLICE);
> @@ -610,12 +589,12 @@
>      if (m_cfg->m_param->rdPenalty && !isIntraSlice)
>      {
>          // in addition don't check split if TU size is less or equal to 16x16 TU size for non-intra slice
> -        noSplitIntraMaxTuSize = (trSizeLog2 <= X265_MIN(maxTuSize, 4));
> +        noSplitIntraMaxTuSize = (log2TrSize <= X265_MIN(maxTuSize, 4));
>
>          // if maximum RD-penalty don't check TU size 32x32
>          if (m_cfg->m_param->rdPenalty == 2)
>          {
> -            bCheckFull = (trSizeLog2 <= X265_MIN(maxTuSize, 4));
> +            bCheckFull = (log2TrSize <= X265_MIN(maxTuSize, 4));
>          }
>      }
>      if (bCheckFirst && noSplitIntraMaxTuSize)
> @@ -631,10 +610,10 @@
>
>      if (bCheckFull)
>      {
> -        uint32_t tuSize = 1 << trSizeLog2;
> +        uint32_t tuSize = 1 << log2TrSize;
>
>          bool checkTransformSkip = (cu->getSlice()->getPPS()->getUseTransformSkip() &&
> -                                   trSizeLog2 <= LOG2_MAX_TS_SIZE &&
> +                                   log2TrSize <= LOG2_MAX_TS_SIZE &&
>                                     !cu->getCUTransquantBypass(0));
>          if (checkTransformSkip)
>          {
> @@ -657,6 +636,8 @@
>          //===== get prediction signal =====
>          predIntraLumaAng(lumaPredMode, pred, stride, tuSize);
>
> +        cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
> +
>          if (checkTransformSkip || checkTQbypass)
>          {
>              //----- store original entropy coding status -----
> @@ -680,8 +661,8 @@
>                  }
>
>                  //----- code luma block with given intra prediction mode and store Cbf-----
> -                xIntraCodingLumaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistYTmp);
> -                singleCbfYTmp  = cu->getCbf(absPartIdx, TEXT_LUMA, trDepth);
> +                xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfYTmp, singleDistYTmp);
> +                cu->setCbfSubParts(singleCbfYTmp << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>                  singleTQbypass = cu->getCUTransquantBypass(absPartIdx);
>
>                  if ((modeId == 1) && (singleCbfYTmp == 0) && checkTransformSkip)
> @@ -704,7 +685,7 @@
>                      bestModeId   = modeId;
>                      if (bestModeId == firstCheckId)
>                      {
> -                        xStoreIntraResultQT(cu, trDepth, absPartIdx);
> +                        xStoreIntraResultQT(cu, absPartIdx, log2TrSize);
>                          m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
>                      }
>                  }
> @@ -722,7 +703,7 @@
>
>              if (bestModeId == firstCheckId)
>              {
> -                xLoadIntraResultQT(cu, trDepth, absPartIdx);
> +                xLoadIntraResultQT(cu, absPartIdx, log2TrSize);
>                  cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>                  m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
>              }
> @@ -733,13 +714,11 @@
>
>              //----- code luma block with given intra prediction mode and store Cbf-----
>              cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
> -            xIntraCodingLumaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistY);
> -
> -            if (bCheckSplit)
> -                singleCbfY = cu->getCbf(absPartIdx, TEXT_LUMA, trDepth);
> +            xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfY, singleDistY);
> +            cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
>              uint32_t singleBits = xGetIntraBitsQT(cu, trDepth, absPartIdx, 0, true, false);
> -            if (m_cfg->m_param->rdPenalty && (trSizeLog2 == 5) && !isIntraSlice)
> +            if (m_cfg->m_param->rdPenalty && (log2TrSize == 5) && !isIntraSlice)
>                  singleBits *= 4;
>
>              singleCost = m_rdCost->calcRdCost(singleDistY, singleBits);
> @@ -804,15 +783,16 @@
>          cu->setTransformSkipSubParts(bestModeId, TEXT_LUMA, absPartIdx, fullDepth);
>
>          //--- set reconstruction for next intra prediction blocks ---
> -        uint32_t qtLayer   = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +        uint32_t qtLayer   = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>          uint32_t zorder    = cu->getZorderIdxInCU() + absPartIdx;
> -        int16_t* src       = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
> +        int16_t* reconQt   = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
>          X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
> -        const uint32_t srcstride = MAX_CU_SIZE;
> +        const uint32_t reconQtStride = MAX_CU_SIZE;
> +
>          pixel*   dst       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>          uint32_t dststride = cu->getPic()->getPicYuvRec()->getStride();
> -        int sizeIdx = trSizeLog2 - 2;
> -        primitives.square_copy_sp[sizeIdx](dst, dststride, src, srcstride);
> +        int sizeIdx = log2TrSize - 2;
> +        primitives.square_copy_sp[sizeIdx](dst, dststride, reconQt, reconQtStride);
>      }
>
>      outDistY += singleDistY;
> @@ -828,9 +808,9 @@
>                                               TComYuv*    reconYuv)
>  {
>      uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
> -    uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -    bool     bCheckFull  = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> -    bool     bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
> +    uint32_t log2TrSize  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +    bool     bCheckFull  = (log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> +    bool     bCheckSplit = (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>
>      int maxTuSize = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize();
>      int isIntraSlice = (cu->getSlice()->getSliceType() == I_SLICE);
> @@ -838,7 +818,7 @@
>      if (m_cfg->m_param->rdPenalty == 2 && !isIntraSlice)
>      {
>          // if maximum RD-penalty don't check TU size 32x32
> -        bCheckFull = (trSizeLog2 <= X265_MIN(maxTuSize, 4));
> +        bCheckFull = (log2TrSize <= X265_MIN(maxTuSize, 4));
>      }
>      if (bCheckFull)
>      {
> @@ -846,7 +826,7 @@
>
>          //----- code luma block with given intra prediction mode and store Cbf-----
>          uint32_t lumaPredMode = cu->getLumaIntraDir(absPartIdx);
> -        uint32_t tuSize       = cu->getCUSize(0) >> trDepth;
> +        uint32_t tuSize       = 1 << log2TrSize;
>          int      chFmt        = cu->getChromaFormat();
>          uint32_t stride       = fencYuv->getStride();
>          pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
> @@ -867,17 +847,18 @@
>          //===== get prediction signal =====
>          predIntraLumaAng(lumaPredMode, pred, stride, tuSize);
>
> +        cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
> +
>          //===== get residual signal =====
>          X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
>          X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
>          X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
> -        int sizeIdx = g_convertToBit[tuSize];
> +        int sizeIdx = log2TrSize - 2;
>          primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>          //===== transform and quantization =====
>          uint32_t absSum = 0;
>          int lastPos = -1;
> -        cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
>
>          m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
>          m_trQuant->selectLambda(TEXT_LUMA);
> @@ -941,18 +922,18 @@
>
>      if (trMode == trDepth)
>      {
> -        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -        uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +        uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>
>          //===== copy transform coefficients =====
> -        uint32_t numCoeffY    = 1 << (trSizeLog2 * 2);
> +        uint32_t numCoeffY    = 1 << (log2TrSize * 2);
>          uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> -        coeff_t* coeffSrcY    = m_qtTempCoeff[0][qtlayer] + coeffOffsetY;
> +        coeff_t* coeffSrcY    = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
>          coeff_t* coeffDestY   = cu->getCoeffY()           + coeffOffsetY;
>          ::memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>
>          //===== copy reconstruction =====
> -        m_qtTempShortYuv[qtlayer].copyPartToPartLuma(reconYuv, absPartIdx, 1 << trSizeLog2);
> +        m_qtTempShortYuv[qtLayer].copyPartToPartLuma(reconYuv, absPartIdx, 1 << log2TrSize);
>      }
>      else
>      {
> @@ -964,164 +945,116 @@
>      }
>  }
>
> -void TEncSearch::xStoreIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx)
> +void TEncSearch::xStoreIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize)
>  {
> -    uint32_t fullDepth = cu->getDepth(0) + trDepth;
> -    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -    uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>
>      //===== copy transform coefficients =====
> -    uint32_t numCoeffY    = 1 << (trSizeLog2 * 2);
> +    uint32_t numCoeffY = 1 << (log2TrSize * 2);
>      uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> -    coeff_t* coeffSrcY = m_qtTempCoeff[0][qtlayer] + coeffOffsetY;
> +    coeff_t* coeffSrcY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
>      coeff_t* coeffDstY = m_qtTempTUCoeff[0];
> -
>      ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>
>      //===== copy reconstruction =====
> -    m_qtTempShortYuv[qtlayer].copyPartToPartLuma(&m_qtTempTransformSkipYuv, absPartIdx, 1 << trSizeLog2);
> +    pixel*   reconTs       = m_qtTempTransformSkipYuv.getLumaAddr(absPartIdx);
> +    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getStride();
> +    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
> +    X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
> +    const uint32_t reconQtStride = MAX_CU_SIZE;
> +    int sizeIdx = log2TrSize - 2;
> +    primitives.square_copy_sp[sizeIdx](reconTs, reconTsStride, reconQt, reconQtStride);
>  }
>
> -void TEncSearch::xLoadIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx)
> +void TEncSearch::xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize)
>  {
> -    uint32_t fullDepth = cu->getDepth(0) + trDepth;
> -    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -    uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>
>      //===== copy transform coefficients =====
> -    uint32_t numCoeffY    = 1 << (trSizeLog2 * 2);
> +    uint32_t numCoeffY = 1 << (log2TrSize * 2);
>      uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> -    coeff_t* coeffDstY = m_qtTempCoeff[0][qtlayer] + coeffOffsetY;
> +    coeff_t* coeffDstY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
>      coeff_t* coeffSrcY = m_qtTempTUCoeff[0];
> -
>      ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>
>      //===== copy reconstruction =====
> -    uint32_t trSize = 1 << trSizeLog2;
> -    m_qtTempTransformSkipYuv.copyPartToPartLuma(&m_qtTempShortYuv[qtlayer], absPartIdx, trSize);
> -
> -    uint32_t   zOrder           = cu->getZorderIdxInCU() + absPartIdx;
> -    pixel*     reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zOrder);
> -    uint32_t   reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
> -    int16_t*   reconQt          = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> -    X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
> +    pixel*   reconTs       = m_qtTempTransformSkipYuv.getLumaAddr(absPartIdx);
> +    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getStride();
> +    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
> +    X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
>      const uint32_t reconQtStride = MAX_CU_SIZE;
> -    int sizeIdx = trSizeLog2 - 2;
> -    primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride, reconQt, reconQtStride);
> +    int sizeIdx = log2TrSize - 2;
> +    primitives.square_copy_ps[sizeIdx](reconQt, reconQtStride, reconTs, reconTsStride);
> +
> +    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +    pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
> +    uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
> +    primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, reconTs, reconTsStride);
>  }
>
> -void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
> -{
> -    assert(chromaId == 1 || chromaId == 2);
> -
> -    uint32_t fullDepth = cu->getDepth(0) + trDepth;
> -    uint32_t trMode    = cu->getTransformIdx(absPartIdx);
> -
> -    if (trMode == trDepth)
> -    {
> -        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -        uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -        int      chFmt      = cu->getChromaFormat();
> -
> -        bool bChromaSame = false;
> -        if (trSizeLog2 == 2 && !(chFmt == CHROMA_444))
> -        {
> -            X265_CHECK(trDepth > 0, "invalid trDepth\n");
> -            trDepth--;
> -            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> -            bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
> -            if ((!bFirstQ) && (!bSecondQ))
> -            {
> -                return;
> -            }
> -            bChromaSame = true;
> -        }
> -        uint32_t width  = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> -        uint32_t height = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> -        height = splitIntoSubTUs ? height >> 1 : height;
> -        uint32_t numCoeffC = width * height;
> -        uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> -
> -        coeff_t* coeffSrc = m_qtTempCoeff[chromaId][qtlayer] + coeffOffsetC;
> -        coeff_t* coeffDst = m_qtTempTUCoeff[chromaId];
> -        ::memcpy(coeffDst, coeffSrc, sizeof(coeff_t) * numCoeffC);
> -
> -        //===== copy reconstruction =====
> -        uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
> -        m_qtTempShortYuv[qtlayer].copyPartToPartYuvChroma(&m_qtTempTransformSkipYuv, absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
> -    }
> -}
> -
> -void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId)
> +void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId)
>  {
>      X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
>
> -    uint32_t fullDepth = cu->getDepth(0) + trDepth;
> -    uint32_t trMode    = cu->getTransformIdx(absPartIdx);
> -
> -    if (trMode == trDepth)
> -    {
> -        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -        uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -        int      chFmt      = cu->getChromaFormat();
> -        const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> -
> -        uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> -        bool bChromaSame = false;
> -        if (trSizeLog2 == 2 && !(chFmt == CHROMA_444))
> -        {
> -            X265_CHECK(trDepth > 0, "invalid trDepth\n");
> -            trDepth--;
> -            trSizeCLog2++;
> -            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> -            bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
> -            if ((!bFirstQ) && (!bSecondQ))
> -            {
> -                return;
> -            }
> -            bChromaSame = true;
> -        }
> -
> -        //===== copy transform coefficients =====
> -        uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);
> -        uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> -
> -        coeff_t* coeffDst = m_qtTempCoeff[chromaId][qtlayer] + coeffOffsetC;
> -        coeff_t* coeffSrc = m_qtTempTUCoeff[chromaId];
> -        ::memcpy(coeffDst, coeffSrc, sizeof(coeff_t) * numCoeffC);
> -
> -        //===== copy reconstruction =====
> -        uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
> -        m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
> -
> -        uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> -        uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
> -
> -        pixel* reconIPred = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
> -        int16_t* reconQt  = m_qtTempShortYuv[qtlayer].getChromaAddr(chromaId, absPartIdx);
> -        uint32_t reconQtStride    = m_qtTempShortYuv[qtlayer].m_cwidth;
> -        int sizeIdxC = trSizeCLog2 - 2;
> -        primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride, reconQt, reconQtStride);
> -    }
> +    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
> +
> +    //===== copy transform coefficients =====
> +    uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
> +    uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +    coeff_t* coeffSrcC = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
> +    coeff_t* coeffDstC = m_qtTempTUCoeff[chromaId];
> +    ::memcpy(coeffDstC, coeffSrcC, sizeof(coeff_t) * numCoeffC);
> +
> +    //===== copy reconstruction =====
> +    pixel*   reconTs       = m_qtTempTransformSkipYuv.getChromaAddr(chromaId, absPartIdx);
> +    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getCStride();
> +    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdx);
> +    uint32_t reconQtStride = m_qtTempShortYuv[qtLayer].m_cwidth;
> +    int sizeIdxC = log2TrSizeC - 2;
> +    primitives.square_copy_sp[sizeIdxC](reconTs, reconTsStride, reconQt, reconQtStride);
> +}
> +
> +void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId)
> +{
> +    X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
> +
> +    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
> +
> +    //===== copy transform coefficients =====
> +    uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
> +    uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +    coeff_t* coeffDstC = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
> +    coeff_t* coeffSrcC = m_qtTempTUCoeff[chromaId];
> +    ::memcpy(coeffDstC, coeffSrcC, sizeof(coeff_t) * numCoeffC);
> +
> +    //===== copy reconstruction =====
> +    pixel*   reconTs       = m_qtTempTransformSkipYuv.getChromaAddr(chromaId, absPartIdx);
> +    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getCStride();
> +    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdx);
> +    uint32_t reconQtStride = m_qtTempShortYuv[qtLayer].m_cwidth;
> +    int sizeIdxC = log2TrSizeC - 2;
> +    primitives.square_copy_ps[sizeIdxC](reconQt, reconQtStride, reconTs, reconTsStride);
> +
> +    uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> +    pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
> +    uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
> +    primitives.square_copy_pp[sizeIdxC](reconIPred, reconIPredStride, reconTs, reconTsStride);
>  }
>
>  void TEncSearch::offsetSubTUCBFs(TComDataCU* cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
>  {
>      uint32_t depth = cu->getDepth(0);
>      uint32_t fullDepth = depth + trDepth;
> -    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -
> -    uint32_t actualTrDepth = trDepth;
> -
> -    if ((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
> +    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +
> +    uint32_t trDepthC = trDepth;
> +    if ((log2TrSize == 2) && !(cu->getChromaFormat() == CHROMA_444))
>      {
> -        X265_CHECK(actualTrDepth > 0, "actualTrDepth invalid\n");
> -        actualTrDepth--;
> +        X265_CHECK(trDepthC > 0, "trDepthC invalid\n");
> +        trDepthC--;
>      }
>
> -    uint32_t partIdxesPerSubTU     = (cu->getPic()->getNumPartInCU() >> ((depth + actualTrDepth) << 1)) >> 1;
> +    uint32_t partIdxesPerSubTU     = (cu->getPic()->getNumPartInCU() >> ((depth + trDepthC) << 1)) >> 1;
>
>      //move the CBFs down a level and set the parent CBF
>      uint8_t subTUCBF[2];
> @@ -1158,15 +1091,16 @@
>      if (trMode == trDepth)
>      {
>          int chFmt = cu->getChromaFormat();
> -        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -        uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> -        uint32_t actualTrDepth = trDepth;
> -        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> +
> +        uint32_t trDepthC = trDepth;
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
>              X265_CHECK(trDepth > 0, "invalid trDepth\n");
> -            actualTrDepth--;
> -            trSizeCLog2++;
> -            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> +            trDepthC--;
> +            log2TrSizeC++;
> +            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepthC) << 1);
>              bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
>              if (!bFirstQ)
>              {
> @@ -1174,12 +1108,12 @@
>              }
>          }
>
> -        uint32_t tuSize = cu->getCUSize(0) >> (actualTrDepth + m_hChromaShift);
> +        uint32_t tuSize = 1 << log2TrSizeC;
>          uint32_t stride = fencYuv->getCStride();
>          const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>
>          bool checkTransformSkip = (cu->getSlice()->getPPS()->getUseTransformSkip() &&
> -                                   trSizeCLog2 <= LOG2_MAX_TS_SIZE &&
> +                                   log2TrSizeC <= LOG2_MAX_TS_SIZE &&
>                                     !cu->getCUTransquantBypass(0));
>
>          if (m_cfg->m_param->bEnableTSkipFast)
> @@ -1200,7 +1134,7 @@
>          for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
>          {
>              TComTURecurse tuIterator;
> -            uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) +  actualTrDepth) << 1);
> +            uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) +  trDepthC) << 1);
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx);
>
>              do
> @@ -1209,7 +1143,7 @@
>                  pixel*   pred        = predYuv->getChromaAddr(chromaId, absPartIdxC);
>
>                  //===== init availability pattern =====
> -                TComPattern::initAdiPatternChroma(cu, absPartIdxC, actualTrDepth, m_predBuf, chromaId);
> +                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId);
>                  pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
>
>                  uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdxC);
> @@ -1223,6 +1157,8 @@
>                  //===== get prediction signal =====
>                  predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, tuSize, chFmt);
>
> +                uint32_t singleCbfC     = 0;
> +
>                  if (checkTransformSkip)
>                  {
>                      // use RDO to decide whether Cr/Cb takes TS
> @@ -1231,7 +1167,6 @@
>                      uint64_t singleCost     = MAX_INT64;
>                      int      bestModeId     = 0;
>                      uint32_t singleDistC    = 0;
> -                    uint32_t singleCbfC     = 0;
>                      uint32_t singleDistCTmp = 0;
>                      uint64_t singleCostTmp  = 0;
>                      uint32_t singleCbfCTmp  = 0;
> @@ -1243,9 +1178,8 @@
>                          cu->setTransformSkipPartRange(chromaModeId, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                          singleDistCTmp = 0;
> -                        xIntraCodingChromaBlk(cu, trDepth, absPartIdxC, tuIterator.m_absPartIdxStep, fencYuv, predYuv, resiYuv, singleDistCTmp, chromaId);
> -
> -                        singleCbfCTmp = cu->getCbf(absPartIdxC, (TextType)chromaId, trDepth);
> +                        xIntraCodingChromaBlk(cu, absPartIdxC, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfCTmp, singleDistCTmp, chromaId, log2TrSizeC);
> +                        cu->setCbfPartRange(singleCbfCTmp << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                          if (chromaModeId == 1 && singleCbfCTmp == 0)
>                          {
> @@ -1267,7 +1201,7 @@
>
>                              if (bestModeId == firstCheckId)
>                              {
> -                                xStoreIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
> +                                xStoreIntraResultChromaQT(cu, absPartIdxC, log2TrSize, log2TrSizeC, chromaId);
>                                  m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
>                              }
>                          }
> @@ -1279,7 +1213,7 @@
>
>                      if (bestModeId == firstCheckId)
>                      {
> -                        xLoadIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId);
> +                        xLoadIntraResultChromaQT(cu, absPartIdxC, log2TrSize, log2TrSizeC, chromaId);
>                          cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                          m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
> @@ -1297,7 +1231,8 @@
>                  else
>                  {
>                      cu->setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                    xIntraCodingChromaBlk(cu, trDepth, absPartIdxC, tuIterator.m_absPartIdxStep, fencYuv, predYuv, resiYuv, outDist, chromaId);
> +                    xIntraCodingChromaBlk(cu, absPartIdxC, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfC, outDist, chromaId, log2TrSizeC);
> +                    cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  }
>              }
>              while (isNextSection(&tuIterator));
> @@ -1337,14 +1272,16 @@
>      if (trMode == trDepth)
>      {
>          int      chFmt      = cu->getChromaFormat();
> -        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> -        uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> +        uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>
>          bool bChromaSame = false;
> -        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
>              X265_CHECK(trDepth > 0, "invalid trDepth\n");
>              trDepth--;
> +            log2TrSizeC++;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
>              if ((absPartIdx & (qpdiv - 1)) != 0)
>              {
> @@ -1355,20 +1292,18 @@
>
>          //===== copy transform coefficients =====
>
> -        uint32_t width     = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> -        uint32_t height    = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> -        uint32_t numCoeffC = width * height;
> +        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (chFmt == CHROMA_422));
>          uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
>
> -        coeff_t* coeffSrcU = m_qtTempCoeff[1][qtlayer] + coeffOffsetC;
> -        coeff_t* coeffSrcV = m_qtTempCoeff[2][qtlayer] + coeffOffsetC;
> +        coeff_t* coeffSrcU = m_qtTempCoeff[1][qtLayer] + coeffOffsetC;
> +        coeff_t* coeffSrcV = m_qtTempCoeff[2][qtLayer] + coeffOffsetC;
>          coeff_t* coeffDstU = cu->getCoeffCb()          + coeffOffsetC;
>          coeff_t* coeffDstV = cu->getCoeffCr()          + coeffOffsetC;
>          ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
>          ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
>
>          //===== copy reconstruction =====
> -        m_qtTempShortYuv[qtlayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << trSizeLog2, (bChromaSame && (chFmt != CHROMA_422)));
> +        m_qtTempShortYuv[qtLayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << log2TrSize, (bChromaSame && (chFmt != CHROMA_422)));
>      }
>      else
>      {
> @@ -1394,14 +1329,16 @@
>      if (trMode == trDepth)
>      {
>          int      chFmt     = cu->getChromaFormat();
> -        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> +        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
>          uint32_t origTrDepth = trDepth;
> -        uint32_t actualTrDepth = trDepth;
> -        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        uint32_t trDepthC = trDepth;
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
>              X265_CHECK(trDepth > 0, "invalid trDepth\n");
> -            actualTrDepth--;
> -            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> +            trDepthC--;
> +            log2TrSizeC++;
> +            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepthC) << 1);
>              bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
>              if (!bFirstQ)
>              {
> @@ -1409,16 +1346,16 @@
>              }
>          }
>
> -        uint32_t tuSize = cu->getCUSize(0) >> (actualTrDepth + m_hChromaShift);
> +        uint32_t tuSize = 1 << log2TrSizeC;
>          uint32_t stride = fencYuv->getCStride();
>          const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> -        int sizeIdx = g_convertToBit[tuSize];
> +        int sizeIdxC = log2TrSizeC - 2;
>          int part = partitionFromSize(tuSize);
>
>          for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
>          {
>              TComTURecurse tuIterator;
> -            uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> +            uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepthC) << 1);
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx);
>
>              do
> @@ -1436,8 +1373,8 @@
>                  pixel*   reconIPred     = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
>                  uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
>
> -                //bool     useTransformSkipChroma = cu->getTransformSkip(absPartIdxC, ttype);
> -                const bool useTransformSkipChroma = false;
> +                //bool     useTransformSkipC = cu->getTransformSkip(absPartIdxC, ttype);
> +                const bool useTransformSkipC = false;
>                  cu->setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                  uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdxC);
> @@ -1449,7 +1386,7 @@
>                  }
>                  chromaPredMode = (chFmt == CHROMA_422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
>                  //===== init availability pattern =====
> -                TComPattern::initAdiPatternChroma(cu, absPartIdxC, actualTrDepth, m_predBuf, chromaId);
> +                TComPattern::initAdiPatternChroma(cu, absPartIdxC, trDepthC, m_predBuf, chromaId);
>                  pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
>
>                  //===== get prediction signal =====
> @@ -1459,7 +1396,7 @@
>                  X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
>                  X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
>                  X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
> -                primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
> +                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
>
>                  //--- transform and quantization ---
>                  uint32_t absSum = 0;
> @@ -1478,7 +1415,7 @@
>
>                  m_trQuant->selectLambda(TEXT_CHROMA);
>
> -                absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdxC, &lastPos, useTransformSkipChroma);
> +                absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdxC, &lastPos, useTransformSkipC);
>
>                  //--- set coded block flag ---
>                  cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -1488,12 +1425,12 @@
>                      //--- inverse transform ---
>                      int scalingListType = 0 + ttype;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
>
>                      //===== reconstruction =====
>                      // use square primitives
>                      primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);
> -                    primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, recon, stride);
> +                    primitives.square_copy_pp[sizeIdxC](reconIPred, reconIPredStride, recon, stride);
>                  }
>                  else
>                  {
> @@ -1502,8 +1439,8 @@
>  #endif
>
>                      //===== reconstruction =====
> -                    primitives.square_copy_pp[sizeIdx](recon,      stride,           pred, stride);
> -                    primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);
> +                    primitives.square_copy_pp[sizeIdxC](recon,      stride,           pred, stride);
> +                    primitives.square_copy_pp[sizeIdxC](reconIPred, reconIPredStride, pred, stride);
>                  }
>              }
>              while (isNextSection(&tuIterator));
> @@ -2840,17 +2777,17 @@
>  {
>      X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "invalid depth\n");
>      const uint32_t trMode = depth - cu->getDepth(0);
> -    const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> +    const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      const uint32_t setCbf     = 1 << trMode;
>      int chFmt                 = cu->getChromaFormat();
>
>      bool bSplitFlag = ((cu->getSlice()->getSPS()->getQuadtreeTUMaxDepthInter() == 1) && cu->getPredictionMode(absPartIdx) == MODE_INTER && (cu->getPartitionSize(absPartIdx) != SIZE_2Nx2N));
>      bool bCheckFull;
> -    if (bSplitFlag && depth == cu->getDepth(absPartIdx) && (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx)))
> +    if (bSplitFlag && depth == cu->getDepth(absPartIdx) && (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx)))
>          bCheckFull = false;
>      else
> -        bCheckFull = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> -    const bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
> +        bCheckFull = (log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> +    const bool bCheckSplit = (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>      X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
>      // code full block
> @@ -2858,12 +2795,12 @@
>      int lastPosY = -1, lastPosU = -1, lastPosV = -1;
>      if (bCheckFull)
>      {
> -        uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> +        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
>          bool bCodeChroma = true;
>          uint32_t trModeC = trMode;
> -        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
> -            trSizeCLog2++;
> +            log2TrSizeC++;
>              trModeC--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
>              bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> @@ -2878,10 +2815,10 @@
>          coeff_t *coeffCurU = cu->getCoeffCb() + coeffOffsetC;
>          coeff_t *coeffCurV = cu->getCoeffCr() + coeffOffsetC;
>
> -        uint32_t trSize   = 1 << trSizeLog2;
> -        uint32_t trSizeC  = 1 << trSizeCLog2;
> -        uint32_t sizeIdx  = trSizeLog2  - 2;
> -        uint32_t sizeIdxC = trSizeCLog2 - 2;
> +        uint32_t trSize   = 1 << log2TrSize;
> +        uint32_t trSizeC  = 1 << log2TrSizeC;
> +        uint32_t sizeIdx  = log2TrSize  - 2;
> +        uint32_t sizeIdxC = log2TrSizeC - 2;
>          cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
>
>          cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
> @@ -3026,26 +2963,26 @@
>  {
>      X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "depth not matching\n");
>      const uint32_t trMode = depth - cu->getDepth(0);
> -    const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> +    const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      const uint32_t subTUDepth = trMode + 1;
>      const uint32_t setCbf     = 1 << trMode;
>      int chFmt                 = cu->getChromaFormat();
>
>      bool bSplitFlag = ((cu->getSlice()->getSPS()->getQuadtreeTUMaxDepthInter() == 1) && cu->getPredictionMode(absPartIdx) == MODE_INTER && (cu->getPartitionSize(absPartIdx) != SIZE_2Nx2N));
>      bool bCheckFull;
> -    if (bSplitFlag && depth == cu->getDepth(absPartIdx) && (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx)))
> +    if (bSplitFlag && depth == cu->getDepth(absPartIdx) && (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx)))
>          bCheckFull = false;
>      else
> -        bCheckFull = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> -    const bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
> +        bCheckFull = (log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> +    const bool bCheckSplit = (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>      X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
> -    uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> +    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
>      bool bCodeChroma = true;
>      uint32_t trModeC = trMode;
> -    if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +    if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>      {
> -        trSizeCLog2++;
> +        log2TrSizeC++;
>          trModeC--;
>          uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
>          bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> @@ -3066,27 +3003,27 @@
>      uint32_t bestsubTUCBF[MAX_NUM_COMPONENT][2];
>      m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
>
> -    uint32_t trSize = 1 << trSizeLog2;
> +    uint32_t trSize = 1 << log2TrSize;
>      const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>      uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) +  trModeC) << 1);
>
>      // code full block
>      if (bCheckFull)
>      {
> -        uint32_t trSizeC = 1 << trSizeCLog2;
> -        int sizeIdx  = trSizeLog2 - 2;
> -        int sizeIdxC = trSizeCLog2 - 2;
> -        const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +        uint32_t trSizeC = 1 << log2TrSizeC;
> +        int sizeIdx  = log2TrSize - 2;
> +        int sizeIdxC = log2TrSizeC - 2;
> +        const uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>          uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
>          uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> -        coeff_t *coeffCurY = m_qtTempCoeff[0][qtlayer] + coeffOffsetY;
> -        coeff_t *coeffCurU = m_qtTempCoeff[1][qtlayer] + coeffOffsetC;
> -        coeff_t *coeffCurV = m_qtTempCoeff[2][qtlayer] + coeffOffsetC;
> +        coeff_t *coeffCurY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
> +        coeff_t *coeffCurU = m_qtTempCoeff[1][qtLayer] + coeffOffsetC;
> +        coeff_t *coeffCurV = m_qtTempCoeff[2][qtLayer] + coeffOffsetC;
>
>          cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
>          bool checkTransformSkip   = cu->getSlice()->getPPS()->getUseTransformSkip() && !cu->getCUTransquantBypass(0);
> -        bool checkTransformSkipY  = checkTransformSkip && trSizeLog2  <= LOG2_MAX_TS_SIZE;
> -        bool checkTransformSkipUV = checkTransformSkip && trSizeCLog2 <= LOG2_MAX_TS_SIZE;
> +        bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= LOG2_MAX_TS_SIZE;
> +        bool checkTransformSkipUV = checkTransformSkip && log2TrSizeC <= LOG2_MAX_TS_SIZE;
>
>          cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -3161,8 +3098,8 @@
>              while (isNextSection(&tuIterator));
>          }
>
> -        const uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> -        const uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);
> +        const uint32_t numCoeffY = 1 << (log2TrSize * 2);
> +        const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
>
>          for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
>          {
> @@ -3173,10 +3110,10 @@
>
>          int partSize = partitionFromSize(trSize);
>          uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);
> -        int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> -        X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
> +        int16_t *curResiY = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
> +        X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width not full CU\n");
>          const uint32_t strideResiY = MAX_CU_SIZE;
> -        const uint32_t strideResiC = m_qtTempShortYuv[qtlayer].m_cwidth;
> +        const uint32_t strideResiC = m_qtTempShortYuv[qtLayer].m_cwidth;
>
>          if (outZeroDist)
>          {
> @@ -3253,8 +3190,8 @@
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
>                  uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
> -                int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> -                int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> +                int16_t *curResiU = m_qtTempShortYuv[qtLayer].getCbAddr(absPartIdxC);
> +                int16_t *curResiV = m_qtTempShortYuv[qtLayer].getCrAddr(absPartIdxC);
>
>                  distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
>
> @@ -3399,11 +3336,8 @@
>              uint32_t nonZeroDistY = 0, absSumTransformSkipY;
>              uint64_t singleCostY = MAX_INT64;
>
> -            coeff_t bestCoeffY[MAX_TS_SIZE * MAX_TS_SIZE];
> -            memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numCoeffY);
> -
> -            int16_t bestResiY[MAX_TS_SIZE * MAX_TS_SIZE];
> -            primitives.square_copy_ss[sizeIdx](bestResiY, trSize, curResiY, strideResiY);
> +            ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
> +            ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);
>
>              m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
>
> @@ -3417,7 +3351,7 @@
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
>
>              m_trQuant->selectLambda(TEXT_LUMA);
> -            absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> +            absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
>                                                             trSize, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
>              cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -3425,7 +3359,7 @@
>              {
>                  m_entropyCoder->resetBits();
>                  m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
> -                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
> +                m_entropyCoder->encodeCoeffNxN(cu, tsCoeffY, absPartIdx, trSize, TEXT_LUMA);
>                  const uint32_t skipSingleBitsY = m_entropyCoder->getNumberOfWrittenBits();
>
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> @@ -3433,10 +3367,10 @@
>                  int scalingListType = 3 + TEXT_LUMA;
>                  X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>
> -                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
> +                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, tsResiY, trSize, tsCoeffY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
>
>                  nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
> -                                                           curResiY, strideResiY);
> +                                                           tsResiY, trSize);
>
>                  singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
>              }
> @@ -3444,14 +3378,14 @@
>              if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)
>              {
>                  cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
> -                memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numCoeffY);
> -                primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, bestResiY, trSize);
>              }
>              else
>              {
>                  singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
>                  absSum[TEXT_LUMA][0] = absSumTransformSkipY;
>                  bestTransformMode[TEXT_LUMA][0] = 1;
> +                memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
> +                primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, tsResiY, trSize);
>              }
>
>              cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> @@ -3475,16 +3409,13 @@
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
>                  uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
> -                int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> -                int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> -
> -                coeff_t bestCoeffU[MAX_TS_SIZE * MAX_TS_SIZE], bestCoeffV[MAX_TS_SIZE * MAX_TS_SIZE];
> -                memcpy(bestCoeffU, coeffCurU + subTUBufferOffset, sizeof(coeff_t) * numCoeffC);
> -                memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numCoeffC);
> -
> -                int16_t bestResiU[MAX_TS_SIZE * MAX_TS_SIZE], bestResiV[MAX_TS_SIZE * MAX_TS_SIZE];
> -                primitives.square_copy_ss[sizeIdxC](bestResiU, trSizeC, curResiU, strideResiC);
> -                primitives.square_copy_ss[sizeIdxC](bestResiV, trSizeC, curResiV, strideResiC);
> +                int16_t *curResiU = m_qtTempShortYuv[qtLayer].getCbAddr(absPartIdxC);
> +                int16_t *curResiV = m_qtTempShortYuv[qtLayer].getCrAddr(absPartIdxC);
> +
> +                ALIGN_VAR_32(coeff_t, tsCoeffU[MAX_TS_SIZE * MAX_TS_SIZE]);
> +                ALIGN_VAR_32(int16_t, tsResiU[MAX_TS_SIZE * MAX_TS_SIZE]);
> +                ALIGN_VAR_32(coeff_t, tsCoeffV[MAX_TS_SIZE * MAX_TS_SIZE]);
> +                ALIGN_VAR_32(int16_t, tsResiV[MAX_TS_SIZE * MAX_TS_SIZE]);
>
>                  cu->setTransformSkipPartRange(1, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setTransformSkipPartRange(1, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3498,11 +3429,11 @@
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>                  m_trQuant->selectLambda(TEXT_CHROMA);
>
> -                absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> +                absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
>                                                                 trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> +                absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
>                                                                 trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
>
>                  cu->setCbfPartRange(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3514,7 +3445,7 @@
>                  if (absSumTransformSkipU)
>                  {
>                      m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_U, trMode, true);
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_U);
> +                    m_entropyCoder->encodeCoeffNxN(cu, tsCoeffU, absPartIdxC, trSizeC, TEXT_CHROMA_U);
>                      singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits();
>
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3522,10 +3453,10 @@
>
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiU, trSizeC, tsCoeffU,
>                                                 trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
> -                                                                 curResiU, strideResiC);
> +                                                                 tsResiU, trSizeC);
>                      nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
>                      singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
>                  }
> @@ -3533,21 +3464,20 @@
>                  if (!absSumTransformSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
>                  {
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -
> -                    memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numCoeffC);
> -                    primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, bestResiU, trSizeC);
>                  }
>                  else
>                  {
>                      singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroDistU;
>                      absSum[TEXT_CHROMA_U][tuIterator.m_section] = absSumTransformSkipU;
>                      bestTransformMode[TEXT_CHROMA_U][tuIterator.m_section] = 1;
> +                    memcpy(coeffCurU + subTUBufferOffset, tsCoeffU, sizeof(coeff_t) * numCoeffC);
> +                    primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, tsResiU, trSizeC);
>                  }
>
>                  if (absSumTransformSkipV)
>                  {
>                      m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_V, trMode, true);
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_V);
> +                    m_entropyCoder->encodeCoeffNxN(cu, tsCoeffV, absPartIdxC, trSizeC, TEXT_CHROMA_V);
>                      singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section];
>
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> @@ -3555,10 +3485,10 @@
>
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiV, trSizeC, tsCoeffV,
>                                                 trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
> -                                                                 curResiV, strideResiC);
> +                                                                 tsResiV, trSizeC);
>                      nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
>                      singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
>                  }
> @@ -3566,15 +3496,14 @@
>                  if (!absSumTransformSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
>                  {
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> -
> -                    memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numCoeffC);
> -                    primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, bestResiV, trSizeC);
>                  }
>                  else
>                  {
>                      singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroDistV;
>                      absSum[TEXT_CHROMA_V][tuIterator.m_section] = absSumTransformSkipV;
>                      bestTransformMode[TEXT_CHROMA_V][tuIterator.m_section] = 1;
> +                    memcpy(coeffCurV + subTUBufferOffset, tsCoeffV, sizeof(coeff_t) * numCoeffC);
> +                    primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, tsResiV, trSizeC);
>                  }
>
>                  cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3588,9 +3517,9 @@
>
>          m_entropyCoder->resetBits();
>
> -        if (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
> +        if (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
>          {
> -            m_entropyCoder->encodeTransformSubdivFlag(0, 5 - trSizeLog2);
> +            m_entropyCoder->encodeTransformSubdivFlag(0, 5 - log2TrSize);
>          }
>
>          if (bCodeChroma)
> @@ -3793,21 +3722,21 @@
>      const uint32_t curTrMode   = depth - cu->getDepth(0);
>      const uint32_t trMode      = cu->getTransformIdx(absPartIdx);
>      const bool     bSubdiv     = curTrMode != trMode;
> -    const uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> -    uint32_t       trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> +    const uint32_t log2TrSize  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> +    uint32_t       log2TrSizeC = log2TrSize - m_hChromaShift;
>      int            chFmt       = cu->getChromaFormat();
>      const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>
> -    if (bSubdivAndCbf && trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() && trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
> +    if (bSubdivAndCbf && log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() && log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
>      {
> -        m_entropyCoder->encodeTransformSubdivFlag(bSubdiv, 5 - trSizeLog2);
> +        m_entropyCoder->encodeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
>      }
>
>      X265_CHECK(cu->getPredictionMode(absPartIdx) != MODE_INTRA, "xEncodeResidualQT() with intra block\n");
>
>      bool mCodeAll = true;
> -    uint32_t trSize    = 1 << trSizeLog2;
> -    uint32_t trWidthC  = 1 << trSizeCLog2;
> +    uint32_t trSize    = 1 << log2TrSize;
> +    uint32_t trWidthC  = 1 << log2TrSizeC;
>      uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
>
>      const uint32_t numPels = trWidthC * trHeightC;
> @@ -3841,16 +3770,16 @@
>      if (!bSubdiv)
>      {
>          //Luma
> -        const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> +        const uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
>          uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> -        coeff_t *coeffCurY = m_qtTempCoeff[0][qtlayer] + coeffOffsetY;
> +        coeff_t *coeffCurY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
>
>          //Chroma
>          bool bCodeChroma = true;
>          uint32_t trModeC = trMode;
> -        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
> -            trSizeCLog2++;
> +            log2TrSizeC++;
>              trModeC--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
>              bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> @@ -3869,9 +3798,9 @@
>              if (bCodeChroma)
>              {
>                  uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> -                coeff_t *coeffCurU = m_qtTempCoeff[1][qtlayer] + coeffOffsetC;
> -                coeff_t *coeffCurV = m_qtTempCoeff[2][qtlayer] + coeffOffsetC;
> -                uint32_t trSizeC = 1 << trSizeCLog2;
> +                coeff_t *coeffCurU = m_qtTempCoeff[1][qtLayer] + coeffOffsetC;
> +                coeff_t *coeffCurV = m_qtTempCoeff[2][qtLayer] + coeffOffsetC;
> +                uint32_t trSizeC = 1 << log2TrSizeC;
>
>                  if (!splitIntoSubTUs)
>                  {
> @@ -3928,16 +3857,16 @@
>      if (curTrMode == trMode)
>      {
>          int            chFmt      = cu->getChromaFormat();
> -        const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> -        const uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -
> -        uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> +        const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> +        const uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
> +
> +        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
>          bool bCodeChroma = true;
>          bool bChromaSame = false;
>          uint32_t trModeC = trMode;
> -        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
>          {
> -            trSizeCLog2++;
> +            log2TrSizeC++;
>              trModeC--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
>              bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> @@ -3946,28 +3875,28 @@
>
>          if (bSpatial)
>          {
> -            uint32_t trSize = 1 << trSizeLog2;
> -            m_qtTempShortYuv[qtlayer].copyPartToPartLuma(resiYuv, absPartIdx, trSize);
> +            uint32_t trSize = 1 << log2TrSize;
> +            m_qtTempShortYuv[qtLayer].copyPartToPartLuma(resiYuv, absPartIdx, trSize);
>
>              if (bCodeChroma)
>              {
> -                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, trSize, (bChromaSame && (chFmt != CHROMA_422)));
> +                m_qtTempShortYuv[qtLayer].copyPartToPartChroma(resiYuv, absPartIdx, trSize, (bChromaSame && (chFmt != CHROMA_422)));
>              }
>          }
>          else
>          {
> -            uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> +            uint32_t numCoeffY = 1 << (log2TrSize * 2);
>              uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> -            coeff_t* coeffSrcY = m_qtTempCoeff[0][qtlayer] + coeffOffsetY;
> +            coeff_t* coeffSrcY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
>              coeff_t* coeffDstY = cu->getCoeffY()           + coeffOffsetY;
>              ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>              if (bCodeChroma)
>              {
> -                uint32_t numCoeffC = 1 << (trSizeCLog2 * 2 + (chFmt == CHROMA_422));
> +                uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (chFmt == CHROMA_422));
>                  uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
>
> -                coeff_t* coeffSrcU = m_qtTempCoeff[1][qtlayer] + coeffOffsetC;
> -                coeff_t* coeffSrcV = m_qtTempCoeff[2][qtlayer] + coeffOffsetC;
> +                coeff_t* coeffSrcU = m_qtTempCoeff[1][qtLayer] + coeffOffsetC;
> +                coeff_t* coeffSrcV = m_qtTempCoeff[2][qtLayer] + coeffOffsetC;
>                  coeff_t* coeffDstU = cu->getCoeffCb()          + coeffOffsetC;
>                  coeff_t* coeffDstV = cu->getCoeffCr()          + coeffOffsetC;
>                  ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
> diff -r 0cbc7320c9f2 -r b6302b087ea4 source/Lib/TLibEncoder/TEncSearch.h
> --- a/source/Lib/TLibEncoder/TEncSearch.h       Mon Jun 09 11:34:11 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.h       Tue Jun 10 18:54:35 2014 +0900
> @@ -207,11 +207,11 @@
>      void xEncIntraHeader(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, bool bLuma, bool bChroma);
>      uint32_t xGetIntraBitsQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, bool bLuma, bool bChroma);
>      uint32_t xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
> -    void xIntraCodingLumaBlk(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
> -                             ShortYuv* resiYuv, uint32_t& outDist);
> +    void xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
> +                             ShortYuv* resiYuv, uint32_t& cbf, uint32_t& outDist);
>
> -    void xIntraCodingChromaBlk(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, TComYuv* fencYuv, TComYuv* predYuv,
> -                               ShortYuv* resiYuv, uint32_t& outDist, uint32_t chromaId);
> +    void xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
> +                               ShortYuv* resiYuv, uint32_t& cbf, uint32_t& outDist, uint32_t chromaId, uint32_t log2TrSizeC);
>
>      void xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
>                                     TComYuv* predYuv, ShortYuv* resiYuv, uint32_t& outDist);
> @@ -223,10 +223,10 @@
>
>      void xSetIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* reconYuv);
>
> -    void xStoreIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx);
> -    void xLoadIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx);
> -    void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
> -    void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId);
> +    void xStoreIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize);
> +    void xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize);
> +    void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId);
> +    void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId);
>
>      // --------------------------------------------------------------------------------------------
>      // Inter search (AMP)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel


-- 
Steve Borho