[x265] refine block size related

Fri May 23 16:12:52 CEST 2014

On Thu, May 22, 2014 at 11:37 PM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1400819691 -32400
> #      Fri May 23 13:34:51 2014 +0900
> # Node ID 085be1ffd4a9752f64f8422e404985527e890921
> # Parent  5134e76aa729b6fece18701fdc00390c2f2ffb32
> refine block size related

nice!

staged for regression testing

> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComBitStream.cpp
> --- a/source/Lib/TLibCommon/TComBitStream.cpp   Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComBitStream.cpp   Fri May 23 13:34:51 2014 +0900
> @@ -88,7 +88,7 @@
>      /* any modulo 8 remainder of num_total_bits cannot be written this time,
>       * and will be held until next time. */
>      uint32_t num_total_bits = numBits + m_num_held_bits;
> -    uint32_t next_num_held_bits = num_total_bits % 8;
> +    uint32_t next_num_held_bits = num_total_bits & 7;
>
>      /* form a byte aligned word (write_bits), by concatenating any held bits
>       * with the new bits, discarding the bits that will form the next_held_bits.
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComDataCU.cpp
> --- a/source/Lib/TLibCommon/TComDataCU.cpp      Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComDataCU.cpp      Fri May 23 13:34:51 2014 +0900
> @@ -1337,7 +1337,7 @@
>  {
>      uint32_t curPartNum = m_pic->getNumPartInCU() >> (depth << 1);
>
> -    return ((m_absIdxInLCU + absPartIdx) % curPartNum) == 0;
> +    return ((m_absIdxInLCU + absPartIdx) & (curPartNum - 1)) == 0;
>  }
>
>  void TComDataCU::setPartSizeSubParts(PartSize mode, uint32_t absPartIdx, uint32_t depth)
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp  Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp  Fri May 23 13:34:51 2014 +0900
> @@ -117,15 +117,15 @@
>  {
>      bool bFilter;
>
> -    if (dirMode == DC_IDX)
> +    if (dirMode == DC_IDX || tuSize <= 4)
>      {
> -        bFilter = false; // no smoothing for DC or LM chroma
> +        bFilter = false; // no smoothing for DC
>      }
>      else
>      {
>          int diff = std::min<int>(abs((int)dirMode - HOR_IDX), abs((int)dirMode - VER_IDX));
> -        uint32_t sizeIndex = g_convertToBit[tuSize];
> -        bFilter = diff > intraFilterThreshold[sizeIndex];
> +        uint32_t sizeIdx = g_convertToBit[tuSize];
> +        bFilter = diff > intraFilterThreshold[sizeIdx];
>      }
>
>      return bFilter;
> @@ -134,7 +134,7 @@
>  void TComPrediction::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, int tuSize)
>  {
>      X265_CHECK(tuSize >= 4 && tuSize <= 64, "intra block size is out of range\n");
> -    int log2BlkSize = g_convertToBit[tuSize];
> +    int sizeIdx = g_convertToBit[tuSize];
>      bool bUseFilteredPredictions = TComPrediction::filteringIntraReferenceSamples(dirMode, tuSize);
>
>      pixel *refLft, *refAbv;
> @@ -148,13 +148,13 @@
>      }
>
>      bool bFilter = tuSize <= 16 && dirMode != PLANAR_IDX;
> -    primitives.intra_pred[log2BlkSize][dirMode](dst, stride, refLft, refAbv, dirMode, bFilter);
> +    primitives.intra_pred[sizeIdx][dirMode](dst, stride, refLft, refAbv, dirMode, bFilter);
>  }
>
>  // Angular chroma
>  void TComPrediction::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, int tuSize, int chFmt)
>  {
> -    int log2BlkSize = g_convertToBit[tuSize];
> +    int sizeIdx = g_convertToBit[tuSize];
>      uint32_t tuSize2 = tuSize << 1;
>
>      // Create the prediction
> @@ -222,7 +222,7 @@
>          }
>      }
>
> -    primitives.intra_pred[log2BlkSize][dirMode](dst, stride, refLft + tuSize - 1, refAbv + tuSize - 1, dirMode, 0);
> +    primitives.intra_pred[sizeIdx][dirMode](dst, stride, refLft + tuSize - 1, refAbv + tuSize - 1, dirMode, 0);
>  }
>
>  /** Function for checking identical motion.
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComSlice.h
> --- a/source/Lib/TLibCommon/TComSlice.h Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComSlice.h Fri May 23 13:34:51 2014 +0900
> @@ -906,6 +906,8 @@
>
>      void setLog2DiffMaxMinCodingBlockSize(int val) { m_log2DiffMaxMinCodingBlockSize = val; }
>
> +    int  getLog2MaxCodingBlockSize() const { return m_log2MinCodingBlockSize + m_log2DiffMaxMinCodingBlockSize; }
> +
>      void setMaxCUSize(uint32_t u) { m_maxCUSize = u; }
>
>      uint32_t getMaxCUSize() const  { return m_maxCUSize; }
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComYuv.cpp
> --- a/source/Lib/TLibCommon/TComYuv.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComYuv.cpp Fri May 23 13:34:51 2014 +0900
> @@ -186,7 +186,7 @@
>
>  void TComYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize)
>  {
> -    int part = partitionFromSizes(lumaSize, lumaSize);
> +    int part = partitionFromSize(lumaSize);
>
>      int16_t* dst = dstPicYuv->getLumaAddr(partIdx);
>      uint32_t dststride = dstPicYuv->m_width;
> @@ -196,7 +196,7 @@
>
>  void TComYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs)
>  {
> -    int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSizes(lumaSize, lumaSize);
> +    int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSize(lumaSize);
>
>      if (chromaId == 1)
>      {
> @@ -235,7 +235,7 @@
>
>  void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partSize)
>  {
> -    int part = partitionFromSizes(partSize, partSize);
> +    int part = partitionFromSize(partSize);
>
>      addClipLuma(srcYuv0, srcYuv1, part);
>      addClipChroma(srcYuv0, srcYuv1, part);
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncCu.cpp
> --- a/source/Lib/TLibEncoder/TEncCu.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncCu.cpp Fri May 23 13:34:51 2014 +0900
> @@ -571,13 +571,14 @@
>          m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
>      }
>
> +    uint32_t cuSize = outTempCU->getCUSize(0);
>      TComSlice* slice = outTempCU->getSlice();
>      if (!bInsidePicture)
>      {
>          uint32_t lpelx = outBestCU->getCUPelX();
>          uint32_t tpely = outBestCU->getCUPelY();
> -        uint32_t rpelx = lpelx + outBestCU->getCUSize(0);
> -        uint32_t bpely = tpely + outBestCU->getCUSize(0);
> +        uint32_t rpelx = lpelx + cuSize;
> +        uint32_t bpely = tpely + cuSize;
>          bInsidePicture = (rpelx <= slice->getSPS()->getPicWidthInLumaSamples() &&
>                            bpely <= slice->getSPS()->getPicHeightInLumaSamples());
>      }
> @@ -592,7 +593,7 @@
>
>          if (depth == g_maxCUDepth - g_addCUDepth)
>          {
> -            if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> +            if (cuSize > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
>              {
>                  xCheckRDCostIntra(outBestCU, outTempCU, SIZE_NxN);
>              }
> @@ -715,13 +716,14 @@
>      bool doNotBlockPu = true;
>      bool earlyDetectionSkipMode = false;
>
> +    uint32_t cuSize = outTempCU->getCUSize(0);
>      TComSlice* slice = outTempCU->getSlice();
>      if (!bInsidePicture)
>      {
>          uint32_t lpelx = outBestCU->getCUPelX();
>          uint32_t tpely = outBestCU->getCUPelY();
> -        uint32_t rpelx = lpelx + outBestCU->getCUSize(0);
> -        uint32_t bpely = tpely + outBestCU->getCUSize(0);
> +        uint32_t rpelx = lpelx + cuSize;
> +        uint32_t bpely = tpely + cuSize;
>          bInsidePicture = (rpelx <= slice->getSPS()->getPicWidthInLumaSamples() &&
>                            bpely <= slice->getSPS()->getPicHeightInLumaSamples());
>      }
> @@ -765,7 +767,7 @@
>              if (slice->getSliceType() != I_SLICE)
>              {
>                  // 2Nx2N, NxN
> -                if (!(outBestCU->getCUSize(0) == 8))
> +                if (!(cuSize == 8))
>                  {
>                      if (depth == g_maxCUDepth - g_addCUDepth && doNotBlockPu)
>                      {
> @@ -899,7 +901,7 @@
>
>                  if (depth == g_maxCUDepth - g_addCUDepth)
>                  {
> -                    if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> +                    if (cuSize > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
>                      {
>                          xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_NxN);
>                          outTempCU->initEstData(depth);
> @@ -908,10 +910,10 @@
>              }
>              // test PCM
>              if (slice->getSPS()->getUsePCM()
> -                && outTempCU->getCUSize(0) <= (1 << slice->getSPS()->getPCMLog2MaxSize())
> -                && outTempCU->getCUSize(0) >= (1 << slice->getSPS()->getPCMLog2MinSize()))
> +                && cuSize <= (1 << slice->getSPS()->getPCMLog2MaxSize())
> +                && cuSize >= (1 << slice->getSPS()->getPCMLog2MinSize()))
>              {
> -                uint32_t rawbits = (2 * X265_DEPTH + X265_DEPTH) * outBestCU->getCUSize(0) * outBestCU->getCUSize(0) / 2;
> +                uint32_t rawbits = (2 * X265_DEPTH + X265_DEPTH) * cuSize * cuSize / 2;
>                  uint32_t bestbits = outBestCU->m_totalBits;
>                  if ((bestbits > rawbits) || (outBestCU->m_totalCost > m_rdCost->calcRdCost(0, rawbits)))
>                  {
> @@ -1045,6 +1047,7 @@
>      uint32_t posy = (externalAddress / pic->getFrameWidthInCU()) * g_maxCUSize + g_rasterToPelY[g_zscanToRaster[internalAddress]];
>      uint32_t width = slice->getSPS()->getPicWidthInLumaSamples();
>      uint32_t height = slice->getSPS()->getPicHeightInLumaSamples();
> +    uint32_t cuSize = cu->getCUSize(absPartIdx);
>
>      while (posx >= width || posy >= height)
>      {
> @@ -1070,8 +1073,8 @@
>      uint32_t uiGranularityWidth = g_maxCUSize;
>      posx = cu->getCUPelX() + g_rasterToPelX[g_zscanToRaster[absPartIdx]];
>      posy = cu->getCUPelY() + g_rasterToPelY[g_zscanToRaster[absPartIdx]];
> -    bool granularityBoundary = ((posx + cu->getCUSize(absPartIdx)) % uiGranularityWidth == 0 || (posx + cu->getCUSize(absPartIdx) == width))
> -        && ((posy + cu->getCUSize(absPartIdx)) % uiGranularityWidth == 0 || (posy + cu->getCUSize(absPartIdx) == height));
> +    bool granularityBoundary = ((posx + cuSize) % uiGranularityWidth == 0 || (posx + cuSize == width))
> +        && ((posy + cuSize) % uiGranularityWidth == 0 || (posy + cuSize == height));
>
>      if (granularityBoundary)
>      {
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncEntropy.cpp
> --- a/source/Lib/TLibEncoder/TEncEntropy.cpp    Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncEntropy.cpp    Fri May 23 13:34:51 2014 +0900
> @@ -212,7 +212,7 @@
>  void TEncEntropy::xEncodeTransform(TComDataCU* cu, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t tuSize, uint32_t trIdx, bool& bCodeDQP)
>  {
>      const uint32_t subdiv = cu->getTransformIdx(absPartIdx) + cu->getDepth(absPartIdx) > depth;
> -    const uint32_t log2TrafoSize = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize()] + 2 - depth;
> +    const uint32_t log2TrafoSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      uint32_t hChromaShift        = cu->getHorzChromaShift();
>      uint32_t vChromaShift        = cu->getVertChromaShift();
>      uint32_t cbfY = cu->getCbf(absPartIdx, TEXT_LUMA, trIdx);
> @@ -227,12 +227,12 @@
>      if ((log2TrafoSize == 2) && !(cu->getChromaFormat() == CHROMA_444))
>      {
>          uint32_t partNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> -        if ((absPartIdx % partNum) == 0)
> +        if ((absPartIdx & (partNum - 1)) == 0)
>          {
>              m_bakAbsPartIdx   = absPartIdx;
>              m_bakChromaOffset = offsetChroma;
>          }
> -        else if ((absPartIdx % partNum) == (partNum - 1))
> +        else if ((absPartIdx & (partNum - 1)) == (partNum - 1))
>          {
>              cbfU = cu->getCbf(m_bakAbsPartIdx, TEXT_CHROMA_U, trIdx);
>              cbfV = cu->getCbf(m_bakAbsPartIdx, TEXT_CHROMA_V, trIdx);
> @@ -369,9 +369,9 @@
>          if ((log2TrafoSize == 2) && !(chFmt == CHROMA_444))
>          {
>              uint32_t partNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> -            if ((absPartIdx % partNum) == (partNum - 1))
> +            if ((absPartIdx & (partNum - 1)) == (partNum - 1))
>              {
> -                uint32_t trWidthC          = log2TrafoSize << 1;
> +                uint32_t trSizeC           = 1 << log2TrafoSize;
>                  const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>
>                  uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> @@ -384,10 +384,10 @@
>                      do
>                      {
>                          uint32_t cbf = cu->getCbf(tuIterator.m_absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
> -                        uint32_t subTUIndex = tuIterator.m_section * trWidthC * trWidthC;
> +                        uint32_t subTUIndex = tuIterator.m_section * trSizeC * trSizeC;
>                          if (cbf)
>                          {
> -                            m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + m_bakChromaOffset + subTUIndex), tuIterator.m_absPartIdxTURelCU, trWidthC, (TextType)chromaId);
> +                            m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + m_bakChromaOffset + subTUIndex), tuIterator.m_absPartIdxTURelCU, trSizeC, (TextType)chromaId);
>                          }
>                      }
>                      while (isNextTUSection(&tuIterator));
> @@ -396,10 +396,8 @@
>          }
>          else
>          {
> -            uint32_t trWidthC  = tuSize >> hChromaShift;
> -            uint32_t trHeightC = tuSize >> vChromaShift;
> +            uint32_t trSizeC  = tuSize >> hChromaShift;
>              const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> -            trHeightC = splitIntoSubTUs ? trHeightC >> 1 : trHeightC;
>              uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> (depth << 1);
>              for (uint32_t chromaId = TEXT_CHROMA; chromaId < MAX_NUM_COMPONENT; chromaId++)
>              {
> @@ -409,10 +407,10 @@
>                  do
>                  {
>                      uint32_t cbf = cu->getCbf(tuIterator.m_absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
> -                    uint32_t subTUIndex = tuIterator.m_section * trWidthC * trHeightC;
> +                    uint32_t subTUIndex = tuIterator.m_section * trSizeC * trSizeC;
>                      if (cbf)
>                      {
> -                        m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + offsetChroma + subTUIndex), tuIterator.m_absPartIdxTURelCU, trWidthC, (TextType)chromaId);
> +                        m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + offsetChroma + subTUIndex), tuIterator.m_absPartIdxTURelCU, trSizeC, (TextType)chromaId);
>                      }
>                  }
>                  while (isNextTUSection(&tuIterator));
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncSbac.cpp
> --- a/source/Lib/TLibEncoder/TEncSbac.cpp       Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSbac.cpp       Fri May 23 13:34:51 2014 +0900
> @@ -1930,8 +1930,7 @@
>          m_binIf->encodePCMAlignBits();
>
>          uint32_t lumaOffset   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> -        uint32_t chromaOffset = lumaOffset >> 2;
> -        //uint32_t chromaOffset = lumaOffset >> (m_hChromaShift + m_vChromaShift);
> +        uint32_t chromaOffset = lumaOffset >> (cu->getHorzChromaShift() + cu->getVertChromaShift());
>          uint32_t width;
>          uint32_t height;
>          uint32_t sampleBits;
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Fri May 23 13:34:51 2014 +0900
> @@ -160,7 +160,7 @@
>      uint32_t fullDepth  = cu->getDepth(0) + trDepth;
>      uint32_t trMode     = cu->getTransformIdx(absPartIdx);
>      uint32_t subdiv     = (trMode > trDepth ? 1 : 0);
> -    uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize()] + 2 - fullDepth;
> +    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>
>      if (cu->getPredictionMode(0) == MODE_INTRA && cu->getPartitionSize(0) == SIZE_NxN && trDepth == 0)
>      {
> @@ -245,14 +245,14 @@
>
>      uint32_t origTrDepth = trDepth;
>
> -    uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize()] + 2 - fullDepth;
> +    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      int chFmt           = cu->getChromaFormat();
>      if ((ttype != TEXT_LUMA) && (trSizeLog2 == 2) && !(chFmt == CHROMA_444))
>      {
>          X265_CHECK(trDepth > 0, "transform size too small\n");
>          trDepth--;
>          uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -        bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> +        bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
>          if (!bFirstQ)
>          {
>              return;
> @@ -290,7 +290,7 @@
>      {
>          uint32_t subTUSize = width * width;
>          uint32_t partIdxesPerSubTU  = cu->getPic()->getNumPartInCU() >> (((cu->getDepth(absPartIdx) + trDepth) << 1) + 1);
> -
> +
>          if (cu->getCbf(absPartIdx, ttype, origTrDepth + 1))
>              m_entropyCoder->encodeCoeffNxN(cu, coeff, absPartIdx, width, ttype);
>          if (cu->getCbf(absPartIdx + partIdxesPerSubTU, ttype, origTrDepth + 1))
> @@ -346,7 +346,7 @@
>                      m_entropyCoder->encodeIntraDirModeLuma(cu, part * qtNumParts);
>                  }
>              }
> -            else if ((absPartIdx % qtNumParts) == 0)
> +            else if ((absPartIdx & (qtNumParts - 1)) == 0)
>              {
>                  m_entropyCoder->encodeIntraDirModeLuma(cu, absPartIdx);
>              }
> @@ -366,7 +366,7 @@
>          {
>              uint32_t qtNumParts = cu->getTotalNumPart() >> 2;
>              X265_CHECK(trDepth > 0, "unexpected trDepth %d\n", trDepth);
> -            if ((absPartIdx % qtNumParts) == 0)
> +            if ((absPartIdx & (qtNumParts - 1)) == 0)
>                  m_entropyCoder->encodeIntraDirModeChroma(cu, absPartIdx);
>          }
>      }
> @@ -418,12 +418,13 @@
>      pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
>      pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
>      int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
> -    int      part         = partitionFromSizes(tuSize, tuSize);
> -
> -    uint32_t trSizeLog2     = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +    int      part         = partitionFromSize(tuSize);
> +    int      sizeIdx      = g_convertToBit[tuSize];
> +
> +    uint32_t trSizeLog2     = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -    uint32_t numCoeffPerInc = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -    coeff_t* coeff          = m_qtTempCoeffY[qtLayer] + numCoeffPerInc * absPartIdx;
> +    uint32_t coeffOffsetY   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +    coeff_t* coeff          = m_qtTempCoeffY[qtLayer] + coeffOffsetY;
>
>      int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
>
> @@ -435,10 +436,10 @@
>      bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
>
>      //===== get residual signal =====
> -    X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> -    X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> -    X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> -    primitives.calcresidual[(int)g_convertToBit[tuSize]](fenc, pred, residual, stride);
> +    X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> +    X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> +    X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> +    primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>      //===== transform and quantization =====
>      //--- init rate estimation arrays for RDOQ ---
> @@ -462,7 +463,6 @@
>      cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
>      //--- inverse transform ---
> -    int size = g_convertToBit[tuSize];
>      if (absSum)
>      {
>          int scalingListType = 0 + TEXT_LUMA;
> @@ -473,12 +473,12 @@
>      {
>          int16_t* resiTmp = residual;
>          memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -        primitives.blockfill_s[size](resiTmp, stride, 0);
> +        primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
>      }
>
>      X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
>      //===== reconstruction =====
> -    primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> +    primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
>      //===== update distortion =====
>      outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
>  }
> @@ -494,7 +494,7 @@
>                                         uint32_t    chromaId)
>  {
>      uint32_t fullDepth   = cu->getDepth(0) + trDepth;
> -    uint32_t trSizeLog2  = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +    uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      int      chFmt       = cu->getChromaFormat();
>
>      uint32_t origTrDepth = trDepth;
> @@ -504,8 +504,8 @@
>          X265_CHECK(trDepth > 0, "trDepth should be non-zero\n");
>          trDepth--;
>          uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -        bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> -        bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx % qpdiv) == 2) : false;
> +        bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> +        bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
>          if ((!bFirstQ) && (!bSecondQ))
>          {
>              return;
> @@ -520,22 +520,22 @@
>      int16_t* residual       = (chromaId == 1) ? resiYuv->getCbAddr(absPartIdx) : resiYuv->getCrAddr(absPartIdx);
>
>      uint32_t qtlayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -    uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
> -    coeff_t* coeff          = (chromaId == 1 ? m_qtTempCoeffCb[qtlayer] : m_qtTempCoeffCr[qtlayer]) + numCoeffPerInc * absPartIdx;
> +    uint32_t coeffOffsetC   = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +    coeff_t* coeff          = (chromaId == 1 ? m_qtTempCoeffCb[qtlayer] : m_qtTempCoeffCr[qtlayer]) + coeffOffsetC;
>      int16_t* reconQt        = (chromaId == 1) ? m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdx) : m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdx);
>      uint32_t reconQtStride  = m_qtTempShortYuv[qtlayer].m_cwidth;
>      uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>      pixel*   reconIPred       = (chromaId == 1) ? cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder) : cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>      uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
>      bool     useTransformSkipChroma = !!cu->getTransformSkip(absPartIdx, ttype);
> -    int      part = partitionFromSizes(tuSize, tuSize);
> +    int      part = partitionFromSize(tuSize);
> +    int      sizeIdx = g_convertToBit[tuSize];
>
>      //===== get residual signal =====
> -    X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> -    X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> -    X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> -    int size = g_convertToBit[tuSize];
> -    primitives.calcresidual[size](fenc, pred, residual, stride);
> +    X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> +    X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> +    X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> +    primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>      //===== transform and quantization =====
>      {
> @@ -577,14 +577,14 @@
>          {
>              int16_t* resiTmp = residual;
>              memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -            primitives.blockfill_s[size](resiTmp, stride, 0);
> +            primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
>          }
>      }
>
>      X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual alignment check failure\n");
>      X265_CHECK(tuSize <= 32, "tuSize invalud\n");
>      //===== reconstruction =====
> -    primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> +    primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
>      //===== update distortion =====
>      uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
>      if (ttype == TEXT_CHROMA_U)
> @@ -612,7 +612,7 @@
>                                       uint64_t&   rdCost)
>  {
>      uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
> -    uint32_t trSizeLog2  = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +    uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      bool     bCheckFull  = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
>      bool     bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>
> @@ -829,7 +829,7 @@
>                                               TComYuv*    reconYuv)
>  {
>      uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
> -    uint32_t trSizeLog2  = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +    uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      bool     bCheckFull  = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
>      bool     bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>
> @@ -854,9 +854,8 @@
>          pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
>          int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
>          pixel*   recon        = reconYuv->getLumaAddr(absPartIdx);
> -
> -        uint32_t numCoeffPerInc = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -        coeff_t* coeff          = cu->getCoeffY() + numCoeffPerInc * absPartIdx;
> +        uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +        coeff_t* coeff        = cu->getCoeffY() + coeffOffsetY;
>
>          uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>          pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
> @@ -870,10 +869,11 @@
>          predIntraLumaAng(lumaPredMode, pred, stride, tuSize);
>
>          //===== get residual signal =====
> -        X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> -        X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment failure\n");
> -        X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment failure\n");
> -        primitives.calcresidual[(int)g_convertToBit[tuSize]](fenc, pred, residual, stride);
> +        X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> +        X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
> +        X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
> +        int sizeIdx = g_convertToBit[tuSize];
> +        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>          //===== transform and quantization =====
>          uint32_t absSum = 0;
> @@ -888,7 +888,6 @@
>          cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
>          //--- inverse transform ---
> -        int size = g_convertToBit[tuSize];
>          if (absSum)
>          {
>              int scalingListType = 0 + TEXT_LUMA;
> @@ -899,12 +898,12 @@
>          {
>              int16_t* resiTmp = residual;
>              memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -            primitives.blockfill_s[size](resiTmp, stride, 0);
> +            primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
>          }
>
>          //Generate Recon
>          X265_CHECK(tuSize <= 32, "tuSize is too large\n");
> -        int part = partitionFromSizes(tuSize, tuSize);
> +        int part = partitionFromSize(tuSize);
>          primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
>          primitives.blockcpy_pp(tuSize, tuSize, reconIPred, reconIPredStride, recon, stride);
>      }
> @@ -939,14 +938,14 @@
>
>      if (trMode == trDepth)
>      {
> -        uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>          uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
>          //===== copy transform coefficients =====
> -        uint32_t numCoeffY    = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (fullDepth << 1);
> -        uint32_t numCoeffIncY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -        coeff_t* coeffSrcY    = m_qtTempCoeffY[qtlayer] + (numCoeffIncY * absPartIdx);
> -        coeff_t* coeffDestY   = cu->getCoeffY()         + (numCoeffIncY * absPartIdx);
> +        uint32_t numCoeffY    = 1 << (trSizeLog2 * 2);
> +        uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +        coeff_t* coeffSrcY    = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> +        coeff_t* coeffDestY   = cu->getCoeffY()         + coeffOffsetY;
>          ::memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>
>          //===== copy reconstruction =====
> @@ -964,15 +963,14 @@
>
>  void TEncSearch::xStoreIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx)
>  {
> -    uint32_t fullMode = cu->getDepth(0) + trDepth;
> -
> -    uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullMode] + 2;
> +    uint32_t fullDepth = cu->getDepth(0) + trDepth;
> +    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
>      //===== copy transform coefficients =====
> -    uint32_t numCoeffY    = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (fullMode << 1);
> -    uint32_t numCoeffIncY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -    coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + (numCoeffIncY * absPartIdx);
> +    uint32_t numCoeffY    = 1 << (trSizeLog2 * 2);
> +    uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +    coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
>      coeff_t* coeffDstY = m_qtTempTUCoeffY;
>
>      ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
> @@ -984,14 +982,13 @@
>  void TEncSearch::xLoadIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx)
>  {
>      uint32_t fullDepth = cu->getDepth(0) + trDepth;
> -
> -    uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>      uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
>      //===== copy transform coefficients =====
> -    uint32_t numCoeffY    = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (fullDepth << 1);
> -    uint32_t numCoeffIncY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -    coeff_t* coeffDstY = m_qtTempCoeffY[qtlayer] + (numCoeffIncY * absPartIdx);
> +    uint32_t numCoeffY    = 1 << (trSizeLog2 * 2);
> +    uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +    coeff_t* coeffDstY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
>      coeff_t* coeffSrcY = m_qtTempTUCoeffY;
>
>      ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
> @@ -1008,25 +1005,27 @@
>      X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
>  }
>
> -void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs)
> +void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
>  {
> +    assert(chromaId == 1 || chromaId == 2);
> +
>      uint32_t fullDepth = cu->getDepth(0) + trDepth;
>      uint32_t trMode    = cu->getTransformIdx(absPartIdx);
>
>      if (trMode == trDepth)
>      {
> -        uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>          uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>          int      chFmt      = cu->getChromaFormat();
>
>          bool bChromaSame = false;
>          if (trSizeLog2 == 2 && !(chFmt == CHROMA_444))
>          {
> -            X265_CHECK(trDepth > 0, "trDepth is invalid\n");
> +            X265_CHECK(trDepth > 0, "invalid trDepth\n");
>              trDepth--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -            bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> -            bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx % qpdiv) == 2) : false;
> +            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> +            bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
>              if ((!bFirstQ) && (!bSecondQ))
>              {
>                  return;
> @@ -1037,35 +1036,37 @@
>          uint32_t height = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
>          height = splitIntoSubTUs ? height >> 1 : height;
>          uint32_t numCoeffC = width * height;
> -
> -        uint32_t numCoeffIncC = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> ((cu->getSlice()->getSPS()->getMaxCUDepth() << 1) + (m_hChromaShift + m_vChromaShift));
> -        if (stateU0V1Both2 == 1 || stateU0V1Both2 == 3)
> +        uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +
> +        if (chromaId == 1)
>          {
> -            coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC * absPartIdx);
> +            coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
>              coeff_t* coeffDstU = m_qtTempTUCoeffCb;
>              ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
>          }
> -        if (stateU0V1Both2 == 2 || stateU0V1Both2 == 3)
> +        if (chromaId == 2)
>          {
> -            coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC * absPartIdx);
> +            coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
>              coeff_t* coeffDstV = m_qtTempTUCoeffCr;
>              ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
>          }
>
>          //===== copy reconstruction =====
>          uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
> -        m_qtTempShortYuv[qtlayer].copyPartToPartYuvChroma(&m_qtTempTransformSkipYuv, absPartIdx, lumaSize, stateU0V1Both2, splitIntoSubTUs);
> +        m_qtTempShortYuv[qtlayer].copyPartToPartYuvChroma(&m_qtTempTransformSkipYuv, absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
>      }
>  }
>
> -void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs)
> +void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
>  {
> +    assert(chromaId == 1 || chromaId == 2);
> +
>      uint32_t fullDepth = cu->getDepth(0) + trDepth;
>      uint32_t trMode    = cu->getTransformIdx(absPartIdx);
>
>      if (trMode == trDepth)
>      {
> -        uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>          uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>          int      chFmt      = cu->getChromaFormat();
>
> @@ -1075,8 +1076,8 @@
>              X265_CHECK(trDepth > 0, "invalid trDepth\n");
>              trDepth--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -            bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> -            bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx % qpdiv) == 2) : false;
> +            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> +            bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
>              if ((!bFirstQ) && (!bSecondQ))
>              {
>                  return;
> @@ -1085,45 +1086,44 @@
>          }
>
>          //===== copy transform coefficients =====
> -        uint32_t trWidth  = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> -        uint32_t trHeight = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> -        trHeight = splitIntoSubTUs ? trHeight >> 1 : trHeight;
> -        uint32_t numCoeffC = trWidth * trHeight;
> -
> -        uint32_t numCoeffIncC = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> ((cu->getSlice()->getSPS()->getMaxCUDepth() << 1) + (m_hChromaShift + m_vChromaShift));
> -
> -        if (stateU0V1Both2 == 1 || stateU0V1Both2 == 3)
> +        uint32_t trWidthC  = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> +        uint32_t trHeightC = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> +        trHeightC = splitIntoSubTUs ? trHeightC >> 1 : trHeightC;
> +        uint32_t numCoeffC = trWidthC * trHeightC;
> +        uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +
> +        if (chromaId == 1)
>          {
> -            coeff_t* coeffDstU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC * absPartIdx);
> +            coeff_t* coeffDstU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
>              coeff_t* coeffSrcU = m_qtTempTUCoeffCb;
>              ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
>          }
> -        if (stateU0V1Both2 == 2 || stateU0V1Both2 == 3)
> +        if (chromaId == 2)
>          {
> -            coeff_t* coeffDstV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC * absPartIdx);
> +            coeff_t* coeffDstV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
>              coeff_t* coeffSrcV = m_qtTempTUCoeffCr;
>              ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
>          }
>
>          //===== copy reconstruction =====
>          uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
> -        m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, stateU0V1Both2, splitIntoSubTUs);
> +        m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
>
>          uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>          uint32_t reconQtStride    = m_qtTempShortYuv[qtlayer].m_cwidth;
>          uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
>
> -        if (stateU0V1Both2 == 1 || stateU0V1Both2 == 3)
> +        if (chromaId == 1)
>          {
>              pixel* reconIPred = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
>              int16_t* reconQt  = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdx);
> -            primitives.blockcpy_ps(trWidth, trHeight, reconIPred, reconIPredStride, reconQt, reconQtStride);
> +            primitives.blockcpy_ps(trWidthC, trHeightC, reconIPred, reconIPredStride, reconQt, reconQtStride);
>          }
> -        if (stateU0V1Both2 == 2 || stateU0V1Both2 == 3)
> +        if (chromaId == 2)
>          {
>              pixel* reconIPred = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>              int16_t* reconQt  = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdx);
> -            primitives.blockcpy_ps(trWidth, trHeight, reconIPred, reconIPredStride, reconQt, reconQtStride);
> +            primitives.blockcpy_ps(trWidthC, trHeightC, reconIPred, reconIPredStride, reconQt, reconQtStride);
>          }
>      }
>  }
> @@ -1132,7 +1132,7 @@
>  {
>      uint32_t depth = cu->getDepth(0);
>      uint32_t fullDepth = depth + trDepth;
> -    uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +    uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>
>      uint32_t actualTrDepth = trDepth;
>
> @@ -1179,7 +1179,7 @@
>      if (trMode == trDepth)
>      {
>          int chFmt = cu->getChromaFormat();
> -        uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>          uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
>          uint32_t actualTrDepth = trDepth;
>          if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -1188,7 +1188,7 @@
>              actualTrDepth--;
>              trSizeCLog2++;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> -            bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> +            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
>              if (!bFirstQ)
>              {
>                  return;
> @@ -1288,7 +1288,7 @@
>
>                              if (bestModeId == firstCheckId)
>                              {
> -                                xStoreIntraResultChromaQT(cu, trDepth, absPartIdxC, (TextType)chromaId, splitIntoSubTUs);
> +                                xStoreIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
>                                  m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
>                              }
>                          }
> @@ -1300,7 +1300,7 @@
>
>                      if (bestModeId == firstCheckId)
>                      {
> -                        xLoadIntraResultChromaQT(cu, trDepth, absPartIdxC, (TextType)chromaId, splitIntoSubTUs);
> +                        xLoadIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
>                          cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                          m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
> @@ -1358,7 +1358,7 @@
>      if (trMode == trDepth)
>      {
>          int      chFmt      = cu->getChromaFormat();
> -        uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>          uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
>          bool bChromaSame = false;
> @@ -1367,7 +1367,7 @@
>              X265_CHECK(trDepth > 0, "invalid trDepth\n");
>              trDepth--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> -            if ((absPartIdx % qpdiv) != 0)
> +            if ((absPartIdx & (qpdiv - 1)) != 0)
>              {
>                  return;
>              }
> @@ -1379,12 +1379,12 @@
>          uint32_t width     = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
>          uint32_t height    = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
>          uint32_t numCoeffC = width * height;
> -        uint32_t numCoeffIncC = ((cu->getSlice()->getSPS()->getMaxCUSize() >> m_hChromaShift) * (cu->getSlice()->getSPS()->getMaxCUSize() >> m_vChromaShift)) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -
> -        coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC * absPartIdx);
> -        coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC * absPartIdx);
> -        coeff_t* coeffDstU = cu->getCoeffCb()         + (numCoeffIncC * absPartIdx);
> -        coeff_t* coeffDstV = cu->getCoeffCr()         + (numCoeffIncC * absPartIdx);
> +        uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +
> +        coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> +        coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> +        coeff_t* coeffDstU = cu->getCoeffCb()         + coeffOffsetC;
> +        coeff_t* coeffDstV = cu->getCoeffCr()         + coeffOffsetC;
>          ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
>          ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
>
> @@ -1415,7 +1415,7 @@
>      if (trMode == trDepth)
>      {
>          int      chFmt     = cu->getChromaFormat();
> -        uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> +        uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>          uint32_t origTrDepth = trDepth;
>          uint32_t actualTrDepth = trDepth;
>          if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -1423,7 +1423,7 @@
>              X265_CHECK(trDepth > 0, "invalid trDepth\n");
>              actualTrDepth--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> -            bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> +            bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
>              if (!bFirstQ)
>              {
>                  return;
> @@ -1433,6 +1433,7 @@
>          uint32_t tuSize = cu->getCUSize(0) >> (actualTrDepth + m_hChromaShift);
>          uint32_t stride = fencYuv->getCStride();
>          const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> +        int sizeIdx = g_convertToBit[tuSize];
>
>          for (int chromaId = TEXT_CHROMA; chromaId < MAX_NUM_COMPONENT; chromaId++)
>          {
> @@ -1450,8 +1451,8 @@
>                  pixel*   pred           = (chromaId == 1) ? predYuv->getCbAddr(absPartIdxC) : predYuv->getCrAddr(absPartIdxC);
>                  int16_t* residual       = (chromaId == 1) ? resiYuv->getCbAddr(absPartIdxC) : resiYuv->getCrAddr(absPartIdxC);
>                  pixel*   recon          = (chromaId == 1) ? reconYuv->getCbAddr(absPartIdxC) : reconYuv->getCrAddr(absPartIdxC);
> -                uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
> -                coeff_t* coeff          = (chromaId == 1 ? cu->getCoeffCb() : cu->getCoeffCr()) + numCoeffPerInc * absPartIdxC;
> +                uint32_t coeffOffsetC   = absPartIdxC << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +                coeff_t* coeff          = (chromaId == 1 ? cu->getCoeffCb() : cu->getCoeffCr()) + coeffOffsetC;
>                  uint32_t zorder         = cu->getZorderIdxInCU() + absPartIdxC;
>                  pixel*   reconIPred     = (chromaId == 1) ? cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder) : cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>                  uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
> @@ -1474,11 +1475,10 @@
>                  predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, tuSize, chFmt);
>
>                  //===== get residual signal =====
> -                X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> -                X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment failure\n");
> -                X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment failure\n");
> -                int size = g_convertToBit[tuSize];
> -                primitives.calcresidual[size](fenc, pred, residual, stride);
> +                X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> +                X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
> +                X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
> +                primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>                  //--- transform and quantization ---
>                  uint32_t absSum = 0;
> @@ -1513,7 +1513,7 @@
>                  {
>                      int16_t* resiTmp = residual;
>                      memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -                    primitives.blockfill_s[size](resiTmp, stride, 0);
> +                    primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
>                  }
>
>                  //===== reconstruction =====
> @@ -1521,7 +1521,7 @@
>                  X265_CHECK(tuSize <= 32, "tuSize out of range\n");
>
>                  // use square primitive
> -                int part = partitionFromSizes(tuSize, tuSize);
> +                int part = partitionFromSize(tuSize);
>                  primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);
>                  primitives.chroma[CHROMA_444].copy_pp[part](reconIPred, reconIPredStride, recon, stride);
>              }
> @@ -1565,7 +1565,7 @@
>      uint32_t overallDistY = 0;
>      uint32_t candNum;
>      uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
> -    uint32_t tuSizeIdx    = g_convertToBit[tuSize]; // log2(tuSize) - 2
> +    uint32_t sizeIdx      = g_convertToBit[tuSize]; // log2(tuSize) - 2
>      static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
>
>      //===== loop over partitions =====
> @@ -1581,7 +1581,7 @@
>          pixel*   fenc   = fencYuv->getLumaAddr(pu, tuSize);
>          uint32_t stride = predYuv->getStride();
>          uint32_t rdModeList[FAST_UDI_MAX_RDMODE_NUM];
> -        int numModesForFullRD = intraModeNumFast[tuSizeIdx];
> +        int numModesForFullRD = intraModeNumFast[sizeIdx];
>
>          bool doFastSearch = (numModesForFullRD != numModesAvailable);
>          if (doFastSearch)
> @@ -1629,6 +1629,7 @@
>                  scaleTuSize = 32;
>                  scaleStride = 32;
>                  costShift = 2;
> +                sizeIdx = 5 - 2; // g_convertToBit[scaleTuSize];
>
>                  // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
>                  above         = aboveScale;
> @@ -1637,11 +1638,10 @@
>                  leftFiltered  = leftScale;
>              }
>
> -            int log2SizeMinus2 = g_convertToBit[scaleTuSize];
> -            pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> +            pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>
>              // DC
> -            primitives.intra_pred[log2SizeMinus2][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> +            primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
>              modeCosts[DC_IDX] = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
>
>              pixel *abovePlanar   = above;
> @@ -1654,13 +1654,13 @@
>              }
>
>              // PLANAR
> -            primitives.intra_pred[log2SizeMinus2][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
> +            primitives.intra_pred[sizeIdx][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
>              modeCosts[PLANAR_IDX] = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
>
>              // Transpose NxN
> -            primitives.transpose[log2SizeMinus2](buf_trans, fenc, scaleStride);
> -
> -            primitives.intra_pred_allangs[log2SizeMinus2](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
> +            primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
> +
> +            primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
>
>              for (uint32_t mode = 2; mode < numModesAvailable; mode++)
>              {
> @@ -1786,7 +1786,7 @@
>          if (pu != numPU - 1)
>          {
>              uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
> -            int      part        = partitionFromSizes(tuSize, tuSize);
> +            int      part        = partitionFromSize(tuSize);
>              pixel*   dst         = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>              uint32_t dststride   = cu->getPic()->getPicYuvRec()->getStride();
>              pixel*   src         = reconYuv->getLumaAddr(partOffset);
> @@ -1844,6 +1844,8 @@
>          scaleTuSize = 32;
>          costShift = 2;
>      }
> +    int sizeIdx = g_convertToBit[scaleTuSize];
> +    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>
>      TComPattern::initAdiPatternChroma(cu, absPartIdx, trDepth, m_predBuf, 1);
>      TComPattern::initAdiPatternChroma(cu, absPartIdx, trDepth, m_predBuf, 2);
> @@ -1866,8 +1868,6 @@
>
>              //===== get prediction signal =====
>              predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, scaleTuSize, chFmt);
> -            int log2SizeMinus2 = g_convertToBit[scaleTuSize];
> -            pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
>              cost += sa8d(fenc, stride, pred, stride) << costShift;
>          }
>
> @@ -1980,7 +1980,7 @@
>
>          if (!isLastSection(&tuIterator))
>          {
> -            uint32_t compWidth   = (cu->getCUSize(0) >> m_hChromaShift)  >> initTrDepth;
> +            uint32_t compWidth   = (cu->getCUSize(0) >> m_hChromaShift) >> initTrDepth;
>              uint32_t compHeight  = (cu->getCUSize(0) >> m_vChromaShift) >> initTrDepth;
>              uint32_t zorder      = cu->getZorderIdxInCU() + tuIterator.m_partOffset;
>              pixel*     dst         = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
> @@ -2662,8 +2662,7 @@
>      uint32_t bits = 0, bestBits = 0;
>      uint32_t distortion = 0, bdist = 0;
>
> -    uint32_t width  = cu->getCUSize(0);
> -    uint32_t height = cu->getCUSize(0);
> +    uint32_t cuSize = cu->getCUSize(0);
>
>      // No residual coding : SKIP mode
>      if (bSkipRes)
> @@ -2672,10 +2671,10 @@
>
>          predYuv->copyToPartYuv(outReconYuv, 0);
>          // Luma
> -        int part = partitionFromSizes(width, height);
> +        int part = partitionFromSize(cuSize);
>          distortion = primitives.sse_pp[part](fencYuv->getLumaAddr(), fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
>          // Chroma
> -        part = partitionFromSizes(width >> m_hChromaShift, height >> m_vChromaShift);
> +        part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
>          distortion += m_rdCost->scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(), fencYuv->getCStride(), outReconYuv->getCbAddr(), outReconYuv->getCStride()));
>          distortion += m_rdCost->scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(), fencYuv->getCStride(), outReconYuv->getCrAddr(), outReconYuv->getCStride()));
>
> @@ -2695,9 +2694,9 @@
>          cu->m_totalDistortion = distortion;
>          if (m_rdCost->psyRdEnabled())
>          {
> -            int size = g_convertToBit[cu->getCUSize(0)];
> +            int size = g_convertToBit[cuSize];
>              uint32_t psyRdCost = m_rdCost->psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(),
> -                                                         outReconYuv->getLumaAddr(), outReconYuv->getStride());
> +                                                   outReconYuv->getLumaAddr(), outReconYuv->getStride());
>              cu->m_totalCost = m_rdCost->calcPsyRdCost(cu->m_totalDistortion, cu->m_totalBits, psyRdCost);
>          }
>          else
> @@ -2718,7 +2717,7 @@
>      bits = 0;
>      distortion = 0;
>
> -    outResiYuv->subtract(fencYuv, predYuv, width);
> +    outResiYuv->subtract(fencYuv, predYuv, cuSize);
>      m_rdGoOnSbacCoder->load(m_rdSbacCoders[cu->getDepth(0)][CI_CURR_BEST]);
>      xEstimateResidualQT(cu, 0, outResiYuv, cu->getDepth(0), cost, bits, distortion, &zeroDistortion, curUseRDOQ);
>
> @@ -2739,9 +2738,9 @@
>          ::memset(cu->getCbf(TEXT_LUMA), 0, qpartnum * sizeof(uint8_t));
>          ::memset(cu->getCbf(TEXT_CHROMA_U), 0, qpartnum * sizeof(uint8_t));
>          ::memset(cu->getCbf(TEXT_CHROMA_V), 0, qpartnum * sizeof(uint8_t));
> -        ::memset(cu->getCoeffY(), 0, width * height * sizeof(coeff_t));
> -        ::memset(cu->getCoeffCb(), 0, width * height * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> -        ::memset(cu->getCoeffCr(), 0, width * height * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> +        ::memset(cu->getCoeffY(), 0, cuSize * cuSize * sizeof(coeff_t));
> +        ::memset(cu->getCoeffCb(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> +        ::memset(cu->getCoeffCr(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
>          cu->setTransformSkipSubParts(0, 0, 0, 0, cu->getDepth(0));
>      }
>      else
> @@ -2771,7 +2770,7 @@
>
>      if (cu->getQtRootCbf(0))
>      {
> -        outReconYuv->addClip(predYuv, outBestResiYuv, width);
> +        outReconYuv->addClip(predYuv, outBestResiYuv, cuSize);
>      }
>      else
>      {
> @@ -2779,16 +2778,16 @@
>      }
>
>      // update with clipped distortion and cost (qp estimation loop uses unclipped values)
> -    int part = partitionFromSizes(width, height);
> +    int part = partitionFromSize(cuSize);
>      bdist = primitives.sse_pp[part](fencYuv->getLumaAddr(), fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
> -    part = partitionFromSizes(width >> cu->getHorzChromaShift(), height >> cu->getVertChromaShift());
> +    part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
>      bdist += m_rdCost->scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(), fencYuv->getCStride(), outReconYuv->getCbAddr(), outReconYuv->getCStride()));
>      bdist += m_rdCost->scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(), fencYuv->getCStride(), outReconYuv->getCrAddr(), outReconYuv->getCStride()));
>      if (m_rdCost->psyRdEnabled())
>      {
> -        int size = g_convertToBit[cu->getCUSize(0)];
> +        int size = g_convertToBit[cuSize];
>          uint32_t psyRdCost = m_rdCost->psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(),
> -                                                     outReconYuv->getLumaAddr(), outReconYuv->getStride());
> +                                               outReconYuv->getLumaAddr(), outReconYuv->getStride());
>          bcost = m_rdCost->calcPsyRdCost(bdist, bestBits, psyRdCost);
>      }
>      else
> @@ -2847,8 +2846,7 @@
>  {
>      X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "invalid depth\n");
>      const uint32_t trMode = depth - cu->getDepth(0);
> -    const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> -    uint32_t  trSizeCLog2     = trSizeLog2 - m_hChromaShift;
> +    const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      const uint32_t setCbf     = 1 << trMode;
>      int chFmt                 = cu->getChromaFormat();
>
> @@ -2861,34 +2859,35 @@
>      const bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>      X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
> -    bool bCodeChroma = true;
> -    uint32_t trModeC = trMode;
> -    if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> -    {
> -        trSizeCLog2++;
> -        trModeC--;
> -        uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> -        bCodeChroma = ((absPartIdx % qpdiv) == 0);
> -    }
> -
> -    const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> -    uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) +  trModeC) << 1);
> -
>      // code full block
>      uint32_t absSumY = 0, absSumU = 0, absSumV = 0;
>      int lastPosY = -1, lastPosU = -1, lastPosV = -1;
>      if (bCheckFull)
>      {
> -        const uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -
> -        coeff_t *coeffCurY = cu->getCoeffY() + (numCoeffPerAbsPartIdxIncrement * absPartIdx);
> -        coeff_t *coeffCurU = cu->getCoeffCb() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -        coeff_t *coeffCurV = cu->getCoeffCr() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -
> -        int trWidth = 0, trHeight = 0, trWidthC = 0, trHeightC = 0;
> -
> -        trWidth  = trHeight  = 1 << trSizeLog2;
> -        trWidthC = trHeightC = 1 << trSizeCLog2;
> +        uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> +        bool bCodeChroma = true;
> +        uint32_t trModeC = trMode;
> +        if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> +        {
> +            trSizeCLog2++;
> +            trModeC--;
> +            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> +            bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> +        }
> +
> +        const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> +        uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) +  trModeC) << 1);
> +
> +        uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> +        coeff_t *coeffCurY = cu->getCoeffY() + coeffOffsetY;
> +        coeff_t *coeffCurU = cu->getCoeffCb() + coeffOffsetC;
> +        coeff_t *coeffCurV = cu->getCoeffCr() + coeffOffsetC;
> +
> +        uint32_t trSize   = 1 << trSizeLog2;
> +        uint32_t trSizeC  = 1 << trSizeCLog2;
> +        uint32_t sizeIdx  = trSizeLog2  - 2;
> +        uint32_t sizeIdxC = trSizeCLog2 - 2;
>          cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
>
>          cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
> @@ -2897,7 +2896,7 @@
>          m_trQuant->selectLambda(TEXT_LUMA);
>
>          absSumY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> -                                          trWidth, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
> +                                          trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
>
>          cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -2909,13 +2908,12 @@
>
>              int scalingListType = 3 + TEXT_LUMA;
>              X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, resiYuv->m_width,  coeffCurY, trWidth, scalingListType, false, lastPosY); //this is for inter mode only
> +            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, resiYuv->m_width,  coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
>          }
>          else
>          {
>              int16_t *ptr = resiYuv->getLumaAddr(absPartIdx);
> -            X265_CHECK(trWidth == trHeight, "square transform expected\n");
> -            primitives.blockfill_s[(int)g_convertToBit[trWidth]](ptr, resiYuv->m_width, 0);
> +            primitives.blockfill_s[sizeIdx](ptr, resiYuv->m_width, 0);
>          }
>          cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -2924,13 +2922,10 @@
>              TComTURecurse tuIterator;
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> -            uint32_t widthC  = trWidthC;
> -            uint32_t heightC = trWidthC;
> -
>              do
>              {
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> -                uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> +                uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -2941,12 +2936,12 @@
>                  m_trQuant->selectLambda(TEXT_CHROMA);
>
>                  absSumU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> -                                                  trWidthC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
> +                                                  trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
>
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>                  absSumV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> -                                                  trWidthC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
> +                                                  trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
>
>                  cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -2960,13 +2955,12 @@
>
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset, trWidthC, scalingListType, false, lastPosU);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset, trSizeC, scalingListType, false, lastPosU);
>                  }
>                  else
>                  {
>                      int16_t *ptr = resiYuv->getCbAddr(absPartIdxC);
> -                    X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> -                    primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, resiYuv->m_cwidth, 0);
> +                    primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);
>                  }
>                  if (absSumV)
>                  {
> @@ -2976,13 +2970,12 @@
>
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset, trWidthC, scalingListType, false, lastPosV);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset, trSizeC, scalingListType, false, lastPosV);
>                  }
>                  else
>                  {
>                      int16_t *ptr = resiYuv->getCrAddr(absPartIdxC);
> -                    X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> -                    primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, resiYuv->m_cwidth, 0);
> +                    primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);
>                  }
>                  cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3040,8 +3033,7 @@
>  {
>      X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "depth not matching\n");
>      const uint32_t trMode = depth - cu->getDepth(0);
> -    const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> -    uint32_t  trSizeCLog2     = trSizeLog2 - m_hChromaShift;
> +    const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      const uint32_t subTUDepth = trMode + 1;
>      const uint32_t setCbf     = 1 << trMode;
>      int chFmt                 = cu->getChromaFormat();
> @@ -3055,6 +3047,7 @@
>      const bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>      X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
> +    uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
>      bool bCodeChroma = true;
>      uint32_t trModeC = trMode;
>      if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -3062,7 +3055,7 @@
>          trSizeCLog2++;
>          trModeC--;
>          uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> -        bCodeChroma = ((absPartIdx % qpdiv) == 0);
> +        bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
>      }
>
>      // code full block
> @@ -3080,22 +3073,20 @@
>      uint32_t bestsubTUCBF[MAX_NUM_COMPONENT][2];
>      m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
>
> -    int trWidth = 0, trHeight = 0, trWidthC = 0, trHeightC = 0;
> -
> -    trWidth = trHeight  = 1 << trSizeLog2;
> -    trWidthC            = 1 << trSizeCLog2;
> -    trHeightC           = (chFmt == CHROMA_422) ? (trWidthC << 1) : trWidthC;
> +    uint32_t trSize = 1 << trSizeLog2;
>      const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>      uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) +  trModeC) << 1);
>
>      // code full block
>      if (bCheckFull)
>      {
> -        const uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> +        uint32_t trSizeC = 1 << trSizeCLog2;
>          const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -        coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx);
> -        coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -        coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> +        uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> +        coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> +        coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> +        coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
>
>          cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
>          bool checkTransformSkip   = cu->getSlice()->getPPS()->getUseTransformSkip() && !cu->getCUTransquantBypass(0);
> @@ -3106,21 +3097,21 @@
>
>          if (m_cfg->bEnableRDOQ && curuseRDOQ)
>          {
> -            m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trWidth, TEXT_LUMA);
> +            m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSize, TEXT_LUMA);
>          }
>
>          m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
>          m_trQuant->selectLambda(TEXT_LUMA);
>
>          absSum[TEXT_LUMA][0] = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> -                                                       trWidth, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
> +                                                       trSize, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
>
>          cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
>          m_entropyCoder->resetBits();
> -        m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> +        m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
>          if (absSum[TEXT_LUMA][0])
> -            m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx,  trWidth, TEXT_LUMA);
> +            m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx,  trSize, TEXT_LUMA);
>          singleBitsComp[TEXT_LUMA][0] = m_entropyCoder->getNumberOfWrittenBits();
>
>          uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
> @@ -3130,20 +3121,17 @@
>              TComTURecurse tuIterator;
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> -            uint32_t widthC  = trWidthC;
> -            uint32_t heightC = splitIntoSubTUs ? (trHeightC >> 1) : trHeightC;
> -
>              do
>              {
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> -                uint32_t subTUBufferOffset    = widthC * heightC * tuIterator.m_section;
> +                uint32_t subTUBufferOffset    = trSizeC * trSizeC * tuIterator.m_section;
>
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                  if (m_cfg->bEnableRDOQ && curuseRDOQ)
>                  {
> -                    m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, widthC, TEXT_CHROMA);
> +                    m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSizeC, TEXT_CHROMA);
>                  }
>                  //Cb transform
>                  int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3152,24 +3140,24 @@
>                  m_trQuant->selectLambda(TEXT_CHROMA);
>
>                  absSum[TEXT_CHROMA_U][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> -                                                                                      widthC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
> +                                                                                      trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
>                  //Cr transform
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>                  absSum[TEXT_CHROMA_V][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> -                                                                                      widthC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
> +                                                                                      trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
>
>                  cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> -                m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_U, trMode, true);
> +                m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_U, trMode, true);
>                  if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_U);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_U);
>                  singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsPrev;
>
> -                m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_V, trMode, true);
> +                m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_V, trMode, true);
>                  if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_V);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_V);
>                  uint32_t newBits = m_entropyCoder->getNumberOfWrittenBits();
>                  singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = newBits - (singleBitsPrev + singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
>
> @@ -3187,8 +3175,8 @@
>              minCost[TEXT_CHROMA_V][subTUIndex] = MAX_INT64;
>          }
>
> -        int partSize = partitionFromSizes(trWidth, trHeight);
> -        uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trWidth);
> +        int partSize = partitionFromSize(trSize);
> +        uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);
>
>          if (outZeroDist)
>          {
> @@ -3203,7 +3191,7 @@
>              int scalingListType = 3 + TEXT_LUMA;
>              X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>              X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
> -            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE,  coeffCurY, trWidth, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
> +            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE,  coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
>
>              const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx), MAX_CU_SIZE);
>              if (cu->isLosslessCoded(0))
> @@ -3250,8 +3238,8 @@
>          {
>              int16_t *ptr =  m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
>              X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
> -            X265_CHECK(trWidth == trHeight, "not square block\n");
> -            primitives.blockfill_s[(int)g_convertToBit[trWidth]](ptr, MAX_CU_SIZE, 0);
> +            int sizeIdx = trSizeLog2 - 2;
> +            primitives.blockfill_s[sizeIdx](ptr, MAX_CU_SIZE, 0);
>          }
>          cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -3262,18 +3250,15 @@
>              TComTURecurse tuIterator;
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> -            uint32_t widthC  = trWidthC;
> -            uint32_t heightC = splitIntoSubTUs ? (trHeightC >> 1) : trHeightC;
> -
> -            int partSizeC = partitionFromSizes(widthC, heightC);
> -            const uint32_t numSamplesChroma = widthC * heightC;
> +            int partSizeC = partitionFromSize(trSizeC);
> +            const uint32_t numSamplesChroma = trSizeC * trSizeC;
>
>              do
>              {
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> -                uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> -
> -                distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, widthC));
> +                uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
> +
> +                distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
>
>                  if (outZeroDist)
>                  {
> @@ -3289,7 +3274,7 @@
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>                      m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,
> -                                               widthC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
> +                                               trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
>                                                                   m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3339,11 +3324,11 @@
>                  {
>                      int16_t *ptr = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
>                      const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
> -                    X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> -                    primitives.blockfill_s[(int)g_convertToBit[widthC]](ptr, stride, 0);
> +                    int sizeIdxC = trSizeCLog2 - 2;
> +                    primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
>                  }
>
> -                distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, widthC));
> +                distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
>                  if (outZeroDist)
>                  {
>                      *outZeroDist += distV;
> @@ -3357,7 +3342,7 @@
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>                      m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,
> -                                               widthC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
> +                                               trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
>                                                                   m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3407,8 +3392,8 @@
>                  {
>                      int16_t *ptr =  m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
>                      const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
> -                    X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> -                    primitives.blockfill_s[(int)g_convertToBit[widthC]](ptr, stride, 0);
> +                    int sizeIdxC = trSizeCLog2 - 2;
> +                    primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
>                  }
>
>                  cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3430,9 +3415,9 @@
>              memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numSamplesLuma);
>
>              int16_t bestResiY[32 * 32];
> -            for (int i = 0; i < trHeight; ++i)
> +            for (int i = 0; i < trSize; ++i)
>              {
> -                memcpy(bestResiY + i * trWidth, curResiY + i * MAX_CU_SIZE, sizeof(int16_t) * trWidth);
> +                memcpy(bestResiY + i * trSize, curResiY + i * MAX_CU_SIZE, sizeof(int16_t) * trSize);
>              }
>
>              m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
> @@ -3441,21 +3426,21 @@
>
>              if (m_cfg->bEnableRDOQTS)
>              {
> -                m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trWidth, TEXT_LUMA);
> +                m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSize, TEXT_LUMA);
>              }
>
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
>
>              m_trQuant->selectLambda(TEXT_LUMA);
>              absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> -                                                           trWidth, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
> +                                                           trSize, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
>              cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
>              if (absSumTransformSkipY)
>              {
>                  m_entropyCoder->resetBits();
> -                m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> -                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> +                m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
> +                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
>                  const uint32_t skipSingleBitsY = m_entropyCoder->getNumberOfWrittenBits();
>
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> @@ -3464,7 +3449,7 @@
>                  X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>                  X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
>
> -                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE,  coeffCurY, trWidth, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
> +                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE,  coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
>
>                  nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
>                                                             m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx),
> @@ -3477,9 +3462,9 @@
>              {
>                  cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
>                  memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numSamplesLuma);
> -                for (int i = 0; i < trHeight; ++i)
> +                for (int i = 0; i < trSize; ++i)
>                  {
> -                    memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i * trWidth], sizeof(int16_t) * trWidth);
> +                    memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i * trSize], sizeof(int16_t) * trSize);
>                  }
>              }
>              else
> @@ -3503,16 +3488,13 @@
>              TComTURecurse tuIterator;
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> -            uint32_t widthC  = trWidthC;
> -            uint32_t heightC = splitIntoSubTUs ? (trHeightC >> 1) : trHeightC;
> -
> -            int partSizeC = partitionFromSizes(widthC, heightC);
> -            const uint32_t numSamplesChroma = widthC * heightC;
> +            int partSizeC = partitionFromSize(trSizeC);
> +            const uint32_t numSamplesChroma = trSizeC * trSizeC;
>
>              do
>              {
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> -                uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> +                uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
>                  int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
>                  int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> @@ -3523,10 +3505,10 @@
>                  memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numSamplesChroma);
>
>                  int16_t bestResiU[32 * 32], bestResiV[32 * 32];
> -                for (int i = 0; i < heightC; ++i)
> +                for (int i = 0; i < trSizeC; ++i)
>                  {
> -                    memcpy(&bestResiU[i * widthC], curResiU + i * stride, sizeof(int16_t) * widthC);
> -                    memcpy(&bestResiV[i * widthC], curResiV + i * stride, sizeof(int16_t) * widthC);
> +                    memcpy(&bestResiU[i * trSizeC], curResiU + i * stride, sizeof(int16_t) * trSizeC);
> +                    memcpy(&bestResiV[i * trSizeC], curResiV + i * stride, sizeof(int16_t) * trSizeC);
>                  }
>
>                  cu->setTransformSkipPartRange(1, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3534,7 +3516,7 @@
>
>                  if (m_cfg->bEnableRDOQTS)
>                  {
> -                    m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, widthC, TEXT_CHROMA);
> +                    m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSizeC, TEXT_CHROMA);
>                  }
>
>                  int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3542,11 +3524,11 @@
>                  m_trQuant->selectLambda(TEXT_CHROMA);
>
>                  absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> -                                                               widthC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
> +                                                               trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>                  absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> -                                                               widthC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
> +                                                               trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
>
>                  cu->setCbfPartRange(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setCbfPartRange(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3556,8 +3538,8 @@
>
>                  if (absSumTransformSkipU)
>                  {
> -                    m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_U, trMode, true);
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_U);
> +                    m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_U, trMode, true);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_U);
>                      singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits();
>
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3566,7 +3548,7 @@
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>                      m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,
> -                                               widthC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
> +                                               trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
>                                                                   m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3579,9 +3561,9 @@
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                      memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numSamplesChroma);
> -                    for (int i = 0; i < heightC; ++i)
> +                    for (int i = 0; i < trSizeC; ++i)
>                      {
> -                        memcpy(curResiU + i * stride, &bestResiU[i * widthC], sizeof(int16_t) * widthC);
> +                        memcpy(curResiU + i * stride, &bestResiU[i * trSizeC], sizeof(int16_t) * trSizeC);
>                      }
>                  }
>                  else
> @@ -3593,8 +3575,8 @@
>
>                  if (absSumTransformSkipV)
>                  {
> -                    m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_V, trMode, true);
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_V);
> +                    m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_V, trMode, true);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_V);
>                      singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section];
>
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> @@ -3603,7 +3585,7 @@
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
>                      m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,
> -                                               widthC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
> +                                               trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
>                                                                   m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3616,9 +3598,9 @@
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                      memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numSamplesChroma);
> -                    for (int i = 0; i < heightC; ++i)
> +                    for (int i = 0; i < trSizeC; ++i)
>                      {
> -                        memcpy(curResiV + i * stride, &bestResiV[i * widthC], sizeof(int16_t) * widthC);
> +                        memcpy(curResiV + i * stride, &bestResiV[i * trSizeC], sizeof(int16_t) * trSizeC);
>                      }
>                  }
>                  else
> @@ -3651,36 +3633,37 @@
>                  offsetSubTUCBFs(cu, TEXT_CHROMA_V, trMode, absPartIdx);
>              }
>
> -            m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, trMode, true);
> -            m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, trMode, true);
> +            uint32_t trHeightC = (chFmt == CHROMA_422) ? (trSizeC << 1) : trSizeC;
> +            m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trSizeC, trHeightC, TEXT_CHROMA_U, trMode, true);
> +            m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trSizeC, trHeightC, TEXT_CHROMA_V, trMode, true);
>          }
>
> -        m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA,     trMode, true);
> +        m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA,     trMode, true);
>          if (absSum[TEXT_LUMA][0])
> -            m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> +            m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
>
>          if (bCodeChroma)
>          {
>              if (!splitIntoSubTUs)
>              {
>                  if (absSum[TEXT_CHROMA_U][0])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trWidthC, TEXT_CHROMA_U);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trSizeC, TEXT_CHROMA_U);
>                  if (absSum[TEXT_CHROMA_V][0])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trWidthC, TEXT_CHROMA_V);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trSizeC, TEXT_CHROMA_V);
>              }
>              else
>              {
> -                uint32_t subTUSize = trWidthC * trWidthC;
> +                uint32_t subTUSize = trSizeC * trSizeC;
>                  uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
>
>                  if (absSum[TEXT_CHROMA_U][0])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trWidthC, TEXT_CHROMA_U);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trSizeC, TEXT_CHROMA_U);
>                  if (absSum[TEXT_CHROMA_U][1])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, trWidthC, TEXT_CHROMA_U);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, trSizeC, TEXT_CHROMA_U);
>                  if (absSum[TEXT_CHROMA_V][0])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trWidthC, TEXT_CHROMA_V);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trSizeC, TEXT_CHROMA_V);
>                  if (absSum[TEXT_CHROMA_V][1])
> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, trWidthC, TEXT_CHROMA_V);
> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, trSizeC, TEXT_CHROMA_V);
>              }
>          }
>
> @@ -3842,7 +3825,7 @@
>      const uint32_t curTrMode   = depth - cu->getDepth(0);
>      const uint32_t trMode      = cu->getTransformIdx(absPartIdx);
>      const bool     bSubdiv     = curTrMode != trMode;
> -    const uint32_t trSizeLog2  = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> +    const uint32_t trSizeLog2  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>      uint32_t       trSizeCLog2 = trSizeLog2 - m_hChromaShift;
>      int            chFmt       = cu->getChromaFormat();
>      const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> @@ -3855,8 +3838,7 @@
>      X265_CHECK(cu->getPredictionMode(absPartIdx) != MODE_INTRA, "xEncodeResidualQT() with intra block\n");
>
>      bool mCodeAll = true;
> -    uint32_t trWidth   = 1 << trSizeLog2;
> -    uint32_t trHeight  = trWidth;
> +    uint32_t trSize    = 1 << trSizeLog2;
>      uint32_t trWidthC  = 1 << trSizeCLog2;
>      uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
>
> @@ -3891,13 +3873,11 @@
>      if (!bSubdiv)
>      {
>          //Luma
> -        const uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
>          const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> -        coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] +  numCoeffPerAbsPartIdxIncrement * absPartIdx;
> +        uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +        coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
>
>          //Chroma
> -        coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -        coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
>          bool bCodeChroma = true;
>          uint32_t trModeC = trMode;
>          if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -3905,21 +3885,24 @@
>              trSizeCLog2++;
>              trModeC--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> -            bCodeChroma = ((absPartIdx % qpdiv) == 0);
> +            bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
>          }
>
>          if (bSubdivAndCbf)
>          {
> -            m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> +            m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
>          }
>          else
>          {
>              if (ttype == TEXT_LUMA && cu->getCbf(absPartIdx, TEXT_LUMA, trMode))
>              {
> -                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> +                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
>              }
>              if (bCodeChroma)
>              {
> +                uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> +                coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> +                coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
>                  uint32_t trSizeC = 1 << trSizeCLog2;
>
>                  if (!splitIntoSubTUs)
> @@ -3977,10 +3960,10 @@
>      if (curTrMode == trMode)
>      {
>          int            chFmt      = cu->getChromaFormat();
> -        const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> -        uint32_t  trSizeCLog2     = trSizeLog2 - m_hChromaShift;
> +        const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
>          const uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
> +        uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
>          bool bCodeChroma = true;
>          bool bChromaSame = false;
>          uint32_t trModeC = trMode;
> @@ -3989,15 +3972,14 @@
>              trSizeCLog2++;
>              trModeC--;
>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
> -            bCodeChroma = ((absPartIdx % qpdiv) == 0);
> +            bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
>              bChromaSame = true;
>          }
>
>          if (bSpatial)
>          {
> -            uint32_t trWidth  = 1 << trSizeLog2;
> -            uint32_t trHeight = 1 << trSizeLog2;
> -            m_qtTempShortYuv[qtlayer].copyPartToPartLuma(resiYuv, absPartIdx, trWidth, trHeight);
> +            uint32_t trSize = 1 << trSizeLog2;
> +            m_qtTempShortYuv[qtlayer].copyPartToPartLuma(resiYuv, absPartIdx, trSize, trSize);
>
>              if (bCodeChroma)
>              {
> @@ -4006,21 +3988,20 @@
>          }
>          else
>          {
> -            uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -            uint32_t numCoeffY = (1 << (trSizeLog2 << 1));
> -            coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] +  numCoeffPerAbsPartIdxIncrement * absPartIdx;
> -            coeff_t* coeffDstY = cu->getCoeffY() + numCoeffPerAbsPartIdxIncrement * absPartIdx;
> +            uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> +            uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> +            coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> +            coeff_t* coeffDstY = cu->getCoeffY() + coeffOffsetY;
>              ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>              if (bCodeChroma)
>              {
> -                uint32_t trWidthC  = 1 << trSizeCLog2;
> -                uint32_t trHeightC = (chFmt == CHROMA_422) ? (trWidthC << 1) : trWidthC;
> -                uint32_t numCoeffC = trWidthC * trHeightC;
> -
> -                coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -                coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -                coeff_t* coeffDstU = cu->getCoeffCb() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -                coeff_t* coeffDstV = cu->getCoeffCr() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> +                uint32_t numCoeffC = 1 << (trSizeCLog2 * 2 + (chFmt == CHROMA_422));
> +                uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> +
> +                coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> +                coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> +                coeff_t* coeffDstU = cu->getCoeffCb() + coeffOffsetC;
> +                coeff_t* coeffDstV = cu->getCoeffCr() + coeffOffsetC;
>                  ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
>                  ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
>              }
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncSearch.h
> --- a/source/Lib/TLibEncoder/TEncSearch.h       Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSearch.h       Fri May 23 13:34:51 2014 +0900
> @@ -229,8 +229,8 @@
>
>      void xStoreIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx);
>      void xLoadIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx);
> -    void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs);
> -    void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs);
> +    void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
> +    void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
>
>      // --------------------------------------------------------------------------------------------
>      // Inter search (AMP)
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/pixel.cpp   Fri May 23 13:34:51 2014 +0900
> @@ -1151,11 +1151,11 @@
>      p.calcrecon[BLOCK_32x32] = calcRecons<32>;
>      p.calcrecon[BLOCK_64x64] = NULL;
>
> -    p.transpose[0] = transpose<4>;
> -    p.transpose[1] = transpose<8>;
> -    p.transpose[2] = transpose<16>;
> -    p.transpose[3] = transpose<32>;
> -    p.transpose[4] = transpose<64>;
> +    p.transpose[BLOCK_4x4] = transpose<4>;
> +    p.transpose[BLOCK_8x8] = transpose<8>;
> +    p.transpose[BLOCK_16x16] = transpose<16>;
> +    p.transpose[BLOCK_32x32] = transpose<32>;
> +    p.transpose[BLOCK_64x64] = transpose<64>;
>
>      p.weight_pp = weight_pp_c;
>      p.weight_sp = weight_sp_c;
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/primitives.cpp
> --- a/source/common/primitives.cpp      Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/primitives.cpp      Fri May 23 13:34:51 2014 +0900
> @@ -29,7 +29,7 @@
>  namespace x265 {
>  // x265 private namespace
>
> -uint8_t lumaPartitioneMapTable[] =
> +extern const uint8_t lumaPartitionMapTable[] =
>  {
>  //  4          8          12          16          20  24          28  32          36  40  44  48          52  56  60  64
>      LUMA_4x4,  LUMA_4x8,  255,        LUMA_4x16,  255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 4
> @@ -50,6 +50,11 @@
>      255,        255,      255,        LUMA_64x16, 255, 255,        255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64  // 64
>  };
>
> +extern const uint8_t lumaSquarePartitionMapTable[] =
> +{
> +    LUMA_4x4,  LUMA_8x8,  255,        LUMA_16x16, 255, 255,        255, LUMA_32x32, 255, 255, 255, 255,        255, 255, 255, LUMA_64x64
> +};
> +
>  /* the "authoritative" set of encoder primitives */
>  EncoderPrimitives primitives;
>
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/primitives.h
> --- a/source/common/primitives.h        Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/primitives.h        Fri May 23 13:34:51 2014 +0900
> @@ -103,14 +103,23 @@
>  inline int partitionFromSizes(int width, int height)
>  {
>      X265_CHECK(((width | height) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n");
> -    extern uint8_t lumaPartitioneMapTable[];
> +    extern const uint8_t lumaPartitionMapTable[];
>      int w = (width >> 2) - 1;
>      int h = (height >> 2) - 1;
> -    int part = (int)lumaPartitioneMapTable[(w << 4) + h];
> +    int part = (int)lumaPartitionMapTable[(w << 4) + h];
>      X265_CHECK(part != 255, "Invalid block width %d height %d\n", width, height);
>      return part;
>  }
>
> +inline int partitionFromSize(int size)
> +{
> +    X265_CHECK((size & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block size\n");
> +    extern const uint8_t lumaSquarePartitionMapTable[];
> +    int part = (int)lumaSquarePartitionMapTable[(size >> 2) - 1];
> +    X265_CHECK(part != 255, "Invalid block size %d\n", size);
> +    return part;
> +}
> +
>  typedef int  (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
>  typedef int  (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
>  typedef int  (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/shortyuv.cpp
> --- a/source/common/shortyuv.cpp        Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/shortyuv.cpp        Fri May 23 13:34:51 2014 +0900
> @@ -84,7 +84,7 @@
>
>  void ShortYuv::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t partSize)
>  {
> -    int part = partitionFromSizes(partSize, partSize);
> +    int part = partitionFromSize(partSize);
>
>      pixel* srcY0 = srcYuv0->getLumaAddr();
>      pixel* srcY1 = srcYuv1->getLumaAddr();
> @@ -136,7 +136,7 @@
>
>  void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
>  {
> -    int part = partitionFromSizes(lumaSize, lumaSize);
> +    int part = partitionFromSize(lumaSize);
>
>      part = ((part == 0) && (m_csp == CHROMA_422)) ? 1 : part;
>      int16_t* srcU = getCbAddr(partIdx);
> @@ -158,7 +158,7 @@
>
>  void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
>  {
> -    int part = partitionFromSizes(lumaSize, lumaSize);
> +    int part = partitionFromSize(lumaSize);
>      int16_t* srcU = getCbAddr(partIdx);
>      int16_t* srcV = getCrAddr(partIdx);
>      pixel* dstU = dstPicYuv->getCbAddr(partIdx);
> @@ -181,7 +181,7 @@
>
>  void ShortYuv::copyPartToPartShortChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId)
>  {
> -    int part = partitionFromSizes(lumaSize, lumaSize);
> +    int part = partitionFromSize(lumaSize);
>
>      if (chromaId == 0)
>      {
> @@ -214,7 +214,9 @@
>
>  void ShortYuv::copyPartToPartYuvChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs)
>  {
> -    int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSizes(lumaSize, lumaSize);
> +    assert(chromaId == 1 || chromaId == 2);
> +
> +    int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSize(lumaSize);
>
>      if (chromaId == 1)
>      {
> @@ -224,7 +226,7 @@
>          uint32_t dstStride = dstPicYuv->getCStride();
>          primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
>      }
> -    else if (chromaId == 2)
> +    else
>      {
>          int16_t* srcV = getCrAddr(partIdx);
>          pixel* dstV = dstPicYuv->getCrAddr(partIdx);
> @@ -232,16 +234,4 @@
>          uint32_t dstStride = dstPicYuv->getCStride();
>          primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
>      }
> -    else
> -    {
> -        int16_t* srcU = getCbAddr(partIdx);
> -        int16_t* srcV = getCrAddr(partIdx);
> -        pixel* dstU = dstPicYuv->getCbAddr(partIdx);
> -        pixel* dstV = dstPicYuv->getCrAddr(partIdx);
> -
> -        uint32_t srcStride = m_cwidth;
> -        uint32_t dstStride = dstPicYuv->getCStride();
> -        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
> -        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
> -    }
>  }
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/vec/blockcopy-sse3.cpp
> --- a/source/common/vec/blockcopy-sse3.cpp      Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/vec/blockcopy-sse3.cpp      Fri May 23 13:34:51 2014 +0900
> @@ -30,7 +30,7 @@
>  #if HIGH_BIT_DEPTH
>  void blockcopy_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride)
>  {
> -    if ((bx & 7) || (((size_t)dst | (size_t)src | sstride | dstride) & 15))
> +    if ((bx & 7) || (((intptr_t)dst | (intptr_t)src | sstride | dstride) & 15))
>      {
>          // slow path, irregular memory alignments or sizes
>          for (int y = 0; y < by; y++)
> @@ -60,7 +60,7 @@
>  #else // if HIGH_BIT_DEPTH
>  void blockcopy_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride)
>  {
> -    size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;
> +    intptr_t aligncheck = (intptr_t)dst | (intptr_t)src | bx | sstride | dstride;
>
>      if (!(aligncheck & 15))
>      {
> @@ -91,7 +91,7 @@
>
>  void blockcopy_ps(int bx, int by, pixel *dst, intptr_t dstride, int16_t *src, intptr_t sstride)
>  {
> -    size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;
> +    intptr_t aligncheck = (intptr_t)dst | (intptr_t)src | bx | sstride | dstride;
>
>      if (!(aligncheck & 15))
>      {
> @@ -134,7 +134,7 @@
>
>  void pixeladd_ss(int bx, int by, int16_t *dst, intptr_t dstride, int16_t *src0, int16_t *src1, intptr_t sstride0, intptr_t sstride1)
>  {
> -    size_t aligncheck = (size_t)dst | (size_t)src0 | sstride0 | sstride1 | dstride;
> +    intptr_t aligncheck = (intptr_t)dst | (intptr_t)src0 | sstride0 | sstride1 | dstride;
>
>      if (!(aligncheck & 15) && !(bx & 7))
>      {
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/encoder/compress.cpp
> --- a/source/encoder/compress.cpp       Thu May 22 21:46:21 2014 -0500
> +++ b/source/encoder/compress.cpp       Fri May 23 13:34:51 2014 +0900
> @@ -138,8 +138,8 @@
>          leftFiltered  = leftScale;
>      }
>
> -    int log2SizeMinus2 = g_convertToBit[scaleTuSize];
> -    pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> +    int sizeIdx = g_convertToBit[scaleTuSize];
> +    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>
>      uint32_t preds[3];
>      cu->getIntraDirLumaPredictor(partOffset, preds);
> @@ -148,7 +148,7 @@
>      uint32_t rbits = m_search->xModeBitsRemIntra(cu, partOffset, depth, preds, mpms);
>
>      // DC
> -    primitives.intra_pred[log2SizeMinus2][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> +    primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
>      bsad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
>      bmode = mode = DC_IDX;
>      bbits = !(mpms & ((uint64_t)1 << mode)) ? rbits : m_search->xModeBitsIntra(cu, mode, partOffset, depth);
> @@ -164,7 +164,7 @@
>      }
>
>      // PLANAR
> -    primitives.intra_pred[log2SizeMinus2][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
> +    primitives.intra_pred[sizeIdx][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
>      sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
>      mode = PLANAR_IDX;
>      bits = !(mpms & ((uint64_t)1 << mode)) ? rbits : m_search->xModeBitsIntra(cu, mode, partOffset, depth);
> @@ -172,9 +172,9 @@
>      COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
>
>      // Transpose NxN
> -    primitives.transpose[log2SizeMinus2](buf_trans, fenc, scaleStride);
> +    primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
>
> -    primitives.intra_pred_allangs[log2SizeMinus2](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
> +    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
>
>      for (mode = 2; mode < 35; mode++)
>      {
> @@ -211,9 +211,9 @@
>      outTempCU->m_totalBits = 0;
>      if (m_search->predInterSearch(outTempCU, outPredYuv, bUseMRG, false))
>      {
> -        int part = g_convertToBit[outTempCU->getCUSize(0)];
> -        uint32_t distortion = primitives.sa8d[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> -                                                    outPredYuv->getLumaAddr(), outPredYuv->getStride());
> +        int sizeIdx = g_convertToBit[outTempCU->getCUSize(0)];
> +        uint32_t distortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> +                                                       outPredYuv->getLumaAddr(), outPredYuv->getStride());
>          outTempCU->m_totalDistortion = distortion;
>          outTempCU->m_totalCost = m_rdCost->calcRdSADCost(distortion, outTempCU->m_totalBits);
>      }
> @@ -243,7 +243,7 @@
>      outBestCU->setPredModeSubParts(MODE_INTER, 0, depth);
>      outBestCU->setMergeFlag(0, true);
>
> -    int part = g_convertToBit[outTempCU->getCUSize(0)];
> +    int sizeIdx = g_convertToBit[outTempCU->getCUSize(0)];
>      int bestMergeCand = -1;
>
>      for (uint32_t mergeCand = 0; mergeCand < maxNumMergeCand; ++mergeCand)
> @@ -262,8 +262,8 @@
>              m_search->motionCompensation(outTempCU, m_tmpPredYuv[depth], REF_PIC_LIST_X, 0, true, false);
>              uint32_t bitsCand = getTUBits(mergeCand, maxNumMergeCand);
>              outTempCU->m_totalBits = bitsCand;
> -            outTempCU->m_totalDistortion = primitives.sa8d[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> -                                                                 m_tmpPredYuv[depth]->getLumaAddr(), m_tmpPredYuv[depth]->getStride());
> +            outTempCU->m_totalDistortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> +                                                                    m_tmpPredYuv[depth]->getLumaAddr(), m_tmpPredYuv[depth]->getStride());
>              outTempCU->m_totalCost = m_rdCost->calcRdSADCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
>
>              if (outTempCU->m_totalCost < outBestCU->m_totalCost)
> @@ -866,7 +866,7 @@
>              uint32_t src2stride = m_bestPredYuv[0]->getStride();
>              uint32_t src1stride = m_origYuv[0]->getStride();
>              uint32_t dststride = m_tmpResiYuv[depth]->m_width;
> -            int part = partitionFromSizes(cu->getCUSize(0), cu->getCUSize(0));
> +            int part = partitionFromSize(cu->getCUSize(0));
>              primitives.luma_sub_ps[part](dst, dststride, src1, src2, src1stride, src2stride);
>
>              src2 = m_bestPredYuv[0]->getCbAddr(absPartIdx);
> @@ -925,7 +925,7 @@
>
>          //Generate Recon
>          TComPicYuv* rec = pic->getPicYuvRec();
> -        int part = partitionFromSizes(cu->getCUSize(0), cu->getCUSize(0));
> +        int part = partitionFromSize(cu->getCUSize(0));
>          pixel* src = m_bestPredYuv[0]->getLumaAddr(absPartIdx);
>          pixel* dst = rec->getLumaAddr(cu->getAddr(), absPartIdx);
>          uint32_t srcstride = m_bestPredYuv[0]->getStride();
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp      Thu May 22 21:46:21 2014 -0500
> +++ b/source/encoder/slicetype.cpp      Fri May 23 13:34:51 2014 +0900
> @@ -1604,7 +1604,7 @@
>      }
>      if (!fenc->bIntraCalculated)
>      {
> -        int nLog2SizeMinus2 = g_convertToBit[cuSize]; // partition size
> +        int sizeIdx = g_convertToBit[cuSize]; // partition size
>
>          pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
>          pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
> @@ -1643,16 +1643,16 @@
>          int predsize = cuSize * cuSize;
>
>          // generate 35 intra predictions into tmp
> -        primitives.intra_pred[nLog2SizeMinus2][DC_IDX](predictions, cuSize, left0, above0, 0, (cuSize <= 16));
> +        primitives.intra_pred[sizeIdx][DC_IDX](predictions, cuSize, left0, above0, 0, (cuSize <= 16));
>          pixel *above = (cuSize >= 8) ? above1 : above0;
>          pixel *left  = (cuSize >= 8) ? left1 : left0;
> -        primitives.intra_pred[nLog2SizeMinus2][PLANAR_IDX](predictions + predsize, cuSize, left, above, 0, 0);
> -        primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
> +        primitives.intra_pred[sizeIdx][PLANAR_IDX](predictions + predsize, cuSize, left, above, 0, 0);
> +        primitives.intra_pred_allangs[sizeIdx](predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
>
>          // calculate 35 satd costs, keep least cost
>          ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
> -        primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, FENC_STRIDE);
> -        pixelcmp_t satd = primitives.satd[partitionFromSizes(cuSize, cuSize)];
> +        primitives.transpose[sizeIdx](buf_trans, me.fenc, FENC_STRIDE);
> +        pixelcmp_t satd = primitives.satd[partitionFromSize(cuSize)];
>          int icost = me.COST_MAX, cost;
>          for (uint32_t mode = 0; mode < 35; mode++)
>          {
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho