[x265] [PATCH] Modify TEncSearch structure to support multiple color space formats

Wed Jan 8 04:41:34 CET 2014

The range-ext code has a whole lot of problems (whitespace and otherwise),
but functionally we're getting there. I'd like to push in these patches to
the default tip anyway, and let Ashok work on correcting the code and
polishing it up.

With such a large series, I'd like to avoid painful merges and related bug
fixes as much as possible.

I was waiting for him to fix an output mismatch for 4:2:0, which he now
has.

On Wed, Jan 8, 2014 at 5:38 AM, Steve Borho <steve at borho.org> wrote:

>
>
>
> On Tue, Jan 7, 2014 at 5:16 AM, <ashok at multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User ashok at multicorewareinc.com
>> # Date 1389093279 -19800
>> #      Tue Jan 07 16:44:39 2014 +0530
>> # Node ID f7d21da102acf8d88be3f6ea6b6db5dc12134cdb
>> # Parent  4811da38078cd02434f7da1dcc1b0af4dcf5adb8
>> Modify TEncSearch structure to support multiple color space formats
>>
>
> Some parts of this patch look redundant with some earlier ones.
>
> It's an impressive series, ignoring the white-space and style problems.
>
> Configuring the 4:4:4 chroma primitives needs to happen
> in x265_setup_primitives(), in the same place it configures other function
> pointer copies.  This way you get ASM optimized functions if they were
> configured.
>
>
>>
>> diff -r 4811da38078c -r f7d21da102ac source/Lib/TLibCommon/CommonDef.h
>> --- a/source/Lib/TLibCommon/CommonDef.h Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/Lib/TLibCommon/CommonDef.h Tue Jan 07 16:44:39 2014 +0530
>> @@ -88,6 +88,9 @@
>>  #define MLS_GRP_NUM                 64 ///< G644 : Max number of
>> coefficient groups, max(16, 64)
>>  #define MLS_CG_SIZE                 4 ///< G644 : Coefficient group size
>> of 4x4
>>
>> +#define MLS_CG_LOG2_WIDTH           2
>> +#define MLS_CG_LOG2_HEIGHT          2
>> +
>>  #define ARL_C_PRECISION             7 ///< G382: 7-bit arithmetic
>> precision
>>  #define LEVEL_RANGE                 30 ///< G382: max coefficient level
>> in statistics collection
>>
>> diff -r 4811da38078c -r f7d21da102ac source/Lib/TLibEncoder/TEncSearch.cpp
>> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jan 06 23:15:58 2014
>> -0600
>> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Tue Jan 07 16:44:39 2014
>> +0530
>> @@ -229,7 +229,7 @@
>>
>>      if (bChroma)
>>      {
>> -        if (trSizeLog2 > 2)
>> +        if ((trSizeLog2 > 2) && !(cu->getChromaFormat() == CHROMA_444))
>>          {
>>              if (trDepth == 0 || cu->getCbf(absPartIdx, TEXT_CHROMA_U,
>> trDepth - 1))
>>                  m_entropyCoder->encodeQtCbf(cu, absPartIdx,
>> TEXT_CHROMA_U, trDepth);
>> @@ -275,7 +275,7 @@
>>          return;
>>      }
>>
>> -    if (ttype != TEXT_LUMA && trSizeLog2 == 2)
>> +    if ( (ttype != TEXT_LUMA) && (trSizeLog2 == 2) &&
>> !(cu->getChromaFormat() == CHROMA_444))
>>      {
>>          assert(trDepth > 0);
>>          trDepth--;
>> @@ -288,9 +288,11 @@
>>      }
>>
>>      //===== coefficients =====
>> -    uint32_t width = cu->getWidth(0) >> (trDepth + chroma);
>> -    uint32_t height = cu->getHeight(0) >> (trDepth + chroma);
>> -    uint32_t coeffOffset = (cu->getPic()->getMinCUWidth() *
>> cu->getPic()->getMinCUHeight() * absPartIdx) >> (chroma << 1);
>> +    int cspx = chroma ? m_hChromaShift : 0;
>> +    int cspy = chroma ? m_vChromaShift : 0;
>> +    uint32_t width = cu->getWidth(0) >> (trDepth + cspx);
>> +    uint32_t height = cu->getHeight(0) >> (trDepth + cspy);
>> +    uint32_t coeffOffset = (cu->getPic()->getMinCUWidth() >> cspx) *
>> (cu->getPic()->getMinCUHeight() >> cspy) * absPartIdx;
>>      uint32_t qtLayer =
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>>      TCoeff* coeff = 0;
>>      switch (ttype)
>> @@ -363,12 +365,23 @@
>>              }
>>          }
>>      }
>> +
>>      if (bChroma)
>>      {
>>          // chroma prediction mode
>> -        if (absPartIdx == 0)
>> +        if ((cu->getPartitionSize(0) == SIZE_2Nx2N) ||
>> !(cu->getChromaFormat() == CHROMA_444))
>>          {
>> -            m_entropyCoder->encodeIntraDirModeChroma(cu, 0, true);
>> +            if (absPartIdx == 0)
>> +            {
>> +                m_entropyCoder->encodeIntraDirModeChroma(cu, absPartIdx,
>> true);
>> +            }
>> +        }
>> +        else
>> +        {
>> +            uint32_t qtNumParts = cu->getTotalNumPart() >> 2;
>> +            assert(trDepth > 0);
>> +            if ((absPartIdx%qtNumParts) == 0)
>> +                m_entropyCoder->encodeIntraDirModeChroma(cu, absPartIdx,
>> true);
>>          }
>>      }
>>  }
>> @@ -475,7 +488,7 @@
>>      int lastPos = -1;
>>      cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
>>
>> -    m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +    m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>      m_trQuant->selectLambda(TEXT_LUMA);
>>
>>      absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width,
>> height, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
>> @@ -520,7 +533,7 @@
>>      uint32_t fullDepth   = cu->getDepth(0) + trDepth;
>>      uint32_t trSizeLog2  =
>> g_convertToBit[cu->getSlice()->getSPS()->getMaxCUWidth() >> fullDepth] + 2;
>>
>> -    if (trSizeLog2 == 2)
>> +    if ((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
>>      {
>>          assert(trDepth > 0);
>>          trDepth--;
>> @@ -534,7 +547,7 @@
>>
>>      TextType ttype          = (chromaId > 0 ? TEXT_CHROMA_V :
>> TEXT_CHROMA_U);
>>      uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdx);
>> -    uint32_t width          = cu->getWidth(0) >> (trDepth +
>> m_hChromaShift);
>> +    uint32_t width          = cu->getWidth(0)  >> (trDepth +
>> m_hChromaShift);
>>      uint32_t height         = cu->getHeight(0) >> (trDepth +
>> m_vChromaShift);
>>      uint32_t stride         = fencYuv->getCStride();
>>      Pel*     fenc           = (chromaId > 0 ?
>> fencYuv->getCrAddr(absPartIdx) : fencYuv->getCbAddr(absPartIdx));
>> @@ -543,10 +556,10 @@
>>      Pel*     recon          = (chromaId > 0 ?
>> predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
>>
>>      uint32_t qtlayer        =
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>> -    uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth()
>> * cu->getSlice()->getSPS()->getMaxCUHeight() >>
>> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> 2;
>> +    uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth()
>> * cu->getSlice()->getSPS()->getMaxCUHeight() >>
>> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift +
>> m_vChromaShift);
>>      TCoeff*  coeff          = (chromaId > 0 ? m_qtTempCoeffCr[qtlayer] :
>> m_qtTempCoeffCb[qtlayer]) + numCoeffPerInc * absPartIdx;
>>      int16_t* reconQt        = (chromaId > 0 ?
>> m_qtTempTComYuv[qtlayer].getCrAddr(absPartIdx) :
>> m_qtTempTComYuv[qtlayer].getCbAddr(absPartIdx));
>> -    assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE / 2);
>> +    uint32_t reconQtStride  = m_qtTempTComYuv[qtlayer].m_cwidth;
>>
>>      uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>>      Pel*     reconIPred       = (chromaId > 0 ?
>> cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder) :
>> cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder));
>> @@ -557,7 +570,7 @@
>>      //===== update chroma mode =====
>>      if (chromaPredMode == DM_CHROMA_IDX)
>>      {
>> -        chromaPredMode = cu->getLumaIntraDir(0);
>> +        chromaPredMode = cu->getLumaIntraDir(absPartIdx);
>>      }
>>
>>      //===== init availability pattern =====
>> @@ -565,11 +578,11 @@
>>      {
>>          cu->getPattern()->initPattern(cu, trDepth, absPartIdx);
>>
>> -        cu->getPattern()->initAdiPatternChroma(cu, absPartIdx, trDepth,
>> m_predBuf, m_predBufStride, m_predBufHeight);
>> +        cu->getPattern()->initAdiPatternChroma(cu, absPartIdx, trDepth,
>> m_predBuf, m_predBufStride, m_predBufHeight, chromaId);
>>          Pel* chromaPred = (chromaId > 0 ?
>> cu->getPattern()->getAdiCrBuf(width, height, m_predBuf) :
>> cu->getPattern()->getAdiCbBuf(width, height, m_predBuf));
>>
>>          //===== get prediction signal =====
>> -        predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
>> width);
>> +        predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
>> width, height, cu->getChromaFormat());
>>
>>          // save prediction
>>          if (default0Save1Load2 == 1)
>> @@ -612,7 +625,7 @@
>>          {
>>              curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>>          }
>> -        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>          m_trQuant->selectLambda(TEXT_CHROMA);
>>
>> @@ -639,7 +652,7 @@
>>      //===== reconstruction =====
>>      assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
>>      assert(width <= 32);
>> -    primitives.calcrecon[size](pred, residual, recon, reconQt,
>> reconIPred, stride, MAX_CU_SIZE / 2, reconIPredStride);
>> +    primitives.calcrecon[size](pred, residual, recon, reconQt,
>> reconIPred, stride, reconQtStride, reconIPredStride);
>>
>>      //===== update distortion =====
>>      uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride);
>> @@ -702,11 +715,11 @@
>>      uint32_t singleCbfY  = 0;
>>      uint32_t singleCbfU  = 0;
>>      uint32_t singleCbfV  = 0;
>> -    bool   checkTransformSkip  =
>> cu->getSlice()->getPPS()->getUseTransformSkip();
>> +    bool     checkTransformSkip  =
>> cu->getSlice()->getPPS()->getUseTransformSkip();
>>      uint32_t widthTransformSkip  = cu->getWidth(0) >> trDepth;
>>      uint32_t heightTransformSkip = cu->getHeight(0) >> trDepth;
>> -    int    bestModeId    = 0;
>> -    int    bestModeIdUV[2] = { 0, 0 };
>> +    int      bestModeId          = 0;
>> +    int      bestModeIdUV[2]     = { 0, 0 };
>>
>>      checkTransformSkip &= (widthTransformSkip == 4 &&
>> heightTransformSkip == 4);
>>      checkTransformSkip &= (!cu->getCUTransquantBypass(0));
>> @@ -729,8 +742,8 @@
>>              uint32_t singleCbfUTmp      = 0;
>>              uint32_t singleCbfVTmp      = 0;
>>              uint64_t singleCostTmp      = 0;
>> -            int    default0Save1Load2 = 0;
>> -            int    firstCheckId       = 0;
>> +            int      default0Save1Load2 = 0;
>> +            int      firstCheckId       = 0;
>>
>>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >>
>> ((cu->getDepth(0) + (trDepth - 1)) << 1);
>>              bool   bFirstQ = ((absPartIdx % qpdiv) == 0);
>> @@ -964,17 +977,17 @@
>>
>>          if (!bLumaOnly)
>>          {
>> -            width >>= 1;
>> -            height >>= 1;
>> +            width  >>= m_hChromaShift;
>> +            height >>= m_vChromaShift;
>>              src       = m_qtTempTComYuv[qtLayer].getCbAddr(absPartIdx);
>> -            assert(m_qtTempTComYuv[qtLayer].m_cwidth == MAX_CU_SIZE / 2);
>> +            uint32_t srcstride = m_qtTempTComYuv[qtLayer].m_cwidth;
>>              dst       =
>> cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
>>              dststride = cu->getPic()->getPicYuvRec()->getCStride();
>> -            primitives.blockcpy_ps(width, height, dst, dststride, src,
>> MAX_CU_SIZE / 2);
>> +            primitives.blockcpy_ps(width, height, dst, dststride, src,
>> srcstride);
>>
>>              src = m_qtTempTComYuv[qtLayer].getCrAddr(absPartIdx);
>>              dst = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(),
>> zorder);
>> -            primitives.blockcpy_ps(width, height, dst, dststride, src,
>> MAX_CU_SIZE / 2);
>> +            primitives.blockcpy_ps(width, height, dst, dststride, src,
>> srcstride);
>>          }
>>      }
>>
>> @@ -1049,7 +1062,7 @@
>>          int lastPos = -1;
>>          cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
>>
>> -        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>          m_trQuant->selectLambda(TEXT_LUMA);
>>          absSum = m_trQuant->transformNxN(cu, residual, stride, coeff,
>> width, height, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
>>
>> @@ -1081,7 +1094,6 @@
>>      if (bCheckSplit && !bCheckFull)
>>      {
>>          //----- code splitted block -----
>> -
>>          uint32_t qPartsDiv     = cu->getPic()->getNumPartInCU() >>
>> ((fullDepth + 1) << 1);
>>          uint32_t absPartIdxSub = absPartIdx;
>>          uint32_t splitCbfY = 0;
>> @@ -1267,12 +1279,12 @@
>>          reconIPred =
>> cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zOrder);
>>          reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
>>          reconQt = m_qtTempTComYuv[qtlayer].getCbAddr(absPartIdx);
>> -        assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE / 2);
>> -        primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, MAX_CU_SIZE / 2);
>> +        uint32_t reconQtStride = m_qtTempTComYuv[qtlayer].m_cwidth;
>> +        primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, reconQtStride);
>>
>>          reconIPred =
>> cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zOrder);
>>          reconQt    = m_qtTempTComYuv[qtlayer].getCrAddr(absPartIdx);
>> -        primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, MAX_CU_SIZE / 2);
>> +        primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, reconQtStride);
>>      }
>>  }
>>
>> @@ -1376,20 +1388,20 @@
>>          uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>>          uint32_t width            = cu->getWidth(0) >> (trDepth + 1);
>>          uint32_t height           = cu->getHeight(0) >> (trDepth + 1);
>> -        assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE / 2);
>> +        uint32_t reconQtStride    = m_qtTempTComYuv[qtlayer].m_cwidth;
>>          uint32_t reconIPredStride =
>> cu->getPic()->getPicYuvRec()->getCStride();
>>
>>          if (stateU0V1Both2 == 0 || stateU0V1Both2 == 2)
>>          {
>>              Pel* reconIPred =
>> cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
>>              int16_t* reconQt  =
>> m_qtTempTComYuv[qtlayer].getCbAddr(absPartIdx);
>> -            primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, MAX_CU_SIZE / 2);
>> +            primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, reconQtStride);
>>          }
>>          if (stateU0V1Both2 == 1 || stateU0V1Both2 == 2)
>>          {
>>              Pel* reconIPred =
>> cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>>              int16_t* reconQt  =
>> m_qtTempTComYuv[qtlayer].getCrAddr(absPartIdx);
>> -            primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, MAX_CU_SIZE / 2);
>> +            primitives.blockcpy_ps(width, height, reconIPred,
>> reconIPredStride, reconQt, reconQtStride);
>>          }
>>      }
>>  }
>> @@ -1411,7 +1423,7 @@
>>          uint32_t trSizeLog2 =
>> g_convertToBit[cu->getSlice()->getSPS()->getMaxCUWidth() >> fullDepth] + 2;
>>
>>          uint32_t actualTrDepth = trDepth;
>> -        if (trSizeLog2 == 2)
>> +        if ((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
>>          {
>>              assert(trDepth > 0);
>>              actualTrDepth--;
>> @@ -1557,7 +1569,7 @@
>>          uint32_t qtlayer    =
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>>
>>          bool bChromaSame  = false;
>> -        if (trSizeLog2 == 2)
>> +        if ((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
>>          {
>>              assert(trDepth > 0);
>>              uint32_t qpdiv = cu->getPic()->getNumPartInCU() >>
>> ((cu->getDepth(0) + trDepth - 1) << 1);
>> @@ -1572,9 +1584,11 @@
>>          uint32_t numCoeffC = (cu->getSlice()->getSPS()->getMaxCUWidth()
>> * cu->getSlice()->getSPS()->getMaxCUHeight()) >> (fullDepth << 1);
>>          if (!bChromaSame)
>>          {
>> -            numCoeffC >>= 2;
>> +            numCoeffC = ((cu->getSlice()->getSPS()->getMaxCUWidth() >>
>> m_hChromaShift) * (cu->getSlice()->getSPS()->getMaxCUHeight() >>
>> m_vChromaShift)) >> (fullDepth << 1);
>>          }
>> -        uint32_t numCoeffIncC =
>> (cu->getSlice()->getSPS()->getMaxCUWidth() *
>> cu->getSlice()->getSPS()->getMaxCUHeight()) >>
>> ((cu->getSlice()->getSPS()->getMaxCUDepth() << 1) + 2);
>> +
>> +        uint32_t numCoeffIncC =
>> ((cu->getSlice()->getSPS()->getMaxCUWidth() >> m_hChromaShift) *
>> (cu->getSlice()->getSPS()->getMaxCUHeight() >> m_vChromaShift)) >>
>> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
>> +
>>          TCoeff* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC *
>> absPartIdx);
>>          TCoeff* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC *
>> absPartIdx);
>>          TCoeff* coeffDstU = cu->getCoeffCb()         + (numCoeffIncC *
>> absPartIdx);
>> @@ -1583,7 +1597,7 @@
>>          ::memcpy(coeffDstV, coeffSrcV, sizeof(TCoeff) * numCoeffC);
>>
>>          //===== copy reconstruction =====
>> -        uint32_t trSizeCLog2 = (bChromaSame ? trSizeLog2 : trSizeLog2 -
>> 1);
>> +        uint32_t trSizeCLog2 = (bChromaSame || (cu->getChromaFormat() ==
>> CHROMA_444))  ? trSizeLog2 : trSizeLog2 - 1;
>>          m_qtTempTComYuv[qtlayer].copyPartToPartChroma(reconYuv,
>> absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2);
>>      }
>>      else
>> @@ -1650,11 +1664,11 @@
>>              }
>>              //===== init availability pattern =====
>>              cu->getPattern()->initPattern(cu, trDepth, absPartIdx);
>> -            cu->getPattern()->initAdiPatternChroma(cu, absPartIdx,
>> trDepth, m_predBuf, m_predBufStride, m_predBufHeight);
>> +            cu->getPattern()->initAdiPatternChroma(cu, absPartIdx,
>> trDepth, m_predBuf, m_predBufStride, m_predBufHeight, chromaId);
>>              Pel* chromaPred = (chromaId > 0 ?
>> cu->getPattern()->getAdiCrBuf(width, height, m_predBuf) :
>> cu->getPattern()->getAdiCbBuf(width, height, m_predBuf));
>>
>>              //===== get prediction signal =====
>> -            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
>> width);
>> +            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
>> width, height, cu->getChromaFormat());
>>
>>              //===== get residual signal =====
>>              assert(!((uint32_t)(size_t)fenc & (width - 1)));
>> @@ -1676,7 +1690,7 @@
>>              {
>>                  curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>>              }
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>              m_trQuant->selectLambda(TEXT_CHROMA);
>>
>> @@ -1741,7 +1755,8 @@
>>      //===== init pattern =====
>>      assert(width == height);
>>      cu->getPattern()->initPattern(cu, 0, 0);
>> -    cu->getPattern()->initAdiPatternChroma(cu, 0, 0, m_predBuf,
>> m_predBufStride, m_predBufHeight);
>> +    cu->getPattern()->initAdiPatternChroma(cu, 0, 0, m_predBuf,
>> m_predBufStride, m_predBufHeight, 0/*chromaId*/);
>> +    cu->getPattern()->initAdiPatternChroma(cu, 0, 0, m_predBuf,
>> m_predBufStride, m_predBufHeight, 1/*chromaId*/);
>>      Pel* patChromaU = cu->getPattern()->getAdiCbBuf(width, height,
>> m_predBuf);
>>      Pel* patChromaV = cu->getPattern()->getAdiCrBuf(width, height,
>> m_predBuf);
>>
>> @@ -1754,8 +1769,8 @@
>>      for (uint32_t mode = minMode; mode < maxMode; mode++)
>>      {
>>          //--- get prediction ---
>> -        predIntraChromaAng(patChromaU, mode, predU, stride, width);
>> -        predIntraChromaAng(patChromaV, mode, predV, stride, width);
>> +        predIntraChromaAng(patChromaU, mode, predU, stride, width,
>> height, cu->getChromaFormat());
>> +        predIntraChromaAng(patChromaV, mode, predV, stride, width,
>> height, cu->getChromaFormat());
>>
>>          //--- get SAD ---
>>          uint32_t sad = sa8d(fencU, stride, predU, stride) + sa8d(fencV,
>> stride, predV, stride);
>> @@ -2131,13 +2146,14 @@
>>
>>      if (width > 32)
>>      {
>> -        scaleWidth = 32;
>> -        scaleStride = 32;
>> +        scaleWidth     = 32;
>> +        scaleStride    = 32;
>>          costMultiplier = 4;
>>      }
>>
>>      cu->getPattern()->initPattern(cu, trDepth, absPartIdx);
>> -    cu->getPattern()->initAdiPatternChroma(cu, absPartIdx, trDepth,
>> m_predBuf, m_predBufStride, m_predBufHeight);
>> +    cu->getPattern()->initAdiPatternChroma(cu, absPartIdx, trDepth,
>> m_predBuf, m_predBufStride, m_predBufHeight, 0);
>> +    cu->getPattern()->initAdiPatternChroma(cu, absPartIdx, trDepth,
>> m_predBuf, m_predBufStride, m_predBufHeight, 1);
>>
>>      cu->getAllowedChromaDir(0, modeList);
>>      //----- check chroma modes -----
>> @@ -2156,7 +2172,7 @@
>>              Pel* chromaPred = (chromaId > 0 ?
>> cu->getPattern()->getAdiCrBuf(width, height, m_predBuf) :
>> cu->getPattern()->getAdiCbBuf(width, height, m_predBuf));
>>
>>              //===== get prediction signal =====
>> -            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
>> width);
>> +            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride,
>> width, height, cu->getChromaFormat());
>>              int log2SizeMinus2 = g_convertToBit[scaleWidth];
>>              pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
>>              sad = costMultiplier * sa8d(fenc, scaleStride, pred,
>> scaleStride);
>> @@ -2174,6 +2190,27 @@
>>      cu->setChromIntraDirSubParts(bestMode, 0, depth);
>>  }
>>
>> +bool TEncSearch::isNextSection()
>> +{
>> +    if (m_splitMode == DONT_SPLIT)
>> +    {
>> +        m_section++;
>> +        return false;
>> +    }
>> +    else
>> +    {
>> +        m_absPartIdxTURelCU += m_absPartIdxStep;
>> +
>> +        m_section++;
>> +        return m_section< (1 << m_splitMode);
>> +    }
>> +}
>> +
>> +bool TEncSearch::isLastSection()
>> +{
>> +    return (m_section+1) >= (1<<m_splitMode);
>> +}
>> +
>>  void TEncSearch::estIntraPredChromaQT(TComDataCU* cu,
>>                                        TComYuv*    fencYuv,
>>                                        TComYuv*    predYuv,
>> @@ -2181,60 +2218,109 @@
>>                                        TComYuv*    reconYuv,
>>                                        uint32_t    preCalcDistC)
>>  {
>> -    uint32_t depth     = cu->getDepth(0);
>> -    uint32_t bestMode  = 0;
>> -    uint32_t bestDist  = 0;
>> -    uint64_t bestCost  = MAX_INT64;
>> -
>> -    //----- init mode list -----
>> -    uint32_t minMode = 0;
>> -    uint32_t maxMode = NUM_CHROMA_MODE;
>> -    uint32_t modeList[NUM_CHROMA_MODE];
>> -
>> -    cu->getAllowedChromaDir(0, modeList);
>> -
>> -    //----- check chroma modes -----
>> -    for (uint32_t mode = minMode; mode < maxMode; mode++)
>> +    uint32_t depth              = cu->getDepth(0);
>> +    uint32_t initTrDepth        = (cu->getPartitionSize(0) !=
>> SIZE_2Nx2N) && (cu->getChromaFormat() == CHROMA_444 ? 1 : 0);
>> +    m_splitMode                 = (initTrDepth == 0) ? DONT_SPLIT :
>> QUAD_SPLIT;
>> +    m_absPartIdxStep            = (cu->getPic()->getNumPartInCU() >>
>> (depth << 1)) >> partIdxStepShift[m_splitMode];
>> +    m_partOffset                = 0;
>> +    m_section                   = 0;
>> +    m_absPartIdxTURelCU         = 0;
>> +
>> +    do
>>      {
>> -        //----- restore context models -----
>> -        m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
>> -
>> -        //----- chroma coding -----
>> -        uint32_t dist = 0;
>> -        cu->setChromIntraDirSubParts(modeList[mode], 0, depth);
>> -        xRecurIntraChromaCodingQT(cu, 0, 0, fencYuv, predYuv, resiYuv,
>> dist);
>> -        if (cu->getSlice()->getPPS()->getUseTransformSkip())
>> +        uint32_t bestMode           = 0;
>> +        uint32_t bestDist           = 0;
>> +        uint64_t bestCost           = MAX_INT64;
>> +
>> +        //----- init mode list -----
>> +        uint32_t minMode = 0;
>> +        uint32_t maxMode = NUM_CHROMA_MODE;
>> +        uint32_t modeList[NUM_CHROMA_MODE];
>> +
>> +        m_partOffset = m_absPartIdxTURelCU;
>> +
>> +        cu->getAllowedChromaDir(m_partOffset, modeList);
>> +
>> +        //----- check chroma modes -----
>> +        for (uint32_t mode = minMode; mode < maxMode; mode++)
>>          {
>> +            //----- restore context models -----
>>              m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
>> +
>> +            //----- chroma coding -----
>> +            uint32_t dist = 0;
>> +
>> +            cu->setChromIntraDirSubParts(modeList[mode], m_partOffset,
>> depth + initTrDepth);
>> +
>> +            xRecurIntraChromaCodingQT(cu, initTrDepth,
>> m_absPartIdxTURelCU, fencYuv, predYuv, resiYuv, dist);
>> +
>> +            if (cu->getSlice()->getPPS()->getUseTransformSkip())
>> +            {
>> +
>>  m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
>> +            }
>> +
>> +            uint32_t bits = xGetIntraBitsQT(cu, initTrDepth,
>> m_absPartIdxTURelCU, false, true);
>> +            uint64_t cost = m_rdCost->calcRdCost(dist, bits);
>> +
>> +            //----- compare -----
>> +            if (cost < bestCost)
>> +            {
>> +                bestCost = cost;
>> +                bestDist = dist;
>> +                bestMode = modeList[mode];
>> +                xSetIntraResultChromaQT(cu, initTrDepth,
>> m_absPartIdxTURelCU, reconYuv);
>> +                ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) +
>> m_partOffset, m_absPartIdxStep * sizeof(UChar));
>> +                ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) +
>> m_partOffset, m_absPartIdxStep * sizeof(UChar));
>> +                ::memcpy(m_qtTempTransformSkipFlag[1],
>> cu->getTransformSkip(TEXT_CHROMA_U) + m_partOffset, m_absPartIdxStep *
>> sizeof(UChar));
>> +                ::memcpy(m_qtTempTransformSkipFlag[2],
>> cu->getTransformSkip(TEXT_CHROMA_V) + m_partOffset, m_absPartIdxStep *
>> sizeof(UChar));
>> +            }
>>          }
>>
>> -        uint32_t bits = xGetIntraBitsQT(cu, 0, 0, false, true);
>> -        uint64_t cost = m_rdCost->calcRdCost(dist, bits);
>> -
>> -        //----- compare -----
>> -        if (cost < bestCost)
>> +        if (!isLastSection())
>>          {
>> -            bestCost = cost;
>> -            bestDist = dist;
>> -            bestMode = modeList[mode];
>> -            uint32_t qpn = cu->getPic()->getNumPartInCU() >> (depth <<
>> 1);
>> -            xSetIntraResultChromaQT(cu, 0, 0, reconYuv);
>> -            ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U), qpn *
>> sizeof(UChar));
>> -            ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V), qpn *
>> sizeof(UChar));
>> -            ::memcpy(m_qtTempTransformSkipFlag[1],
>> cu->getTransformSkip(TEXT_CHROMA_U), qpn * sizeof(UChar));
>> -            ::memcpy(m_qtTempTransformSkipFlag[2],
>> cu->getTransformSkip(TEXT_CHROMA_V), qpn * sizeof(UChar));
>> +            uint32_t compWidth   = (cu->getWidth(0) >> m_hChromaShift)
>>  >> initTrDepth;
>> +            uint32_t compHeight  = (cu->getHeight(0) >> m_vChromaShift)
>> >> initTrDepth;
>> +            uint32_t zorder      = cu->getZorderIdxInCU() + m_partOffset;
>> +            Pel*     dst         =
>> cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
>> +            uint32_t dststride   =
>> cu->getPic()->getPicYuvRec()->getCStride();
>> +            Pel*     src         = reconYuv->getCbAddr(m_partOffset);
>> +            uint32_t srcstride   = reconYuv->getCStride();
>> +
>> +            primitives.blockcpy_pp(compWidth, compHeight, dst,
>> dststride, src, srcstride);
>> +
>> +            dst                 =
>> cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
>> +            src                 = reconYuv->getCrAddr(m_partOffset);
>> +            primitives.blockcpy_pp(compWidth, compHeight, dst,
>> dststride, src, srcstride);
>> +        }
>> +
>> +        //----- set data -----
>> +        ::memcpy(cu->getCbf(TEXT_CHROMA_U) + m_partOffset,
>> m_qtTempCbf[1], m_absPartIdxStep * sizeof(UChar));
>> +        ::memcpy(cu->getCbf(TEXT_CHROMA_V) + m_partOffset,
>> m_qtTempCbf[2], m_absPartIdxStep * sizeof(UChar));
>> +        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) + m_partOffset,
>> m_qtTempTransformSkipFlag[1], m_absPartIdxStep * sizeof(UChar));
>> +        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) + m_partOffset,
>> m_qtTempTransformSkipFlag[2], m_absPartIdxStep * sizeof(UChar));
>> +        cu->setChromIntraDirSubParts(bestMode, m_partOffset, depth +
>> initTrDepth);
>> +        cu->m_totalDistortion += bestDist - preCalcDistC;
>> +
>> +    } while(isNextSection());
>> +
>> +    //----- restore context models -----
>> +    if (initTrDepth != 0)
>> +    {   // set Cbf for all blocks
>> +        uint32_t uiCombCbfU = 0;
>> +        uint32_t uiCombCbfV = 0;
>> +        uint32_t uiPartIdx  = 0;
>> +        for (uint32_t uiPart = 0; uiPart < 4; uiPart++, uiPartIdx +=
>> m_absPartIdxStep)
>> +        {
>> +            uiCombCbfU |= cu->getCbf(uiPartIdx, TEXT_CHROMA_U, 1);
>> +            uiCombCbfV |= cu->getCbf(uiPartIdx, TEXT_CHROMA_V, 1);
>> +        }
>> +        for (uint32_t uiOffs = 0; uiOffs < 4 * m_absPartIdxStep;
>> uiOffs++)
>> +        {
>> +            cu->getCbf( TEXT_CHROMA_U )[ uiOffs ] |= uiCombCbfU;
>> +            cu->getCbf( TEXT_CHROMA_V )[ uiOffs ] |= uiCombCbfV;
>>          }
>>      }
>>
>> -    //----- set data -----
>> -    uint32_t qpn = cu->getPic()->getNumPartInCU() >> (depth << 1);
>> -    ::memcpy(cu->getCbf(TEXT_CHROMA_U), m_qtTempCbf[1], qpn *
>> sizeof(UChar));
>> -    ::memcpy(cu->getCbf(TEXT_CHROMA_V), m_qtTempCbf[2], qpn *
>> sizeof(UChar));
>> -    ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U),
>> m_qtTempTransformSkipFlag[1], qpn * sizeof(UChar));
>> -    ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V),
>> m_qtTempTransformSkipFlag[2], qpn * sizeof(UChar));
>> -    cu->setChromIntraDirSubParts(bestMode, 0, depth);
>> -    cu->m_totalDistortion += bestDist - preCalcDistC;
>> -
>>      //----- restore context models -----
>>      m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
>>  }
>> @@ -3085,10 +3171,11 @@
>>          outResiYuv->clear();
>>
>>          predYuv->copyToPartYuv(outReconYuv, 0);
>> -
>> +        //Luma
>>          int part = partitionFromSizes(width, height);
>>          distortion = primitives.sse_pp[part](fencYuv->getLumaAddr(),
>> fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
>> -        part = partitionFromSizes(width >> 1, height >> 1);
>> +        //Chroma
>> +        part = partitionFromSizes(width >> m_hChromaShift, height >>
>> m_vChromaShift);
>>          distortion +=
>> m_rdCost->scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(),
>> fencYuv->getCStride(), outReconYuv->getCbAddr(),
>> outReconYuv->getCStride()));
>>          distortion +=
>> m_rdCost->scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(),
>> fencYuv->getCStride(), outReconYuv->getCrAddr(),
>> outReconYuv->getCStride()));
>>
>> @@ -3208,7 +3295,7 @@
>>      // update with clipped distortion and cost (qp estimation loop uses
>> unclipped values)
>>      int part = partitionFromSizes(width, height);
>>      bdist = primitives.sse_pp[part](fencYuv->getLumaAddr(),
>> fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
>> -    part = partitionFromSizes(width >> 1, height >> 1);
>> +    part = partitionFromSizes(width >> cu->getHorzChromaShift(), height
>> >> cu->getVertChromaShift());
>>      bdist +=
>> m_rdCost->scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(),
>> fencYuv->getCStride(), outReconYuv->getCbAddr(),
>> outReconYuv->getCStride()));
>>      bdist +=
>> m_rdCost->scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(),
>> fencYuv->getCStride(), outReconYuv->getCrAddr(),
>> outReconYuv->getCStride()));
>>      bcost = m_rdCost->calcRdCost(bdist, bestBits);
>> @@ -3311,7 +3398,7 @@
>>              cu->setTransformSkipSubParts(0, TEXT_CHROMA_V, absPartIdx,
>> cu->getDepth(0) + trModeC);
>>          }
>>
>> -        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>          m_trQuant->selectLambda(TEXT_LUMA);
>>
>>          absSumY = m_trQuant->transformNxN(cu,
>> resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
>> @@ -3322,7 +3409,7 @@
>>          if (bCodeChroma)
>>          {
>>              int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCb();
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>              m_trQuant->selectLambda(TEXT_CHROMA);
>>
>> @@ -3330,7 +3417,7 @@
>>                                                trWidthC, trHeightC,
>> TEXT_CHROMA_U, absPartIdx, &lastPosU, false, curuseRDOQ);
>>
>>              curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>              absSumV = m_trQuant->transformNxN(cu,
>> resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
>>                                                trWidthC, trHeightC,
>> TEXT_CHROMA_V, absPartIdx, &lastPosV, false, curuseRDOQ);
>>
>> @@ -3342,7 +3429,7 @@
>>          {
>>              int16_t *curResiY = resiYuv->getLumaAddr(absTUPartIdx);
>>
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>
>>              int scalingListType = 3 + g_eTTable[(int)TEXT_LUMA];
>>              assert(scalingListType < 6);
>> @@ -3362,7 +3449,7 @@
>>                  int16_t *pcResiCurrU = resiYuv->getCbAddr(absTUPartIdxC);
>>
>>                  int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCb();
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_CHROMA_U];
>>                  assert(scalingListType < 6);
>> @@ -3378,7 +3465,7 @@
>>              {
>>                  int16_t *curResiV = resiYuv->getCrAddr(absTUPartIdxC);
>>                  int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_CHROMA_V];
>>                  assert(scalingListType < 6);
>> @@ -3453,6 +3540,7 @@
>>      assert(cu->getDepth(0) == cu->getDepth(absPartIdx));
>>      const uint32_t trMode = depth - cu->getDepth(0);
>>      const uint32_t trSizeLog2 =
>> g_convertToBit[cu->getSlice()->getSPS()->getMaxCUWidth() >> depth] + 2;
>> +    uint32_t  trSizeCLog2 =
>> g_convertToBit[(cu->getSlice()->getSPS()->getMaxCUWidth() >>
>> m_hChromaShift) >> depth] + 2;;
>>
>>      bool bSplitFlag =
>> ((cu->getSlice()->getSPS()->getQuadtreeTUMaxDepthInter() == 1) &&
>> cu->getPredictionMode(absPartIdx) == MODE_INTER &&
>> (cu->getPartitionSize(absPartIdx) != SIZE_2Nx2N));
>>      bool bCheckFull;
>> @@ -3465,12 +3553,11 @@
>>
>>      bool  bCodeChroma = true;
>>      uint32_t  trModeC     = trMode;
>> -    uint32_t  trSizeCLog2 = trSizeLog2 - 1;
>> -    if (trSizeLog2 == 2)
>> +    if ((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
>>      {
>>          trSizeCLog2++;
>>          trModeC--;
>> -        uint32_t qpdiv = cu->getPic()->getNumPartInCU() >>
>> ((cu->getDepth(0) + trModeC) << 1);
>> +        uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1)
>> << 1);
>>          bCodeChroma = ((absPartIdx % qpdiv) == 0);
>>      }
>>
>> @@ -3490,8 +3577,8 @@
>>          const uint32_t numCoeffPerAbsPartIdxIncrement =
>> cu->getSlice()->getSPS()->getMaxCUWidth() *
>> cu->getSlice()->getSPS()->getMaxCUHeight() >>
>> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
>>          const uint32_t qtlayer =
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>>          TCoeff *coeffCurY = m_qtTempCoeffY[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx);
>> -        TCoeff *coeffCurU = m_qtTempCoeffCb[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> -        TCoeff *coeffCurV = m_qtTempCoeffCr[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> +        TCoeff *coeffCurU = m_qtTempCoeffCb[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>> +        TCoeff *coeffCurV = m_qtTempCoeffCr[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>>
>>          int trWidth = 0, trHeight = 0, trWidthC = 0, trHeightC = 0;
>>          uint32_t absTUPartIdxC = absPartIdx;
>> @@ -3520,7 +3607,7 @@
>>              m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac,
>> trWidth, trHeight, TEXT_LUMA);
>>          }
>>
>> -        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>          m_trQuant->selectLambda(TEXT_LUMA);
>>
>>          absSumY = m_trQuant->transformNxN(cu,
>> resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
>> @@ -3534,17 +3621,17 @@
>>              {
>>                  m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac,
>> trWidthC, trHeightC, TEXT_CHROMA);
>>              }
>> -
>> +            //Cb transform
>>              int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCb();
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>              m_trQuant->selectLambda(TEXT_CHROMA);
>>
>>              absSumU = m_trQuant->transformNxN(cu,
>> resiYuv->getCbAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurU,
>>                                                trWidthC, trHeightC,
>> TEXT_CHROMA_U, absPartIdx, &lastPosU, false, curuseRDOQ);
>> -
>> +            //Cr transform
>>              curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>              absSumV = m_trQuant->transformNxN(cu,
>> resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
>>                                                trWidthC, trHeightC,
>> TEXT_CHROMA_V, absPartIdx, &lastPosV, false, curuseRDOQ);
>>
>> @@ -3586,7 +3673,7 @@
>>          {
>>              int16_t *curResiY =
>> m_qtTempTComYuv[qtlayer].getLumaAddr(absTUPartIdx);
>>
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>
>>              int scalingListType = 3 + g_eTTable[(int)TEXT_LUMA];
>>              assert(scalingListType < 6);
>> @@ -3658,16 +3745,15 @@
>>                  int16_t *pcResiCurrU =
>> m_qtTempTComYuv[qtlayer].getCbAddr(absTUPartIdxC);
>>
>>                  int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCb();
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_CHROMA_U];
>>                  assert(scalingListType < 6);
>> -                assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE
>> / 2);
>> -
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> pcResiCurrU, MAX_CU_SIZE / 2, coeffCurU, trWidthC, trHeightC,
>> scalingListType, false, lastPosU);
>> +
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> pcResiCurrU, m_qtTempTComYuv[qtlayer].m_cwidth, coeffCurU, trWidthC,
>> trHeightC, scalingListType, false, lastPosU);
>>
>>                  uint32_t dist =
>> primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absTUPartIdxC),
>> resiYuv->m_cwidth,
>>
>> m_qtTempTComYuv[qtlayer].getCbAddr(absTUPartIdxC),
>> -                                                             MAX_CU_SIZE
>> / 2);
>> +
>> m_qtTempTComYuv[qtlayer].m_cwidth);
>>                  const uint32_t nonZeroDistU =
>> m_rdCost->scaleChromaDistCb(dist);
>>
>>                  if (cu->isLosslessCoded(0))
>> @@ -3710,10 +3796,10 @@
>>              if (!absSumU)
>>              {
>>                  int16_t *ptr =
>> m_qtTempTComYuv[qtlayer].getCbAddr(absTUPartIdxC);
>> -                assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE
>> / 2);
>> +                const uint32_t stride =
>> m_qtTempTComYuv[qtlayer].m_cwidth;
>>
>>                  assert(trWidthC == trHeightC);
>> -
>>  primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, MAX_CU_SIZE /
>> 2, 0);
>> +
>>  primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, stride, 0);
>>              }
>>
>>              distV =
>> m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absTUPartIdxC),
>> resiYuv->m_cwidth, m_tempPel, trWidthC));
>> @@ -3725,16 +3811,15 @@
>>              {
>>                  int16_t *curResiV =
>> m_qtTempTComYuv[qtlayer].getCrAddr(absTUPartIdxC);
>>                  int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_CHROMA_V];
>>                  assert(scalingListType < 6);
>> -                assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE
>> / 2);
>> -
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> curResiV, MAX_CU_SIZE / 2, coeffCurV, trWidthC, trHeightC, scalingListType,
>> false, lastPosV);
>> +
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> curResiV, m_qtTempTComYuv[qtlayer].m_cwidth, coeffCurV, trWidthC,
>> trHeightC, scalingListType, false, lastPosV);
>>
>>                  uint32_t dist =
>> primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absTUPartIdxC),
>> resiYuv->m_cwidth,
>>
>> m_qtTempTComYuv[qtlayer].getCrAddr(absTUPartIdxC),
>> -                                                             MAX_CU_SIZE
>> / 2);
>> +
>> m_qtTempTComYuv[qtlayer].m_cwidth);
>>                  const uint32_t nonZeroDistV =
>> m_rdCost->scaleChromaDistCr(dist);
>>
>>                  if (cu->isLosslessCoded(0))
>> @@ -3777,10 +3862,10 @@
>>              if (!absSumV)
>>              {
>>                  int16_t *ptr =
>>  m_qtTempTComYuv[qtlayer].getCrAddr(absTUPartIdxC);
>> -                assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE
>> / 2);
>> +                const uint32_t stride =
>> m_qtTempTComYuv[qtlayer].m_cwidth;
>>
>>                  assert(trWidthC == trHeightC);
>> -
>>  primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, MAX_CU_SIZE /
>> 2, 0);
>> +
>>  primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, stride, 0);
>>              }
>>          }
>>          cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx,
>> depth);
>> @@ -3817,7 +3902,7 @@
>>                  m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac,
>> trWidth, trHeight, TEXT_LUMA);
>>              }
>>
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>
>>              m_trQuant->selectLambda(TEXT_LUMA);
>>              absSumTransformSkipY = m_trQuant->transformNxN(cu,
>> resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
>> @@ -3831,7 +3916,7 @@
>>                  m_entropyCoder->encodeCoeffNxN(cu, coeffCurY,
>> absPartIdx, trWidth, trHeight, depth, TEXT_LUMA);
>>                  const uint32_t skipSingleBitsY =
>> m_entropyCoder->getNumberOfWrittenBits();
>>
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetY(), 0, cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_LUMA];
>>                  assert(scalingListType < 6);
>> @@ -3874,7 +3959,7 @@
>>
>>              int16_t *curResiU =
>> m_qtTempTComYuv[qtlayer].getCbAddr(absTUPartIdxC);
>>              int16_t *curResiV =
>> m_qtTempTComYuv[qtlayer].getCrAddr(absTUPartIdxC);
>> -            assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE / 2);
>> +            uint32_t stride = m_qtTempTComYuv[qtlayer].m_cwidth;
>>
>>              TCoeff bestCoeffU[32 * 32], bestCoeffV[32 * 32];
>>              memcpy(bestCoeffU, coeffCurU, sizeof(TCoeff) *
>> numSamplesChroma);
>> @@ -3883,8 +3968,8 @@
>>              int16_t bestResiU[32 * 32], bestResiV[32 * 32];
>>              for (int i = 0; i < trHeightC; ++i)
>>              {
>> -                memcpy(&bestResiU[i * trWidthC], curResiU + i *
>> (MAX_CU_SIZE / 2), sizeof(int16_t) * trWidthC);
>> -                memcpy(&bestResiV[i * trWidthC], curResiV + i *
>> (MAX_CU_SIZE / 2), sizeof(int16_t) * trWidthC);
>> +                memcpy(&bestResiU[i * trWidthC], curResiU + i * stride,
>> sizeof(int16_t) * trWidthC);
>> +                memcpy(&bestResiV[i * trWidthC], curResiV + i * stride,
>> sizeof(int16_t) * trWidthC);
>>              }
>>
>>
>>  m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
>> @@ -3898,13 +3983,13 @@
>>              }
>>
>>              int curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCb();
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>              m_trQuant->selectLambda(TEXT_CHROMA);
>>
>>              absSumTransformSkipU = m_trQuant->transformNxN(cu,
>> resiYuv->getCbAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurU,
>>                                                             trWidthC,
>> trHeightC, TEXT_CHROMA_U, absPartIdx, &lastPosTransformSkipU, true,
>> curuseRDOQ);
>>              curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>> -            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +            m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>              absSumTransformSkipV = m_trQuant->transformNxN(cu,
>> resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
>>                                                             trWidthC,
>> trHeightC, TEXT_CHROMA_V, absPartIdx, &lastPosTransformSkipV, true,
>> curuseRDOQ);
>>
>> @@ -3922,17 +4007,15 @@
>>                  singleBitsU = m_entropyCoder->getNumberOfWrittenBits();
>>
>>                  curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCb();
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_CHROMA_U];
>>                  assert(scalingListType < 6);
>> -                assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE
>> / 2);
>> -
>> -
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> curResiU, MAX_CU_SIZE / 2, coeffCurU, trWidthC, trHeightC, scalingListType,
>> true, lastPosTransformSkipU);
>> +
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> curResiU, m_qtTempTComYuv[qtlayer].m_cwidth, coeffCurU, trWidthC,
>> trHeightC, scalingListType, true, lastPosTransformSkipU);
>>
>>                  uint32_t dist =
>> primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absTUPartIdxC),
>> resiYuv->m_cwidth,
>>
>> m_qtTempTComYuv[qtlayer].getCbAddr(absTUPartIdxC),
>> -                                                             MAX_CU_SIZE
>> / 2);
>> +
>> m_qtTempTComYuv[qtlayer].m_cwidth);
>>                  nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
>>                  singleCostU = m_rdCost->calcRdCost(nonZeroDistU,
>> singleBitsU);
>>              }
>> @@ -3944,7 +4027,7 @@
>>                  memcpy(coeffCurU, bestCoeffU, sizeof(TCoeff) *
>> numSamplesChroma);
>>                  for (int i = 0; i < trHeightC; ++i)
>>                  {
>> -                    memcpy(curResiU + i * (MAX_CU_SIZE / 2),
>> &bestResiU[i * trWidthC], sizeof(int16_t) * trWidthC);
>> +                    memcpy(curResiU + i * stride, &bestResiU[i *
>> trWidthC], sizeof(int16_t) * trWidthC);
>>                  }
>>              }
>>              else
>> @@ -3961,17 +4044,15 @@
>>                  singleBitsV = m_entropyCoder->getNumberOfWrittenBits() -
>> singleBitsU;
>>
>>                  curChromaQpOffset =
>> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
>> cu->getSlice()->getSliceQpDeltaCr();
>> -                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
>> +                m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
>> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset,
>> cu->getChromaFormat());
>>
>>                  int scalingListType = 3 + g_eTTable[(int)TEXT_CHROMA_V];
>>                  assert(scalingListType < 6);
>> -                assert(m_qtTempTComYuv[qtlayer].m_cwidth == MAX_CU_SIZE
>> / 2);
>> -
>> -
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> curResiV, MAX_CU_SIZE / 2, coeffCurV, trWidthC, trHeightC, scalingListType,
>> true, lastPosTransformSkipV);
>> +
>>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
>> curResiV, m_qtTempTComYuv[qtlayer].m_cwidth, coeffCurV, trWidthC,
>> trHeightC, scalingListType, true, lastPosTransformSkipV);
>>
>>                  uint32_t dist =
>> primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absTUPartIdxC),
>> resiYuv->m_cwidth,
>>
>> m_qtTempTComYuv[qtlayer].getCrAddr(absTUPartIdxC),
>> -                                                             MAX_CU_SIZE
>> / 2);
>> +
>> m_qtTempTComYuv[qtlayer].m_cwidth);
>>                  nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
>>                  singleCostV = m_rdCost->calcRdCost(nonZeroDistV,
>> singleBitsV);
>>              }
>> @@ -3983,7 +4064,7 @@
>>                  memcpy(coeffCurV, bestCoeffV, sizeof(TCoeff) *
>> numSamplesChroma);
>>                  for (int i = 0; i < trHeightC; ++i)
>>                  {
>> -                    memcpy(curResiV + i * (MAX_CU_SIZE / 2),
>> &bestResiV[i * trWidthC], sizeof(int16_t) * trWidthC);
>> +                    memcpy(curResiV + i * stride, &bestResiV[i *
>> trWidthC], sizeof(int16_t) * trWidthC);
>>                  }
>>              }
>>              else
>> @@ -4115,6 +4196,7 @@
>>      const uint32_t trMode = cu->getTransformIdx(absPartIdx);
>>      const bool bSubdiv = curTrMode != trMode;
>>      const uint32_t trSizeLog2 =
>> g_convertToBit[cu->getSlice()->getSPS()->getMaxCUWidth() >> depth] + 2;
>> +    uint32_t  trSizeCLog2 =
>> g_convertToBit[(cu->getSlice()->getSPS()->getMaxCUWidth() >>
>> m_hChromaShift) >> depth] + 2;
>>
>>      if (bSubdivAndCbf && trSizeLog2 <=
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() && trSizeLog2 >
>> cu->getQuadtreeTULog2MinSizeInCU(absPartIdx))
>>      {
>> @@ -4145,21 +4227,20 @@
>>
>>      if (!bSubdiv)
>>      {
>> +        //Luma
>>          const uint32_t numCoeffPerAbsPartIdxIncrement =
>> cu->getSlice()->getSPS()->getMaxCUWidth() *
>> cu->getSlice()->getSPS()->getMaxCUHeight() >>
>> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
>> -        //assert( 16 == uiNumCoeffPerAbsPartIdxIncrement ); // check
>>          const uint32_t qtlayer =
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>>          TCoeff *coeffCurY = m_qtTempCoeffY[qtlayer] +
>>  numCoeffPerAbsPartIdxIncrement * absPartIdx;
>> -        TCoeff *coeffCurU = m_qtTempCoeffCb[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> -        TCoeff *coeffCurV = m_qtTempCoeffCr[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> +
>> +        //Chroma
>> +        TCoeff *coeffCurU = m_qtTempCoeffCb[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>> +        TCoeff *coeffCurV = m_qtTempCoeffCr[qtlayer] +
>> (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>>
>>          bool  bCodeChroma = true;
>> -        uint32_t  trModeC     = trMode;
>> -        uint32_t  trSizeCLog2 = trSizeLog2 - 1;
>> -        if (trSizeLog2 == 2)
>> +        if ((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
>>          {
>>              trSizeCLog2++;
>> -            trModeC--;
>> -            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >>
>> ((cu->getDepth(0) + trModeC) << 1);
>> +            uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth -
>> 1) << 1);
>>              bCodeChroma = ((absPartIdx % qpdiv) == 0);
>>          }
>>
>> @@ -4171,21 +4252,18 @@
>>          {
>>              if (ttype == TEXT_LUMA && cu->getCbf(absPartIdx, TEXT_LUMA,
>> trMode))
>>              {
>> -                int trWidth  = 1 << trSizeLog2;
>> -                int trHeight = 1 << trSizeLog2;
>> -                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY,
>> absPartIdx, trWidth, trHeight, depth, TEXT_LUMA);
>> +                m_entropyCoder->encodeCoeffNxN(cu, coeffCurY,
>> absPartIdx, 1 << trSizeLog2, 1 << trSizeLog2, depth, TEXT_LUMA);
>>              }
>> +
>>              if (bCodeChroma)
>>              {
>> -                int trWidth  = 1 << trSizeCLog2;
>> -                int trHeight = 1 << trSizeCLog2;
>>                  if (ttype == TEXT_CHROMA_U && cu->getCbf(absPartIdx,
>> TEXT_CHROMA_U, trMode))
>>                  {
>> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU,
>> absPartIdx, trWidth, trHeight, depth, TEXT_CHROMA_U);
>> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurU,
>> absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2, depth, TEXT_CHROMA_U);
>>                  }
>>                  if (ttype == TEXT_CHROMA_V && cu->getCbf(absPartIdx,
>> TEXT_CHROMA_V, trMode))
>>                  {
>> -                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV,
>> absPartIdx, trWidth, trHeight, depth, TEXT_CHROMA_V);
>> +                    m_entropyCoder->encodeCoeffNxN(cu, coeffCurV,
>> absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2, depth, TEXT_CHROMA_V);
>>                  }
>>              }
>>          }
>> @@ -4211,13 +4289,13 @@
>>
>>      if (curTrMode == trMode)
>>      {
>> -        const uint32_t trSizeLog2 =
>> g_convertToBit[cu->getSlice()->getSPS()->getMaxCUWidth() >> depth] + 2;
>> +        const uint32_t trSizeLog2   =
>> g_convertToBit[cu->getSlice()->getSPS()->getMaxCUWidth() >> depth] + 2;
>> +        uint32_t  trSizeCLog2 =
>> g_convertToBit[(cu->getSlice()->getSPS()->getMaxCUWidth() >>
>> cu->getHorzChromaShift()) >> depth] + 2;;
>>          const uint32_t qtlayer =
>> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>>
>>          bool  bCodeChroma   = true;
>>          uint32_t  trModeC     = trMode;
>> -        uint32_t  trSizeCLog2 = trSizeLog2 - 1;
>> -        if (trSizeLog2 == 2)
>> +        if((trSizeLog2 == 2) && !(cu->getChromaFormat() == CHROMA_444))
>>          {
>>              trSizeCLog2++;
>>              trModeC--;
>> @@ -4246,10 +4324,10 @@
>>              if (bCodeChroma)
>>              {
>>                  uint32_t    uiNumCoeffC = (1 << (trSizeCLog2 << 1));
>> -                TCoeff* pcCoeffSrcU = m_qtTempCoeffCb[qtlayer] +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> -                TCoeff* pcCoeffSrcV = m_qtTempCoeffCr[qtlayer] +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> -                TCoeff* pcCoeffDstU = cu->getCoeffCb() +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> -                TCoeff* pcCoeffDstV = cu->getCoeffCr() +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> 2);
>> +                TCoeff* pcCoeffSrcU = m_qtTempCoeffCb[qtlayer] +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>> +                TCoeff* pcCoeffSrcV = m_qtTempCoeffCr[qtlayer] +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>> +                TCoeff* pcCoeffDstU = cu->getCoeffCb() +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>> +                TCoeff* pcCoeffDstV = cu->getCoeffCr() +
>> (uiNumCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift +
>> m_vChromaShift));
>>                  ::memcpy(pcCoeffDstU, pcCoeffSrcU, sizeof(TCoeff) *
>> uiNumCoeffC);
>>                  ::memcpy(pcCoeffDstV, pcCoeffSrcV, sizeof(TCoeff) *
>> uiNumCoeffC);
>>              }
>> diff -r 4811da38078c -r f7d21da102ac source/common/TShortYUV.h
>> --- a/source/common/TShortYUV.h Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/common/TShortYUV.h Tue Jan 07 16:44:39 2014 +0530
>> @@ -87,9 +87,9 @@
>>      //  Access starting position of YUV partition unit buffer
>>      int16_t* getLumaAddr(unsigned int partUnitIdx) { return m_bufY +
>> getAddrOffset(partUnitIdx, m_width); }
>>
>> -    int16_t* getCbAddr(unsigned int partUnitIdx) { return m_bufCb +
>> (getAddrOffset(partUnitIdx, m_cwidth) >> 1); }
>> +    int16_t* getCbAddr(unsigned int partUnitIdx) { return m_bufCb +
>> (getAddrOffset(partUnitIdx, m_cwidth) >> m_hChromaShift); }
>>
>> -    int16_t* getCrAddr(unsigned int partUnitIdx) { return m_bufCr +
>> (getAddrOffset(partUnitIdx, m_cwidth) >> 1); }
>> +    int16_t* getCrAddr(unsigned int partUnitIdx) { return m_bufCr +
>> (getAddrOffset(partUnitIdx, m_cwidth) >> m_hChromaShift); }
>>
>>      //  Access starting position of YUV transform unit buffer
>>      int16_t* getLumaAddr(unsigned int partIdx, unsigned int size) {
>> return m_bufY + getAddrOffset(partIdx, size, m_width); }
>> diff -r 4811da38078c -r f7d21da102ac source/common/ipfilter.cpp
>> --- a/source/common/ipfilter.cpp        Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/common/ipfilter.cpp        Tue Jan 07 16:44:39 2014 +0530
>> @@ -449,74 +449,108 @@
>>  namespace x265 {
>>  // x265 private namespace
>>
>> -#define CHROMA(W, H) \
>> +#define CHROMA_420(W, H) \
>>      p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] =
>> interp_horiz_pp_c<4, W, H>; \
>>      p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] =
>> interp_horiz_ps_c<4, W, H>; \
>> -    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] =
>> interp_vert_pp_c<4, W, H>; \
>> -    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] =
>> interp_vert_ps_c<4, W, H>; \
>> -    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] =
>> interp_vert_sp_c<4, W, H>; \
>> +    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] =
>> interp_vert_pp_c<4, W, H>;  \
>> +    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] =
>> interp_vert_ps_c<4, W, H>;  \
>> +    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] =
>> interp_vert_sp_c<4, W, H>;  \
>>      p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] =
>> interp_vert_ss_c<4, W, H>;
>>
>> +#define CHROMA_444(W, H) \
>> +    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] =
>> interp_horiz_pp_c<4, W, H>; \
>> +    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] =
>> interp_horiz_ps_c<4, W, H>; \
>> +    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] =
>> interp_vert_pp_c<4, W, H>;  \
>> +    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] =
>> interp_vert_ps_c<4, W, H>;  \
>> +    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] =
>> interp_vert_sp_c<4, W, H>;  \
>> +    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] =
>> interp_vert_ss_c<4, W, H>;
>> +
>>  #define LUMA(W, H) \
>>      p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>; \
>>      p.luma_hps[LUMA_ ## W ## x ## H]     = interp_horiz_ps_c<8, W, H>; \
>> -    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>; \
>> -    p.luma_vps[LUMA_ ## W ## x ## H]     = interp_vert_ps_c<8, W, H>; \
>> -    p.luma_vsp[LUMA_ ## W ## x ## H]     = interp_vert_sp_c<8, W, H>; \
>> -    p.luma_vss[LUMA_ ## W ## x ## H]     = interp_vert_ss_c<8, W, H>; \
>> +    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>;  \
>> +    p.luma_vps[LUMA_ ## W ## x ## H]     = interp_vert_ps_c<8, W, H>;  \
>> +    p.luma_vsp[LUMA_ ## W ## x ## H]     = interp_vert_sp_c<8, W, H>;  \
>> +    p.luma_vss[LUMA_ ## W ## x ## H]     = interp_vert_ss_c<8, W, H>;  \
>>      p.luma_hvpp[LUMA_ ## W ## x ## H]    = interp_hv_pp_c<8, W, H>;
>>
>>  void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
>>  {
>>      LUMA(4, 4);
>>      LUMA(8, 8);
>> -    CHROMA(4, 4);
>> +    CHROMA_420(4,  4);
>>      LUMA(4, 8);
>> -    CHROMA(2, 4);
>> +    CHROMA_420(2,  4);
>>      LUMA(8, 4);
>> -    CHROMA(4, 2);
>> +    CHROMA_420(4,  2);
>>      LUMA(16, 16);
>> -    CHROMA(8, 8);
>> +    CHROMA_420(8,  8);
>>      LUMA(16,  8);
>> -    CHROMA(8, 4);
>> +    CHROMA_420(8,  4);
>>      LUMA(8, 16);
>> -    CHROMA(4, 8);
>> +    CHROMA_420(4,  8);
>>      LUMA(16, 12);
>> -    CHROMA(8, 6);
>> +    CHROMA_420(8,  6);
>>      LUMA(12, 16);
>> -    CHROMA(6, 8);
>> +    CHROMA_420(6,  8);
>>      LUMA(16,  4);
>> -    CHROMA(8, 2);
>> +    CHROMA_420(8,  2);
>>      LUMA(4, 16);
>> -    CHROMA(2, 8);
>> +    CHROMA_420(2,  8);
>>      LUMA(32, 32);
>> -    CHROMA(16, 16);
>> +    CHROMA_420(16, 16);
>>      LUMA(32, 16);
>> -    CHROMA(16, 8);
>> +    CHROMA_420(16, 8);
>>      LUMA(16, 32);
>> -    CHROMA(8, 16);
>> +    CHROMA_420(8,  16);
>>      LUMA(32, 24);
>> -    CHROMA(16, 12);
>> +    CHROMA_420(16, 12);
>>      LUMA(24, 32);
>> -    CHROMA(12, 16);
>> +    CHROMA_420(12, 16);
>>      LUMA(32,  8);
>> -    CHROMA(16, 4);
>> +    CHROMA_420(16, 4);
>>      LUMA(8, 32);
>> -    CHROMA(4, 16);
>> +    CHROMA_420(4,  16);
>>      LUMA(64, 64);
>> -    CHROMA(32, 32);
>> +    CHROMA_420(32, 32);
>>      LUMA(64, 32);
>> -    CHROMA(32, 16);
>> +    CHROMA_420(32, 16);
>>      LUMA(32, 64);
>> -    CHROMA(16, 32);
>> +    CHROMA_420(16, 32);
>>      LUMA(64, 48);
>> -    CHROMA(32, 24);
>> +    CHROMA_420(32, 24);
>>      LUMA(48, 64);
>> -    CHROMA(24, 32);
>> +    CHROMA_420(24, 32);
>>      LUMA(64, 16);
>> -    CHROMA(32, 8);
>> +    CHROMA_420(32, 8);
>>      LUMA(16, 64);
>> -    CHROMA(8, 32);
>> +    CHROMA_420(8,  32);
>> +
>> +    CHROMA_444(4,  4);
>> +    CHROMA_444(8,  8);
>> +    CHROMA_444(4,  8);
>> +    CHROMA_444(8,  4);
>> +    CHROMA_444(16, 16);
>> +    CHROMA_444(16, 8);
>> +    CHROMA_444(8,  16);
>> +    CHROMA_444(16, 12);
>> +    CHROMA_444(12, 16);
>> +    CHROMA_444(16, 4);
>> +    CHROMA_444(4,  16);
>> +    CHROMA_444(32, 32);
>> +    CHROMA_444(32, 16);
>> +    CHROMA_444(16, 32);
>> +    CHROMA_444(32, 24);
>> +    CHROMA_444(24, 32);
>> +    CHROMA_444(32, 8);
>> +    CHROMA_444(8,  32);
>> +    CHROMA_444(64, 64);
>> +    CHROMA_444(64, 32);
>> +    CHROMA_444(32, 64);
>> +    CHROMA_444(64, 48);
>> +    CHROMA_444(48, 64);
>> +    CHROMA_444(64, 16);
>> +    CHROMA_444(16, 64);
>>
>>      p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_ps_c<8>;
>>      p.ipfilter_ps[FILTER_V_P_S_4] = filterVertical_ps_c<4>;
>> @@ -525,7 +559,9 @@
>>
>>      p.chroma_vsp = filterVertical_sp_c<4>;
>>      p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
>> -    p.chroma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
>> +
>> +    p.chroma_p2s[X265_CSP_I444] = filterConvertPelToShort_c<MAX_CU_SIZE>;
>> +    p.chroma_p2s[X265_CSP_I420] = filterConvertPelToShort_c<MAX_CU_SIZE
>> / 2>;
>>
>>      p.extendRowBorder = extendCURowColBorder;
>>  }
>> diff -r 4811da38078c -r f7d21da102ac source/common/pixel.cpp
>> --- a/source/common/pixel.cpp   Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/common/pixel.cpp   Tue Jan 07 16:44:39 2014 +0530
>> @@ -805,6 +805,27 @@
>>  namespace x265 {
>>  // x265 private namespace
>>
>> +#define CHROMA_420(W, H) \
>> +    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] =
>> blockcopy_pp_c<W, H>; \
>> +    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] =
>> blockcopy_sp_c<W, H>; \
>> +    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] =
>> blockcopy_ps_c<W, H>; \
>> +    p.chroma[X265_CSP_I420].sub_ps [CHROMA_ ## W ## x ## H] =
>> pixel_sub_ps_c<W, H>; \
>> +    p.chroma[X265_CSP_I420].add_ps [CHROMA_ ## W ## x ## H] =
>> pixel_add_ps_c<W, H>;
>> +
>> +#define CHROMA_444(W, H) \
>> +    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] =
>> blockcopy_pp_c<W, H>; \
>> +    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] =
>> blockcopy_sp_c<W, H>; \
>> +    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] =
>> blockcopy_ps_c<W, H>; \
>> +    p.chroma[X265_CSP_I444].sub_ps [LUMA_ ## W ## x ## H] =
>> pixel_sub_ps_c<W, H>; \
>> +    p.chroma[X265_CSP_I444].add_ps [LUMA_ ## W ## x ## H] =
>> pixel_add_ps_c<W, H>;
>> +
>> +#define LUMA(W, H) \
>> +    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
>> +    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
>> +    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
>> +    p.luma_sub_ps[LUMA_ ## W ## x ## H]  = pixel_sub_ps_c<W, H>; \
>> +    p.luma_add_ps[LUMA_ ## W ## x ## H]  = pixel_add_ps_c<W, H>;
>> +
>>  /* It should initialize entries for pixel functions defined in this
>> file. */
>>  void Setup_C_PixelPrimitives(EncoderPrimitives &p)
>>  {
>> @@ -840,69 +861,81 @@
>>      p.satd[LUMA_64x16] = satd8<64, 16>;
>>      p.satd[LUMA_16x64] = satd8<16, 64>;
>>
>> -#define CHROMA(W, H) \
>> -    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] =
>> blockcopy_pp_c<W, H>; \
>> -    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] =
>> blockcopy_sp_c<W, H>; \
>> -    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] =
>> blockcopy_ps_c<W, H>; \
>> -    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] =
>> pixel_sub_ps_c<W, H>; \
>> -    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] =
>> pixel_add_ps_c<W, H>;
>> -
>> -#define LUMA(W, H) \
>> -    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
>> -    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
>> -    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
>> -    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
>> -    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
>> -
>>      LUMA(4, 4);
>>      LUMA(8, 8);
>> -    CHROMA(4, 4);
>> +    CHROMA_420(4, 4);
>>      LUMA(4, 8);
>> -    CHROMA(2, 4);
>> +    CHROMA_420(2, 4);
>>      LUMA(8, 4);
>> -    CHROMA(4, 2);
>> +    CHROMA_420(4, 2);
>>      LUMA(16, 16);
>> -    CHROMA(8, 8);
>> +    CHROMA_420(8,  8);
>>      LUMA(16,  8);
>> -    CHROMA(8, 4);
>> +    CHROMA_420(8,  4);
>>      LUMA(8, 16);
>> -    CHROMA(4, 8);
>> +    CHROMA_420(4,  8);
>>      LUMA(16, 12);
>> -    CHROMA(8, 6);
>> +    CHROMA_420(8,  6);
>>      LUMA(12, 16);
>> -    CHROMA(6, 8);
>> +    CHROMA_420(6,  8);
>>      LUMA(16,  4);
>> -    CHROMA(8, 2);
>> +    CHROMA_420(8,  2);
>>      LUMA(4, 16);
>> -    CHROMA(2, 8);
>> +    CHROMA_420(2,  8);
>>      LUMA(32, 32);
>> -    CHROMA(16, 16);
>> +    CHROMA_420(16, 16);
>>      LUMA(32, 16);
>> -    CHROMA(16, 8);
>> +    CHROMA_420(16, 8);
>>      LUMA(16, 32);
>> -    CHROMA(8, 16);
>> +    CHROMA_420(8,  16);
>>      LUMA(32, 24);
>> -    CHROMA(16, 12);
>> +    CHROMA_420(16, 12);
>>      LUMA(24, 32);
>> -    CHROMA(12, 16);
>> +    CHROMA_420(12, 16);
>>      LUMA(32,  8);
>> -    CHROMA(16, 4);
>> +    CHROMA_420(16, 4);
>>      LUMA(8, 32);
>> -    CHROMA(4, 16);
>> +    CHROMA_420(4,  16);
>>      LUMA(64, 64);
>> -    CHROMA(32, 32);
>> +    CHROMA_420(32, 32);
>>      LUMA(64, 32);
>> -    CHROMA(32, 16);
>> +    CHROMA_420(32, 16);
>>      LUMA(32, 64);
>> -    CHROMA(16, 32);
>> +    CHROMA_420(16, 32);
>>      LUMA(64, 48);
>> -    CHROMA(32, 24);
>> +    CHROMA_420(32, 24);
>>      LUMA(48, 64);
>> -    CHROMA(24, 32);
>> +    CHROMA_420(24, 32);
>>      LUMA(64, 16);
>> -    CHROMA(32, 8);
>> +    CHROMA_420(32, 8);
>>      LUMA(16, 64);
>> -    CHROMA(8, 32);
>> +    CHROMA_420(8,  32);
>> +
>> +    CHROMA_444(4,  4);
>> +    CHROMA_444(8,  8);
>> +    CHROMA_444(4,  8);
>> +    CHROMA_444(8,  4);
>> +    CHROMA_444(16, 16);
>> +    CHROMA_444(16, 8);
>> +    CHROMA_444(8,  16);
>> +    CHROMA_444(16, 12);
>> +    CHROMA_444(12, 16);
>> +    CHROMA_444(16, 4);
>> +    CHROMA_444(4,  16);
>> +    CHROMA_444(32, 32);
>> +    CHROMA_444(32, 16);
>> +    CHROMA_444(16, 32);
>> +    CHROMA_444(32, 24);
>> +    CHROMA_444(24, 32);
>> +    CHROMA_444(32, 8);
>> +    CHROMA_444(8,  32);
>> +    CHROMA_444(64, 64);
>> +    CHROMA_444(64, 32);
>> +    CHROMA_444(32, 64);
>> +    CHROMA_444(64, 48);
>> +    CHROMA_444(48, 64);
>> +    CHROMA_444(64, 16);
>> +    CHROMA_444(16, 64);
>>
>>      SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
>>      SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t,
>> pixel)
>> diff -r 4811da38078c -r f7d21da102ac source/common/primitives.h
>> --- a/source/common/primitives.h        Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/common/primitives.h        Tue Jan 07 16:44:39 2014 +0530
>> @@ -75,7 +75,7 @@
>>  // 4:2:0 chroma partition sizes. These enums are just a convenience for
>> indexing into the
>>  // chroma primitive arrays when instantiating templates. The function
>> tables should always
>>  // be indexed by the luma partition enum
>> -enum Chroma420Partions
>> +enum Chroma420Partitions
>>  {
>>      CHROMA_2x2, // never used by HEVC
>>      CHROMA_4x4,   CHROMA_4x2,   CHROMA_2x4,
>> @@ -240,7 +240,7 @@
>>      ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];
>>      ipfilter_ss_t   ipfilter_ss[NUM_IPFILTER_S_S];
>>      filter_p2s_t    luma_p2s;
>> -    filter_p2s_t    chroma_p2s;
>> +    filter_p2s_t    chroma_p2s[NUM_CHROMA_PARTITIONS];
>>      ipfilter_sp_t   chroma_vsp;
>>
>>      weightp_sp_t    weight_sp;
>> diff -r 4811da38078c -r f7d21da102ac source/encoder/encoder.cpp
>> --- a/source/encoder/encoder.cpp        Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/encoder/encoder.cpp        Tue Jan 07 16:44:39 2014 +0530
>> @@ -1288,6 +1288,8 @@
>>          bEnableRDOQTS = 0;
>>      }
>>
>> +    m_csp = _param->internalCsp;
>> +
>>      //====== Coding Tools ========
>>
>>      uint32_t tuQTMaxLog2Size = g_convertToBit[_param->maxCUSize] + 2 - 1;
>> diff -r 4811da38078c -r f7d21da102ac source/encoder/frameencoder.cpp
>> --- a/source/encoder/frameencoder.cpp   Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/encoder/frameencoder.cpp   Tue Jan 07 16:44:39 2014 +0530
>> @@ -330,11 +330,11 @@
>>      // instead we weight the distortion of chroma.
>>      int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() +
>> slice->getSliceQpDeltaCb();
>>      int qpc = Clip3(0, 70, qp + chromaQPOffset);
>> -    double cbWeight = pow(2.0, (qp - g_chromaScale[qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>> +    double cbWeight = pow(2.0, (qp -
>> g_chromaScale[slice->getSPS()->getChromaFormatIdc()][qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>>
>>      chromaQPOffset = slice->getPPS()->getChromaCrQpOffset() +
>> slice->getSliceQpDeltaCr();
>>      qpc = Clip3(0, 70, qp + chromaQPOffset);
>> -    double crWeight = pow(2.0, (qp - g_chromaScale[qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>> +    double crWeight = pow(2.0, (qp -
>> g_chromaScale[slice->getSPS()->getChromaFormatIdc()][qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>>      double chromaLambda = lambda / crWeight;
>>
>>      m_rows[row].m_search.setQPLambda(qp, lambda, chromaLambda);
>> @@ -369,10 +369,10 @@
>>      int qpc;
>>      int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() +
>> slice->getSliceQpDeltaCb();
>>      qpc = Clip3(0, 70, qp + chromaQPOffset);
>> -    double cbWeight = pow(2.0, (qp - g_chromaScale[qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>> +    double cbWeight = pow(2.0, (qp -
>> g_chromaScale[slice->getSPS()->getChromaFormatIdc()][qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>>      chromaQPOffset = slice->getPPS()->getChromaCrQpOffset() +
>> slice->getSliceQpDeltaCr();
>>      qpc = Clip3(0, 70, qp + chromaQPOffset);
>> -    double crWeight = pow(2.0, (qp - g_chromaScale[qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>> +    double crWeight = pow(2.0, (qp -
>> g_chromaScale[slice->getSPS()->getChromaFormatIdc()][qpc])); // takes into
>> account of the chroma qp mapping and chroma qp Offset
>>      double chromaLambda = lambda / crWeight;
>>
>>      // NOTE: set SAO lambda every Frame
>> diff -r 4811da38078c -r f7d21da102ac source/encoder/framefilter.cpp
>> --- a/source/encoder/framefilter.cpp    Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/encoder/framefilter.cpp    Tue Jan 07 16:44:39 2014 +0530
>> @@ -64,6 +64,9 @@
>>      m_cfg = top;
>>      m_numRows = numRows;
>>
>> +    m_hChromaShift = CHROMA_H_SHIFT(m_cfg->getColorFormat());
>> +    m_vChromaShift = CHROMA_V_SHIFT(m_cfg->getColorFormat());
>> +
>>      // NOTE: for sao only, I write this code because I want to exact
>> match with HM's bug bitstream
>>      m_rdGoOnSbacCoderRow0 = rdGoOnSbacCoder;
>>
>> @@ -77,7 +80,7 @@
>>          m_sao.setSaoLcuBoundary(top->param.saoLcuBoundary);
>>
>>  m_sao.setSaoLcuBasedOptimization(top->param.saoLcuBasedOptimization);
>>          m_sao.setMaxNumOffsetsPerPic(top->getMaxNumOffsetsPerPic());
>> -        m_sao.create(top->param.sourceWidth, top->param.sourceHeight,
>> g_maxCUWidth, g_maxCUHeight);
>> +        m_sao.create(top->param.sourceWidth, top->param.sourceHeight,
>> g_maxCUWidth, g_maxCUHeight, m_cfg->getColorFormat());
>>          m_sao.createEncBuffer();
>>      }
>>
>> @@ -222,8 +225,8 @@
>>
>>      // Border extend Left and Right
>>      primitives.extendRowBorder(recon->getLumaAddr(lineStartCUAddr),
>> recon->getStride(), recon->getWidth(), realH, recon->getLumaMarginX());
>> -    primitives.extendRowBorder(recon->getCbAddr(lineStartCUAddr),
>> recon->getCStride(), recon->getWidth() >> 1, realH >> 1,
>> recon->getChromaMarginX());
>> -    primitives.extendRowBorder(recon->getCrAddr(lineStartCUAddr),
>> recon->getCStride(), recon->getWidth() >> 1, realH >> 1,
>> recon->getChromaMarginX());
>> +    primitives.extendRowBorder(recon->getCbAddr(lineStartCUAddr),
>> recon->getCStride(), recon->getWidth() >> m_hChromaShift, realH >>
>> m_vChromaShift, recon->getChromaMarginX());
>> +    primitives.extendRowBorder(recon->getCrAddr(lineStartCUAddr),
>> recon->getCStride(), recon->getWidth() >> m_hChromaShift, realH >>
>> m_vChromaShift, recon->getChromaMarginX());
>>
>>      // Border extend Top
>>      if (row == 0)
>> @@ -252,8 +255,8 @@
>>          const intptr_t stride = recon->getStride();
>>          const intptr_t strideC = recon->getCStride();
>>          pixel *pixY = recon->getLumaAddr(lineStartCUAddr) -
>> recon->getLumaMarginX() + (realH - 1) * stride;
>> -        pixel *pixU = recon->getCbAddr(lineStartCUAddr) -
>> recon->getChromaMarginX() + ((realH >> 1) - 1) * strideC;
>> -        pixel *pixV = recon->getCrAddr(lineStartCUAddr) -
>> recon->getChromaMarginX() + ((realH >> 1) - 1) * strideC;
>> +        pixel *pixU = recon->getCbAddr(lineStartCUAddr) -
>> recon->getChromaMarginX() + ((realH >> m_vChromaShift) - 1) * strideC;
>> +        pixel *pixV = recon->getCrAddr(lineStartCUAddr) -
>> recon->getChromaMarginX() + ((realH >> m_vChromaShift) - 1) * strideC;
>>
>>          for (int y = 0; y < recon->getLumaMarginY(); y++)
>>          {
>> @@ -290,8 +293,8 @@
>>
>>          uint64_t ssdY = computeSSD(orig->getLumaAddr(cuAddr),
>> recon->getLumaAddr(cuAddr), stride, width, height);
>>
>> -        height >>= 1;
>> -        width  >>= 1;
>> +        height >>= m_vChromaShift;
>> +        width  >>= m_hChromaShift;
>>          stride = recon->getCStride();
>>
>>          uint64_t ssdU = computeSSD(orig->getCbAddr(cuAddr),
>> recon->getCbAddr(cuAddr), stride, width, height);
>> @@ -337,8 +340,8 @@
>>
>>          updateMD5Plane(m_pic->m_state[0], recon->getLumaAddr(cuAddr),
>> width, height, stride);
>>
>> -        width >>= 1;
>> -        height >>= 1;
>> +        width  >>= m_hChromaShift;
>> +        height >>= m_vChromaShift;
>>          stride = recon->getCStride();
>>
>>          updateMD5Plane(m_pic->m_state[1], recon->getCbAddr(cuAddr),
>> width, height, stride);
>> @@ -356,8 +359,8 @@
>>          }
>>          updateCRC(recon->getLumaAddr(cuAddr), m_pic->m_crc[0], height,
>> width, stride);
>>
>> -        width >>= 1;
>> -        height >>= 1;
>> +        width  >>= m_hChromaShift;
>> +        height >>= m_vChromaShift;
>>          stride = recon->getCStride();
>>
>>          updateCRC(recon->getCbAddr(cuAddr), m_pic->m_crc[1], height,
>> width, stride);
>> @@ -374,10 +377,10 @@
>>              m_pic->m_checksum[0] = m_pic->m_checksum[1] =
>> m_pic->m_checksum[2] = 0;
>>          }
>>          updateChecksum(recon->getLumaAddr(), m_pic->m_checksum[0],
>> height, width, stride, row, cuHeight);
>> -        width >>= 1;
>> -        height >>= 1;
>> +        width  >>= m_hChromaShift;
>> +        height >>= m_vChromaShift;
>>          stride = recon->getCStride();
>> -        cuHeight >>= 1;
>> +        cuHeight >>= m_vChromaShift;
>>          updateChecksum(recon->getCbAddr(), m_pic->m_checksum[1], height,
>> width, stride, row, cuHeight);
>>          updateChecksum(recon->getCrAddr(), m_pic->m_checksum[2], height,
>> width, stride, row, cuHeight);
>>      }
>> diff -r 4811da38078c -r f7d21da102ac source/encoder/framefilter.h
>> --- a/source/encoder/framefilter.h      Mon Jan 06 23:15:58 2014 -0600
>> +++ b/source/encoder/framefilter.h      Tue Jan 07 16:44:39 2014 +0530
>> @@ -59,6 +59,9 @@
>>      TEncCfg*                    m_cfg;
>>      TComPic*                    m_pic;
>>
>> +    int                         m_hChromaShift;
>> +    int                         m_vChromaShift;
>> +
>>  public:
>>
>>      TComLoopFilter              m_loopFilter;
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Steve Borho
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140108/4668d706/attachment-0001.html>