[x265] refine cbf==0 path: remove clearing coeff and resi

Deepthi Nandakumar deepthi at multicorewareinc.com
Mon Jun 2 12:50:16 CEST 2014


Thanks Satoshi. Pushed for testing.


On Mon, Jun 2, 2014 at 8:17 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:

> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1401677099 -32400
> #      Mon Jun 02 11:44:59 2014 +0900
> # Node ID 73f86312c2e0aa5a105e84b0045478e02c8a03e7
> # Parent  a5998df9b12ef81e48e7c5b89219a74276a75f27
> refine cbf==0 path: remove clearing coeff and resi
>
> diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncEntropy.cpp
> --- a/source/Lib/TLibEncoder/TEncEntropy.cpp    Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncEntropy.cpp    Mon Jun 02 11:44:59 2014 +0900
> @@ -202,7 +202,6 @@
>
>  void TEncEntropy::initTUEntropySection(TComTURecurse *tuIterator,
> uint32_t splitMode, uint32_t absPartIdxStep, uint32_t m_absPartIdxTU)
>  {
> -    tuIterator->m_partOffset        = 0;
>      tuIterator->m_section           = 0;
>      tuIterator->m_absPartIdxTURelCU = m_absPartIdxTU;
>      tuIterator->m_splitMode         = splitMode;
> diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncEntropy.h
> --- a/source/Lib/TLibEncoder/TEncEntropy.h      Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncEntropy.h      Mon Jun 02 11:44:59 2014 +0900
> @@ -66,7 +66,6 @@
>      uint32_t          m_splitMode;
>      uint32_t          m_absPartIdxTURelCU;
>      uint32_t          m_absPartIdxStep;
> -    uint32_t          m_partOffset;
>  };
>
>  // ====================================================================================================================
> diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncSbac.cpp
> --- a/source/Lib/TLibEncoder/TEncSbac.cpp       Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSbac.cpp       Mon Jun 02 11:44:59 2014 +0900
> @@ -2120,8 +2120,9 @@
>      // compute number of significant coefficients
>      uint32_t numSig = primitives.count_nonzero(coeff, trSize * trSize);
>
> -    if (numSig == 0)
> -        return;
> +#if CHECKED_BUILD || _DEBUG
> +    X265_CHECK(numSig > 0, "cbf check fail");
> +#endif
>
>      bool beValid;
>      if (cu->getCUTransquantBypass(absPartIdx))
> diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jun 02 11:44:59 2014 +0900
> @@ -408,8 +408,8 @@
>      coeff_t* coeff          = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
>
>      int16_t* reconQt        =
> m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
> -
>      X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width
> is not max CU size\n");
> +    const uint32_t reconQtStride = MAX_CU_SIZE;
>
>      uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
>      pixel*   reconIPred       =
> cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
> @@ -443,25 +443,29 @@
>      //--- set coded block flag ---
>      cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA,
> absPartIdx, fullDepth);
>
> -    //--- inverse transform ---
>      if (absSum)
>      {
> +        //--- inverse transform ---
>          int scalingListType = 0 + TEXT_LUMA;
> -        X265_CHECK(scalingListType < 6, "scalingListType is too large
> %d\n", scalingListType);
> +        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n",
> scalingListType);
>          m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx),
> cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize,
> scalingListType, useTransformSkip, lastPos);
> +        X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
> +        //===== reconstruction =====
> +        primitives.calcrecon[sizeIdx](pred, residual, reconQt,
> reconIPred, stride, reconQtStride, reconIPredStride);
> +        //===== update distortion =====
> +        outDist += primitives.sse_sp[part](reconQt, reconQtStride, fenc,
> stride);
>      }
>      else
>      {
> -        int16_t* resiTmp = residual;
> +#if CHECKED_BUILD || _DEBUG
>          memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -        primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> +#endif
> +        //===== reconstruction =====
> +        primitives.luma_copy_ps[part](reconQt,    reconQtStride,    pred,
> stride);
> +        primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred,
> stride);
> +        //===== update distortion =====
> +        outDist += primitives.sse_pp[part](pred, stride, fenc, stride);
>      }
> -
> -    X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
> -    //===== reconstruction =====
> -    primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred,
> stride, MAX_CU_SIZE, reconIPredStride);
> -    //===== update distortion =====
> -    outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc,
> stride);
>  }
>
>  void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
> @@ -519,67 +523,67 @@
>      primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>      //===== transform and quantization =====
> +    //--- init rate estimation arrays for RDOQ ---
> +    if (useTransformSkipChroma ? m_cfg->bEnableRDOQTS :
> m_cfg->bEnableRDOQ)
>      {
> -        //--- init rate estimation arrays for RDOQ ---
> -        if (useTransformSkipChroma ? m_cfg->bEnableRDOQTS :
> m_cfg->bEnableRDOQ)
> -        {
> -            m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, tuSize,
> ttype);
> -        }
> -        //--- transform and quantization ---
> -        uint32_t absSum = 0;
> -        int lastPos = -1;
> -
> -        int curChromaQpOffset;
> -        if (ttype == TEXT_CHROMA_U)
> -        {
> -            curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
> cu->getSlice()->getSliceQpDeltaCb();
> -        }
> -        else
> -        {
> -            curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
> cu->getSlice()->getSliceQpDeltaCr();
> -        }
> -        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -        m_trQuant->selectLambda(TEXT_CHROMA);
> -
> -        absSum = m_trQuant->transformNxN(cu, residual, stride, coeff,
> tuSize, ttype, absPartIdx, &lastPos, useTransformSkipChroma);
> -
> -        //--- set coded block flag ---
> -        cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth),
> ttype, absPartIdx, absPartIdxStep);
> -
> +        m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, tuSize,
> ttype);
> +    }
> +
> +    //--- transform and quantization ---
> +    uint32_t absSum = 0;
> +    int lastPos = -1;
> +
> +    int curChromaQpOffset;
> +    if (ttype == TEXT_CHROMA_U)
> +    {
> +        curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
> cu->getSlice()->getSliceQpDeltaCb();
> +    }
> +    else
> +    {
> +        curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
> cu->getSlice()->getSliceQpDeltaCr();
> +    }
> +    m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> +    m_trQuant->selectLambda(TEXT_CHROMA);
> +
> +    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize,
> ttype, absPartIdx, &lastPos, useTransformSkipChroma);
> +
> +    //--- set coded block flag ---
> +    cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype,
> absPartIdx, absPartIdxStep);
> +
> +    uint32_t dist;
> +    if (absSum)
> +    {
>          //--- inverse transform ---
> -        if (absSum)
> -        {
> -            int scalingListType = 0 + ttype;
> -            X265_CHECK(scalingListType < 6, "scalingListType invalid
> %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma,
> lastPos);
> -        }
> -        else
> -        {
> -            int16_t* resiTmp = residual;
> -            memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -            primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> -        }
> +        int scalingListType = 0 + ttype;
> +        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n",
> scalingListType);
> +        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx),
> REG_DCT, residual, stride, coeff, tuSize, scalingListType,
> useTransformSkipChroma, lastPos);
> +        X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
> +        //===== reconstruction =====
> +        primitives.calcrecon[sizeIdx](pred, residual, reconQt,
> reconIPred, stride, reconQtStride, reconIPredStride);
> +        //===== update distortion =====
> +        dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc,
> stride);
>      }
> -
> -    X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual
> alignment check failure\n");
> -    X265_CHECK(tuSize <= 32, "tuSize invalud\n");
> -    //===== reconstruction =====
> -    primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred,
> stride, reconQtStride, reconIPredStride);
> -    //===== update distortion =====
> -    uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc,
> stride);
> +    else
> +    {
> +#if CHECKED_BUILD || _DEBUG
> +        memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> +#endif
> +        //===== reconstruction =====
> +        primitives.square_copy_ps[sizeIdx](reconQt,    reconQtStride,
>  pred, stride);
> +        primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride,
> pred, stride);
> +        //===== update distortion =====
> +        dist = primitives.sse_pp[part](pred, stride, fenc, stride);
> +    }
> +
> +    X265_CHECK(ttype == TEXT_CHROMA_U || ttype == TEXT_CHROMA_V, "invalid
> ttype\n");
>      if (ttype == TEXT_CHROMA_U)
>      {
>          outDist += m_rdCost->scaleChromaDistCb(dist);
>      }
> -    else if (ttype == TEXT_CHROMA_V)
> +    else
>      {
>          outDist += m_rdCost->scaleChromaDistCr(dist);
>      }
> -    else
> -    {
> -        outDist += dist;
> -    }
>  }
>
>  void TEncSearch::xRecurIntraCodingQT(TComDataCU* cu,
> @@ -784,15 +788,15 @@
>          cu->setTransformSkipSubParts(bestModeId, TEXT_LUMA, absPartIdx,
> fullDepth);
>
>          //--- set reconstruction for next intra prediction blocks ---
> -        uint32_t width     = cu->getCUSize(0) >> trDepth;
> -        uint32_t height    = cu->getCUSize(0) >> trDepth;
>          uint32_t qtLayer   =
> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>          uint32_t zorder    = cu->getZorderIdxInCU() + absPartIdx;
>          int16_t* src       =
> m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
>          X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE,
> "width is not max CU size\n");
> +        const uint32_t srcstride = MAX_CU_SIZE;
>          pixel*   dst       =
> cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
>          uint32_t dststride = cu->getPic()->getPicYuvRec()->getStride();
> -        primitives.blockcpy_ps(width, height, dst, dststride, src,
> MAX_CU_SIZE);
> +        int sizeIdx = trSizeLog2 - 2;
> +        primitives.square_copy_sp[sizeIdx](dst, dststride, src,
> srcstride);
>      }
>
>      outDistY += singleDistY;
> @@ -866,25 +870,29 @@
>          //--- set coded block flag ---
>          cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA,
> absPartIdx, fullDepth);
>
> -        //--- inverse transform ---
> +        int part = partitionFromSize(tuSize);
> +
>          if (absSum)
>          {
> +            //--- inverse transform ---
>              int scalingListType = 0 + TEXT_LUMA;
>              X265_CHECK(scalingListType < 6, "scalingListType %d\n",
> scalingListType);
>
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx),
> cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize,
> scalingListType, useTransformSkip, lastPos);
> +
> +            // Generate Recon
> +            primitives.luma_add_ps[part](recon, stride, pred, residual,
> stride, stride);
> +            primitives.luma_copy_pp[part](reconIPred, reconIPredStride,
> recon, stride);
>          }
>          else
>          {
> -            int16_t* resiTmp = residual;
> +#if CHECKED_BUILD || _DEBUG
>              memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -            primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> +#endif
> +
> +            // Generate Recon
> +            primitives.luma_copy_pp[part](recon,      stride,
> pred, stride);
> +            primitives.luma_copy_pp[part](reconIPred, reconIPredStride,
> pred, stride);
>          }
> -
> -        //Generate Recon
> -        X265_CHECK(tuSize <= 32, "tuSize is too large\n");
> -        int part = partitionFromSize(tuSize);
> -        primitives.luma_add_ps[part](recon, stride, pred, residual,
> stride, stride);
> -        primitives.blockcpy_pp(tuSize, tuSize, reconIPred,
> reconIPredStride, recon, stride);
>      }
>
>      if (bCheckSplit && !bCheckFull)
> @@ -980,8 +988,10 @@
>      pixel*     reconIPred       =
> cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zOrder);
>      uint32_t   reconIPredStride =
> cu->getPic()->getPicYuvRec()->getStride();
>      int16_t*   reconQt          =
> m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> -    primitives.blockcpy_ps(trSize, trSize, reconIPred, reconIPredStride,
> reconQt, MAX_CU_SIZE);
>      X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width
> is not max CU size\n");
> +    const uint32_t reconQtStride = MAX_CU_SIZE;
> +    int sizeIdx = trSizeLog2 - 2;
> +    primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride,
> reconQt, reconQtStride);
>  }
>
>  void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t
> trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
> @@ -1059,8 +1069,7 @@
>          }
>
>          //===== copy transform coefficients =====
> -        uint32_t trSizeC  = 1 << trSizeCLog2;
> -        uint32_t numCoeffC = 1 << trSizeCLog2 * 2;
> +        uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);
>          uint32_t coeffOffsetC = absPartIdx <<
> (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
>
>          coeff_t* coeffDst = m_qtTempCoeff[chromaId][qtlayer] +
> coeffOffsetC;
> @@ -1072,12 +1081,13 @@
>
>  m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer],
> absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
>
>          uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
> -        uint32_t reconQtStride    = m_qtTempShortYuv[qtlayer].m_cwidth;
>          uint32_t reconIPredStride =
> cu->getPic()->getPicYuvRec()->getCStride();
>
>          pixel* reconIPred =
> cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(),
> zorder);
>          int16_t* reconQt  =
> m_qtTempShortYuv[qtlayer].getChromaAddr(chromaId, absPartIdx);
> -        primitives.blockcpy_ps(trSizeC, trSizeC, reconIPred,
> reconIPredStride, reconQt, reconQtStride);
> +        uint32_t reconQtStride    = m_qtTempShortYuv[qtlayer].m_cwidth;
> +        int sizeIdxC = trSizeCLog2 - 2;
> +        primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride,
> reconQt, reconQtStride);
>      }
>  }
>
> @@ -1387,6 +1397,7 @@
>          uint32_t stride = fencYuv->getCStride();
>          const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>          int sizeIdx = g_convertToBit[tuSize];
> +        int part = partitionFromSize(tuSize);
>
>          for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <=
> TEXT_CHROMA_V; chromaId++)
>          {
> @@ -1456,28 +1467,28 @@
>                  //--- set coded block flag ---
>                  cu->setCbfPartRange((((absSum > 0) ? 1 : 0) <<
> origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> -                //--- inverse transform ---
>                  if (absSum)
>                  {
> +                    //--- inverse transform ---
>                      int scalingListType = 0 + ttype;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
>
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, residual, stride, coeff, tuSize, scalingListType,
> useTransformSkipChroma, lastPos);
> +
> +                    //===== reconstruction =====
> +                    // use square primitives
> +                    primitives.chroma[CHROMA_444].add_ps[part](recon,
> stride, pred, residual, stride, stride);
> +                    primitives.square_copy_pp[sizeIdx](reconIPred,
> reconIPredStride, recon, stride);
>                  }
>                  else
>                  {
> -                    int16_t* resiTmp = residual;
> +#if CHECKED_BUILD || _DEBUG
>                      memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> -                    primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> +#endif
> +
> +                    //===== reconstruction =====
> +                    primitives.square_copy_pp[sizeIdx](recon,
>  stride,           pred, stride);
> +                    primitives.square_copy_pp[sizeIdx](reconIPred,
> reconIPredStride, pred, stride);
>                  }
> -
> -                //===== reconstruction =====
> -                X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0,
> "residual alignment check failed\n");
> -                X265_CHECK(tuSize <= 32, "tuSize out of range\n");
> -
> -                // use square primitive
> -                int part = partitionFromSize(tuSize);
> -                primitives.chroma[CHROMA_444].add_ps[part](recon, stride,
> pred, residual, stride, stride);
> -                primitives.chroma[CHROMA_444].copy_pp[part](reconIPred,
> reconIPredStride, recon, stride);
>              }
>              while (isNextSection(&tuIterator));
>
> @@ -1859,7 +1870,6 @@
>
>  void TEncSearch::initSection(TComTURecurse *tuIterator, uint32_t
> splitMode, uint32_t absPartIdxStep, uint32_t m_absPartIdxTU)
>  {
> -    tuIterator->m_partOffset        = 0;
>      tuIterator->m_section           = 0;
>      tuIterator->m_absPartIdxTURelCU = m_absPartIdxTU;
>      tuIterator->m_splitMode         = splitMode;
> @@ -1874,16 +1884,21 @@
>  {
>      uint32_t depth              = cu->getDepth(0);
>      uint32_t initTrDepth        = (cu->getPartitionSize(0) != SIZE_2Nx2N)
> && (cu->getChromaFormat() == CHROMA_444 ? 1 : 0);
> -
> +    uint32_t tuSize             = cu->getCUSize(0) >> initTrDepth;
>      uint32_t splitMode          = (initTrDepth == 0) ? DONT_SPLIT :
> QUAD_SPLIT;
>      uint32_t absPartIdx         = (cu->getPic()->getNumPartInCU() >>
> (depth << 1));
>
> +    int chFmt = cu->getChromaFormat();
> +    int part = partitionFromSize(tuSize);
> +
>      TComTURecurse tuIterator;
>
>      initSection(&tuIterator, splitMode, absPartIdx);
>
>      do
>      {
> +        uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> +
>          uint32_t bestMode           = 0;
>          uint32_t bestDist           = 0;
>          uint64_t bestCost           = MAX_INT64;
> @@ -1893,9 +1908,7 @@
>          uint32_t maxMode = NUM_CHROMA_MODE;
>          uint32_t modeList[NUM_CHROMA_MODE];
>
> -        tuIterator.m_partOffset = tuIterator.m_absPartIdxTURelCU;
> -
> -        cu->getAllowedChromaDir(tuIterator.m_partOffset, modeList);
> +        cu->getAllowedChromaDir(absPartIdxC, modeList);
>
>          //----- check chroma modes -----
>          for (uint32_t mode = minMode; mode < maxMode; mode++)
> @@ -1906,16 +1919,16 @@
>              //----- chroma coding -----
>              uint32_t dist = 0;
>
> -            cu->setChromIntraDirSubParts(modeList[mode],
> tuIterator.m_partOffset, depth + initTrDepth);
> -
> -            xRecurIntraChromaCodingQT(cu, initTrDepth,
> tuIterator.m_absPartIdxTURelCU, fencYuv, predYuv, resiYuv, dist);
> +            cu->setChromIntraDirSubParts(modeList[mode], absPartIdxC,
> depth + initTrDepth);
> +
> +            xRecurIntraChromaCodingQT(cu, initTrDepth, absPartIdxC,
> fencYuv, predYuv, resiYuv, dist);
>
>              if (cu->getSlice()->getPPS()->getUseTransformSkip())
>              {
>
>  m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
>              }
>
> -            uint32_t bits = xGetIntraBitsQT(cu, initTrDepth,
> tuIterator.m_absPartIdxTURelCU, tuIterator.m_absPartIdxStep, false, true);
> +            uint32_t bits = xGetIntraBitsQT(cu, initTrDepth, absPartIdxC,
> tuIterator.m_absPartIdxStep, false, true);
>              uint64_t cost = m_rdCost->calcRdCost(dist, bits);
>
>              //----- compare -----
> @@ -1924,37 +1937,36 @@
>                  bestCost = cost;
>                  bestDist = dist;
>                  bestMode = modeList[mode];
> -                xSetIntraResultChromaQT(cu, initTrDepth,
> tuIterator.m_absPartIdxTURelCU, reconYuv);
> -                ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) +
> tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -                ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) +
> tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -                ::memcpy(m_qtTempTransformSkipFlag[1],
> cu->getTransformSkip(TEXT_CHROMA_U) + tuIterator.m_partOffset,
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -                ::memcpy(m_qtTempTransformSkipFlag[2],
> cu->getTransformSkip(TEXT_CHROMA_V) + tuIterator.m_partOffset,
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> +                xSetIntraResultChromaQT(cu, initTrDepth, absPartIdxC,
> reconYuv);
> +                ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) +
> absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> +                ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) +
> absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> +                ::memcpy(m_qtTempTransformSkipFlag[1],
> cu->getTransformSkip(TEXT_CHROMA_U) + absPartIdxC,
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> +                ::memcpy(m_qtTempTransformSkipFlag[2],
> cu->getTransformSkip(TEXT_CHROMA_V) + absPartIdxC,
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
>              }
>          }
>
>          if (!isLastSection(&tuIterator))
>          {
> -            uint32_t compWidth   = (cu->getCUSize(0) >> m_hChromaShift)
> >> initTrDepth;
> -            uint32_t compHeight  = (cu->getCUSize(0) >> m_vChromaShift)
> >> initTrDepth;
> -            uint32_t zorder      = cu->getZorderIdxInCU() +
> tuIterator.m_partOffset;
> -            pixel*     dst         =
> cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
> +            uint32_t zorder      = cu->getZorderIdxInCU() + absPartIdxC;
>              uint32_t dststride   =
> cu->getPic()->getPicYuvRec()->getCStride();
> -            pixel*     src         =
> reconYuv->getCbAddr(tuIterator.m_partOffset);
>              uint32_t srcstride   = reconYuv->getCStride();
> -
> -            primitives.blockcpy_pp(compWidth, compHeight, dst, dststride,
> src, srcstride);
> -
> -            dst                 =
> cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
> -            src                 =
> reconYuv->getCrAddr(tuIterator.m_partOffset);
> -            primitives.blockcpy_pp(compWidth, compHeight, dst, dststride,
> src, srcstride);
> +            pixel *src, *dst;
> +
> +            dst = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(),
> zorder);
> +            src = reconYuv->getCbAddr(absPartIdxC);
> +            primitives.chroma[chFmt].copy_pp[part](dst, dststride, src,
> srcstride);
> +
> +            dst = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(),
> zorder);
> +            src = reconYuv->getCrAddr(absPartIdxC);
> +            primitives.chroma[chFmt].copy_pp[part](dst, dststride, src,
> srcstride);
>          }
>
>          //----- set data -----
> -        ::memcpy(cu->getCbf(TEXT_CHROMA_U) + tuIterator.m_partOffset,
> m_qtTempCbf[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -        ::memcpy(cu->getCbf(TEXT_CHROMA_V) + tuIterator.m_partOffset,
> m_qtTempCbf[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) +
> tuIterator.m_partOffset, m_qtTempTransformSkipFlag[1],
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) +
> tuIterator.m_partOffset, m_qtTempTransformSkipFlag[2],
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> -        cu->setChromIntraDirSubParts(bestMode, tuIterator.m_partOffset,
> depth + initTrDepth);
> +        ::memcpy(cu->getCbf(TEXT_CHROMA_U) + absPartIdxC, m_qtTempCbf[1],
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> +        ::memcpy(cu->getCbf(TEXT_CHROMA_V) + absPartIdxC, m_qtTempCbf[2],
> tuIterator.m_absPartIdxStep * sizeof(uint8_t));
> +        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) + absPartIdxC,
> m_qtTempTransformSkipFlag[1], tuIterator.m_absPartIdxStep *
> sizeof(uint8_t));
> +        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) + absPartIdxC,
> m_qtTempTransformSkipFlag[2], tuIterator.m_absPartIdxStep *
> sizeof(uint8_t));
> +        cu->setChromIntraDirSubParts(bestMode, absPartIdxC, depth +
> initTrDepth);
>          cu->m_totalDistortion += bestDist;
>      }
>      while (isNextSection(&tuIterator));
> @@ -2685,9 +2697,11 @@
>          ::memset(cu->getCbf(TEXT_LUMA), 0, qpartnum * sizeof(uint8_t));
>          ::memset(cu->getCbf(TEXT_CHROMA_U), 0, qpartnum *
> sizeof(uint8_t));
>          ::memset(cu->getCbf(TEXT_CHROMA_V), 0, qpartnum *
> sizeof(uint8_t));
> +#if CHECKED_BUILD || _DEBUG
>          ::memset(cu->getCoeffY(), 0, cuSize * cuSize * sizeof(coeff_t));
>          ::memset(cu->getCoeffCb(), 0, cuSize * cuSize * sizeof(coeff_t)
> >> (m_hChromaShift + m_vChromaShift));
>          ::memset(cu->getCoeffCr(), 0, cuSize * cuSize * sizeof(coeff_t)
> >> (m_hChromaShift + m_vChromaShift));
> +#endif
>          cu->setTransformSkipSubParts(0, 0, 0, 0, cu->getDepth(0));
>      }
>      else
> @@ -2841,25 +2855,26 @@
>          m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET,
> 0, chFmt);
>          m_trQuant->selectLambda(TEXT_LUMA);
>
> -        absSumY = m_trQuant->transformNxN(cu,
> resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> +        int16_t *curResiY = resiYuv->getLumaAddr(absPartIdx);
> +        const uint32_t strideResiY = resiYuv->m_width;
> +        const uint32_t strideResiC = resiYuv->m_cwidth;
> +
> +        absSumY = m_trQuant->transformNxN(cu, curResiY, strideResiY,
> coeffCurY,
>                                            trSize, TEXT_LUMA, absPartIdx,
> &lastPosY, false, curuseRDOQ);
>
>          cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx,
> depth);
>
>          if (absSumY)
>          {
> -            int16_t *curResiY = resiYuv->getLumaAddr(absPartIdx);
> -
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
> QP_BD_OFFSET, 0, chFmt);
>
>              int scalingListType = 3 + TEXT_LUMA;
>              X265_CHECK(scalingListType < 6, "scalingListType too large
> %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> curResiY, resiYuv->m_width,  coeffCurY, trSize, scalingListType, false,
> lastPosY); //this is for inter mode only
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false,
> lastPosY); //this is for inter mode only
>          }
>          else
>          {
> -            int16_t *ptr = resiYuv->getLumaAddr(absPartIdx);
> -            primitives.blockfill_s[sizeIdx](ptr, resiYuv->m_width, 0);
> +            primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
>          }
>          cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx,
> depth);
>
> @@ -2873,6 +2888,9 @@
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
>                  uint32_t subTUBufferOffset = trSizeC * trSizeC *
> tuIterator.m_section;
>
> +                int16_t *curResiU = resiYuv->getCbAddr(absPartIdxC);
> +                int16_t *curResiV = resiYuv->getCrAddr(absPartIdxC);
> +
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_U,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_V,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>
> @@ -2881,12 +2899,12 @@
>
>                  m_trQuant->selectLambda(TEXT_CHROMA);
>
> -                absSumU = m_trQuant->transformNxN(cu,
> resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU +
> subTUBufferOffset,
> +                absSumU = m_trQuant->transformNxN(cu, curResiU,
> strideResiC, coeffCurU + subTUBufferOffset,
>                                                    trSizeC, TEXT_CHROMA_U,
> absPartIdxC, &lastPosU, false, curuseRDOQ);
>
>                  curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
> cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSumV = m_trQuant->transformNxN(cu,
> resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV +
> subTUBufferOffset,
> +                absSumV = m_trQuant->transformNxN(cu, curResiV,
> strideResiC, coeffCurV + subTUBufferOffset,
>                                                    trSizeC, TEXT_CHROMA_V,
> absPartIdxC, &lastPosV, false, curuseRDOQ);
>
>                  cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U,
> absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -2894,34 +2912,29 @@
>
>                  if (absSumU)
>                  {
> -                    int16_t *pcResiCurrU =
> resiYuv->getCbAddr(absPartIdxC);
> -
>                      curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
> cu->getSlice()->getSliceQpDeltaCb();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> trSizeC, scalingListType, false, lastPosU);
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset, trSizeC,
> scalingListType, false, lastPosU);
>                  }
>                  else
>                  {
> -                    int16_t *ptr = resiYuv->getCbAddr(absPartIdxC);
> -                    primitives.blockfill_s[sizeIdxC](ptr,
> resiYuv->m_cwidth, 0);
> +                    primitives.blockfill_s[sizeIdxC](curResiU,
> strideResiC, 0);
>                  }
>                  if (absSumV)
>                  {
> -                    int16_t *curResiV = resiYuv->getCrAddr(absPartIdxC);
>                      curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
> cu->getSlice()->getSliceQpDeltaCr();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> trSizeC, scalingListType, false, lastPosV);
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset, trSizeC,
> scalingListType, false, lastPosV);
>                  }
>                  else
>                  {
> -                    int16_t *ptr = resiYuv->getCrAddr(absPartIdxC);
> -                    primitives.blockfill_s[sizeIdxC](ptr,
> resiYuv->m_cwidth, 0);
> +                    primitives.blockfill_s[sizeIdxC](curResiV,
> strideResiC, 0);
>                  }
>                  cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V,
> absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3027,6 +3040,8 @@
>      if (bCheckFull)
>      {
>          uint32_t trSizeC = 1 << trSizeCLog2;
> +        int sizeIdx  = trSizeLog2 - 2;
> +        int sizeIdxC = trSizeCLog2 - 2;
>          const uint32_t qtlayer =
> cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>          uint32_t coeffOffsetY = absPartIdx <<
> cu->getPic()->getLog2UnitSize() * 2;
>          uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift +
> m_vChromaShift);
> @@ -3070,7 +3085,7 @@
>              do
>              {
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> -                uint32_t subTUBufferOffset    = trSizeC * trSizeC *
> tuIterator.m_section;
> +                uint32_t subTUBufferOffset = trSizeC * trSizeC *
> tuIterator.m_section;
>
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_U,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setTransformSkipPartRange(0, TEXT_CHROMA_V,
> absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3112,7 +3127,8 @@
>              while (isNextSection(&tuIterator));
>          }
>
> -        const uint32_t numSamplesLuma = 1 << (trSizeLog2 << 1);
> +        const uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> +        const uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);
>
>          for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
>          {
> @@ -3123,6 +3139,10 @@
>
>          int partSize = partitionFromSize(trSize);
>          uint32_t distY =
> primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx),
> resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);
> +        int16_t *curResiY =
> m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> +        X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE,
> "width not full CU\n");
> +        const uint32_t strideResiY = MAX_CU_SIZE;
> +        const uint32_t strideResiC = m_qtTempShortYuv[qtlayer].m_cwidth;
>
>          if (outZeroDist)
>          {
> @@ -3130,16 +3150,13 @@
>          }
>          if (absSum[TEXT_LUMA][0])
>          {
> -            int16_t *curResiY =
> m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> -
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA,
> QP_BD_OFFSET, 0, chFmt);
>
>              int scalingListType = 3 + TEXT_LUMA;
>              X265_CHECK(scalingListType < 6, "scalingListType too large
> %d\n", scalingListType);
> -            X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE,
> "width not full CU\n");
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> curResiY, MAX_CU_SIZE,  coeffCurY, trSize, scalingListType, false,
> lastPos[TEXT_LUMA][0]); //this is for inter mode only
> -
> -            const uint32_t nonZeroDistY =
> primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx),
> resiYuv->m_width, m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx),
> MAX_CU_SIZE);
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false,
> lastPos[TEXT_LUMA][0]); //this is for inter mode only
> +
> +            const uint32_t nonZeroDistY =
> primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx),
> resiYuv->m_width, curResiY, strideResiY);
>              if (cu->isLosslessCoded(0))
>              {
>                  distY = nonZeroDistY;
> @@ -3154,7 +3171,9 @@
>                  if (nullCostY < singleCostY)
>                  {
>                      absSum[TEXT_LUMA][0] = 0;
> -                    ::memset(coeffCurY, 0, sizeof(coeff_t) *
> numSamplesLuma);
> +#if CHECKED_BUILD || _DEBUG
> +                    ::memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
> +#endif
>                      if (checkTransformSkipY)
>                      {
>                          minCost[TEXT_LUMA][0] = nullCostY;
> @@ -3182,10 +3201,7 @@
>
>          if (!absSum[TEXT_LUMA][0])
>          {
> -            int16_t *ptr =
>  m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> -            X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE,
> "width not full CU\n");
> -            int sizeIdx = trSizeLog2 - 2;
> -            primitives.blockfill_s[sizeIdx](ptr, MAX_CU_SIZE, 0);
> +            primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
>          }
>          cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA,
> absPartIdx, depth);
>
> @@ -3197,13 +3213,15 @@
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT :
> DONT_SPLIT, absPartIdxStep, absPartIdx);
>
>              int partSizeC = partitionFromSize(trSizeC);
> -            const uint32_t numSamplesChroma = trSizeC * trSizeC;
>
>              do
>              {
>                  uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
>                  uint32_t subTUBufferOffset = trSizeC * trSizeC *
> tuIterator.m_section;
>
> +                int16_t *curResiU =
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> +                int16_t *curResiV =
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> +
>                  distU =
> m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC),
> resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
>
>                  if (outZeroDist)
> @@ -3212,18 +3230,15 @@
>                  }
>                  if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
>                  {
> -                    int16_t *pcResiCurrU =
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> -
>                      int curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCbQpOffset() +
> cu->getSlice()->getSliceQpDeltaCb();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, pcResiCurrU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU +
> subTUBufferOffset,
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,
>                                                 trSizeC, scalingListType,
> false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
>                      uint32_t dist =
> primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC),
> resiYuv->m_cwidth,
> -
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
> -
> m_qtTempShortYuv[qtlayer].m_cwidth);
> +
> curResiU, strideResiC);
>                      const uint32_t nonZeroDistU =
> m_rdCost->scaleChromaDistCb(dist);
>
>                      if (cu->isLosslessCoded(0))
> @@ -3240,7 +3255,9 @@
>                          if (nullCostU < singleCostU)
>                          {
>                              absSum[TEXT_CHROMA_U][tuIterator.m_section] =
> 0;
> -                            ::memset(coeffCurU + subTUBufferOffset, 0,
> sizeof(coeff_t) * numSamplesChroma);
> +#if CHECKED_BUILD || _DEBUG
> +                            ::memset(coeffCurU + subTUBufferOffset, 0,
> sizeof(coeff_t) * numCoeffC);
> +#endif
>                              if (checkTransformSkipUV)
>                              {
>
>  minCost[TEXT_CHROMA_U][tuIterator.m_section] = nullCostU;
> @@ -3268,10 +3285,7 @@
>
>                  if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])
>                  {
> -                    int16_t *ptr =
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> -                    const uint32_t stride =
> m_qtTempShortYuv[qtlayer].m_cwidth;
> -                    int sizeIdxC = trSizeCLog2 - 2;
> -                    primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
> +                    primitives.blockfill_s[sizeIdxC](curResiU,
> strideResiC, 0);
>                  }
>
>                  distV =
> m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC),
> resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
> @@ -3281,17 +3295,15 @@
>                  }
>                  if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
>                  {
> -                    int16_t *curResiV =
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
>                      int curChromaQpOffset =
> cu->getSlice()->getPPS()->getChromaCrQpOffset() +
> cu->getSlice()->getSliceQpDeltaCr();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA,
> cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
>
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV +
> subTUBufferOffset,
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,
>                                                 trSizeC, scalingListType,
> false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
>                      uint32_t dist =
> primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC),
> resiYuv->m_cwidth,
> -
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
> -
> m_qtTempShortYuv[qtlayer].m_cwidth);
> +
> curResiV, strideResiC);
>                      const uint32_t nonZeroDistV =
> m_rdCost->scaleChromaDistCr(dist);
>
>                      if (cu->isLosslessCoded(0))
> @@ -3308,7 +3320,9 @@
>                          if (nullCostV < singleCostV)
>                          {
>                              absSum[TEXT_CHROMA_V][tuIterator.m_section] =
> 0;
> -                            ::memset(coeffCurV + subTUBufferOffset, 0,
> sizeof(coeff_t) * numSamplesChroma);
> +#if CHECKED_BUILD || _DEBUG
> +                            ::memset(coeffCurV + subTUBufferOffset, 0,
> sizeof(coeff_t) * numCoeffC);
> +#endif
>                              if (checkTransformSkipUV)
>                              {
>
>  minCost[TEXT_CHROMA_V][tuIterator.m_section] = nullCostV;
> @@ -3336,10 +3350,7 @@
>
>                  if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])
>                  {
> -                    int16_t *ptr =
>  m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> -                    const uint32_t stride =
> m_qtTempShortYuv[qtlayer].m_cwidth;
> -                    int sizeIdxC = trSizeCLog2 - 2;
> -                    primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
> +                    primitives.blockfill_s[sizeIdxC](curResiV,
> strideResiC, 0);
>                  }
>
>
>  cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf :
> 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3354,17 +3365,11 @@
>              uint32_t nonZeroDistY = 0, absSumTransformSkipY;
>              uint64_t singleCostY = MAX_INT64;
>
> -            int16_t *curResiY =
> m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> -            X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE,
> "width not full CU\n");
> -
> -            coeff_t bestCoeffY[32 * 32];
> -            memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) *
> numSamplesLuma);
> -
> -            int16_t bestResiY[32 * 32];
> -            for (int i = 0; i < trSize; ++i)
> -            {
> -                memcpy(bestResiY + i * trSize, curResiY + i *
> MAX_CU_SIZE, sizeof(int16_t) * trSize);
> -            }
> +            coeff_t bestCoeffY[MAX_TS_SIZE * MAX_TS_SIZE];
> +            memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numCoeffY);
> +
> +            int16_t bestResiY[MAX_TS_SIZE * MAX_TS_SIZE];
> +            primitives.square_copy_ss[sizeIdx](bestResiY, trSize,
> curResiY, strideResiY);
>
>
>  m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
>
> @@ -3393,13 +3398,11 @@
>
>                  int scalingListType = 3 + TEXT_LUMA;
>                  X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -                X265_CHECK(m_qtTempShortYuv[qtlayer].m_width ==
> MAX_CU_SIZE, "width not full CU\n");
> -
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> curResiY, MAX_CU_SIZE,  coeffCurY, trSize, scalingListType, true,
> lastPosTransformSkip[TEXT_LUMA][0]);
> +
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT,
> curResiY, strideResiY,  coeffCurY, trSize, scalingListType, true,
> lastPosTransformSkip[TEXT_LUMA][0]);
>
>                  nonZeroDistY =
> primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx),
> resiYuv->m_width,
> -
> m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx),
> -                                                           MAX_CU_SIZE);
> +                                                           curResiY,
> strideResiY);
>
>                  singleCostY = m_rdCost->calcRdCost(nonZeroDistY,
> skipSingleBitsY);
>              }
> @@ -3407,11 +3410,8 @@
>              if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] <
> singleCostY)
>              {
>                  cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx,
> depth);
> -                memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) *
> numSamplesLuma);
> -                for (int i = 0; i < trSize; ++i)
> -                {
> -                    memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i *
> trSize], sizeof(int16_t) * trSize);
> -                }
> +                memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) *
> numCoeffY);
> +                primitives.square_copy_ss[sizeIdx](curResiY, strideResiY,
> bestResiY, trSize);
>              }
>              else
>              {
> @@ -3435,7 +3435,6 @@
>              initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT :
> DONT_SPLIT, absPartIdxStep, absPartIdx);
>
>              int partSizeC = partitionFromSize(trSizeC);
> -            const uint32_t numSamplesChroma = trSizeC * trSizeC;
>
>              do
>              {
> @@ -3444,18 +3443,14 @@
>
>                  int16_t *curResiU =
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
>                  int16_t *curResiV =
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> -                uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
> -
> -                coeff_t bestCoeffU[32 * 32], bestCoeffV[32 * 32];
> -                memcpy(bestCoeffU, coeffCurU + subTUBufferOffset,
> sizeof(coeff_t) * numSamplesChroma);
> -                memcpy(bestCoeffV, coeffCurV + subTUBufferOffset,
> sizeof(coeff_t) * numSamplesChroma);
> -
> -                int16_t bestResiU[32 * 32], bestResiV[32 * 32];
> -                for (int i = 0; i < trSizeC; ++i)
> -                {
> -                    memcpy(&bestResiU[i * trSizeC], curResiU + i *
> stride, sizeof(int16_t) * trSizeC);
> -                    memcpy(&bestResiV[i * trSizeC], curResiV + i *
> stride, sizeof(int16_t) * trSizeC);
> -                }
> +
> +                coeff_t bestCoeffU[MAX_TS_SIZE * MAX_TS_SIZE],
> bestCoeffV[MAX_TS_SIZE * MAX_TS_SIZE];
> +                memcpy(bestCoeffU, coeffCurU + subTUBufferOffset,
> sizeof(coeff_t) * numCoeffC);
> +                memcpy(bestCoeffV, coeffCurV + subTUBufferOffset,
> sizeof(coeff_t) * numCoeffC);
> +
> +                int16_t bestResiU[MAX_TS_SIZE * MAX_TS_SIZE],
> bestResiV[MAX_TS_SIZE * MAX_TS_SIZE];
> +                primitives.square_copy_ss[sizeIdxC](bestResiU, trSizeC,
> curResiU, strideResiC);
> +                primitives.square_copy_ss[sizeIdxC](bestResiV, trSizeC,
> curResiV, strideResiC);
>
>                  cu->setTransformSkipPartRange(1, TEXT_CHROMA_U,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>                  cu->setTransformSkipPartRange(1, TEXT_CHROMA_V,
> absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3493,11 +3488,10 @@
>
>                      int scalingListType = 3 + TEXT_CHROMA_U;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU +
> subTUBufferOffset,
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,
>                                                 trSizeC, scalingListType,
> true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
>                      uint32_t dist =
> primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC),
> resiYuv->m_cwidth,
> -
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
> -
> m_qtTempShortYuv[qtlayer].m_cwidth);
> +
> curResiU, strideResiC);
>                      nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
>                      singleCostU = m_rdCost->calcRdCost(nonZeroDistU,
> singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
>                  }
> @@ -3506,11 +3500,8 @@
>                  {
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_U,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>
> -                    memcpy(coeffCurU + subTUBufferOffset, bestCoeffU,
> sizeof(coeff_t) * numSamplesChroma);
> -                    for (int i = 0; i < trSizeC; ++i)
> -                    {
> -                        memcpy(curResiU + i * stride, &bestResiU[i *
> trSizeC], sizeof(int16_t) * trSizeC);
> -                    }
> +                    memcpy(coeffCurU + subTUBufferOffset, bestCoeffU,
> sizeof(coeff_t) * numCoeffC);
> +                    primitives.square_copy_ss[sizeIdxC](curResiU,
> strideResiC, bestResiU, trSizeC);
>                  }
>                  else
>                  {
> @@ -3530,11 +3521,10 @@
>
>                      int scalingListType = 3 + TEXT_CHROMA_V;
>                      X265_CHECK(scalingListType < 6, "scalingListType too
> large %d\n", scalingListType);
> -
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV +
> subTUBufferOffset,
> +
>  m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC),
> REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,
>                                                 trSizeC, scalingListType,
> true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
>                      uint32_t dist =
> primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC),
> resiYuv->m_cwidth,
> -
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
> -
> m_qtTempShortYuv[qtlayer].m_cwidth);
> +
> curResiV, strideResiC);
>                      nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
>                      singleCostV = m_rdCost->calcRdCost(nonZeroDistV,
> singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
>                  }
> @@ -3543,11 +3533,8 @@
>                  {
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_V,
> absPartIdxC, tuIterator.m_absPartIdxStep);
>
> -                    memcpy(coeffCurV + subTUBufferOffset, bestCoeffV,
> sizeof(coeff_t) * numSamplesChroma);
> -                    for (int i = 0; i < trSizeC; ++i)
> -                    {
> -                        memcpy(curResiV + i * stride, &bestResiV[i *
> trSizeC], sizeof(int16_t) * trSizeC);
> -                    }
> +                    memcpy(coeffCurV + subTUBufferOffset, bestCoeffV,
> sizeof(coeff_t) * numCoeffC);
> +                    primitives.square_copy_ss[sizeIdxC](curResiV,
> strideResiC, bestResiV, trSizeC);
>                  }
>                  else
>                  {
> @@ -3560,6 +3547,7 @@
>
>  cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf :
> 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>              }
>              while (isNextSection(&tuIterator));
> +
>          }
>
>          m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
> @@ -3929,7 +3917,7 @@
>
>              if (bCodeChroma)
>              {
> -                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv,
> absPartIdx, 1 << trSizeLog2, (bChromaSame && (chFmt != CHROMA_422)));
> +                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv,
> absPartIdx, trSize, (bChromaSame && (chFmt != CHROMA_422)));
>              }
>          }
>          else
> diff -r a5998df9b12e -r 73f86312c2e0 source/common/primitives.cpp
> --- a/source/common/primitives.cpp      Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/common/primitives.cpp      Mon Jun 02 11:44:59 2014 +0900
> @@ -55,6 +55,11 @@
>      LUMA_4x4,  LUMA_8x8,  255,        LUMA_16x16, 255, 255,        255,
> LUMA_32x32, 255, 255, 255, 255,        255, 255, 255, LUMA_64x64
>  };
>
> +extern const uint8_t lumaPartitionsFromSquareBlocksTable[] =
> +{
> +    LUMA_4x4, LUMA_8x8, LUMA_16x16, LUMA_32x32, LUMA_64x64
> +};
> +
>  /* the "authoritative" set of encoder primitives */
>  EncoderPrimitives primitives;
>
> @@ -72,6 +77,31 @@
>      Setup_C_IPredPrimitives(p);      // intrapred.cpp
>      Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
>  }
> +
> +static void Setup_Alias_Primitives(EncoderPrimitives &p)
> +{
> +    /* copy reusable luma primitives to chroma 4:4:4 */
> +    for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
> +    {
> +        p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
> +        p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
> +        p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
> +        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
> +        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];
> +        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
> +        p.chroma[X265_CSP_I444].addAvg[i]  = p.luma_addAvg[i];
> +    }
> +
> +    for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
> +    {
> +        int partL = lumaPartitionsFromSquareBlocksTable[i];
> +        p.sad_square[i]     = p.sad[partL];
> +        p.square_copy_pp[i] = p.luma_copy_pp[partL];
> +        p.square_copy_ps[i] = p.luma_copy_ps[partL];
> +        p.square_copy_sp[i] = p.luma_copy_sp[partL];
> +        p.square_copy_ss[i] = p.luma_copy_ss[partL];
> +    }
> +}
>  }
>  using namespace x265;
>
> @@ -95,6 +125,8 @@
>          x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
>  #endif
>
> +        Setup_Alias_Primitives(primitives);
> +
>          initROM();
>      }
>
> diff -r a5998df9b12e -r 73f86312c2e0 source/common/primitives.h
> --- a/source/common/primitives.h        Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/common/primitives.h        Mon Jun 02 11:44:59 2014 +0900
> @@ -213,6 +213,10 @@
>      copy_ss_t       luma_copy_ss[NUM_LUMA_PARTITIONS];
>      pixel_sub_ps_t  luma_sub_ps[NUM_LUMA_PARTITIONS];
>      pixel_add_ps_t  luma_add_ps[NUM_LUMA_PARTITIONS];
> +    copy_pp_t       square_copy_pp[NUM_SQUARE_BLOCKS];
> +    copy_sp_t       square_copy_sp[NUM_SQUARE_BLOCKS];
> +    copy_ps_t       square_copy_ps[NUM_SQUARE_BLOCKS];
> +    copy_ss_t       square_copy_ss[NUM_SQUARE_BLOCKS];
>
>      filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];
>      filter_hps_t    luma_hps[NUM_LUMA_PARTITIONS];
> diff -r a5998df9b12e -r 73f86312c2e0 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Mon Jun 02 07:36:20 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp      Mon Jun 02 11:44:59 2014 +0900
> @@ -1316,30 +1316,12 @@
>      }
>  #endif // if HIGH_BIT_DEPTH
>
> -    /* copy reusable luma primitives to chroma 4:4:4 */
> -    for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
> -    {
> -        p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
> -        p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
> -        p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
> -        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
> -        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];
> -        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
> -        p.chroma[X265_CSP_I444].addAvg[i]  = p.luma_addAvg[i];
> -    }
> -
>      primitives.sa8d[BLOCK_4x4]   = primitives.sa8d_inter[LUMA_4x4];
>      primitives.sa8d[BLOCK_8x8]   = primitives.sa8d_inter[LUMA_8x8];
>      primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
>      primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];
>      primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];
>
> -    primitives.sad_square[BLOCK_4x4]   = primitives.sad[LUMA_4x4];
> -    primitives.sad_square[BLOCK_8x8]   = primitives.sad[LUMA_8x8];
> -    primitives.sad_square[BLOCK_16x16] = primitives.sad[LUMA_16x16];
> -    primitives.sad_square[BLOCK_32x32] = primitives.sad[LUMA_32x32];
> -    primitives.sad_square[BLOCK_64x64] = primitives.sad[LUMA_64x64];
> -
>      // SA8D devolves to SATD for blocks not even multiples of 8x8
>      primitives.sa8d_inter[LUMA_4x4]   = primitives.satd[LUMA_4x4];
>      primitives.sa8d_inter[LUMA_4x8]   = primitives.satd[LUMA_4x8];
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140602/53cdddcc/attachment-0001.html>


More information about the x265-devel mailing list