[x265] refine block size related
Steve Borho
steve at borho.org
Fri May 23 16:12:52 CEST 2014
On Thu, May 22, 2014 at 11:37 PM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1400819691 -32400
> # Fri May 23 13:34:51 2014 +0900
> # Node ID 085be1ffd4a9752f64f8422e404985527e890921
> # Parent 5134e76aa729b6fece18701fdc00390c2f2ffb32
> refine block size related
Nice!
I've staged this patch for regression testing.
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComBitStream.cpp
> --- a/source/Lib/TLibCommon/TComBitStream.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComBitStream.cpp Fri May 23 13:34:51 2014 +0900
> @@ -88,7 +88,7 @@
> /* any modulo 8 remainder of num_total_bits cannot be written this time,
> * and will be held until next time. */
> uint32_t num_total_bits = numBits + m_num_held_bits;
> - uint32_t next_num_held_bits = num_total_bits % 8;
> + uint32_t next_num_held_bits = num_total_bits & 7;
>
> /* form a byte aligned word (write_bits), by concatenating any held bits
> * with the new bits, discarding the bits that will form the next_held_bits.
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComDataCU.cpp
> --- a/source/Lib/TLibCommon/TComDataCU.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComDataCU.cpp Fri May 23 13:34:51 2014 +0900
> @@ -1337,7 +1337,7 @@
> {
> uint32_t curPartNum = m_pic->getNumPartInCU() >> (depth << 1);
>
> - return ((m_absIdxInLCU + absPartIdx) % curPartNum) == 0;
> + return ((m_absIdxInLCU + absPartIdx) & (curPartNum - 1)) == 0;
> }
>
> void TComDataCU::setPartSizeSubParts(PartSize mode, uint32_t absPartIdx, uint32_t depth)
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp Fri May 23 13:34:51 2014 +0900
> @@ -117,15 +117,15 @@
> {
> bool bFilter;
>
> - if (dirMode == DC_IDX)
> + if (dirMode == DC_IDX || tuSize <= 4)
> {
> - bFilter = false; // no smoothing for DC or LM chroma
> + bFilter = false; // no smoothing for DC
> }
> else
> {
> int diff = std::min<int>(abs((int)dirMode - HOR_IDX), abs((int)dirMode - VER_IDX));
> - uint32_t sizeIndex = g_convertToBit[tuSize];
> - bFilter = diff > intraFilterThreshold[sizeIndex];
> + uint32_t sizeIdx = g_convertToBit[tuSize];
> + bFilter = diff > intraFilterThreshold[sizeIdx];
> }
>
> return bFilter;
> @@ -134,7 +134,7 @@
> void TComPrediction::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, int tuSize)
> {
> X265_CHECK(tuSize >= 4 && tuSize <= 64, "intra block size is out of range\n");
> - int log2BlkSize = g_convertToBit[tuSize];
> + int sizeIdx = g_convertToBit[tuSize];
> bool bUseFilteredPredictions = TComPrediction::filteringIntraReferenceSamples(dirMode, tuSize);
>
> pixel *refLft, *refAbv;
> @@ -148,13 +148,13 @@
> }
>
> bool bFilter = tuSize <= 16 && dirMode != PLANAR_IDX;
> - primitives.intra_pred[log2BlkSize][dirMode](dst, stride, refLft, refAbv, dirMode, bFilter);
> + primitives.intra_pred[sizeIdx][dirMode](dst, stride, refLft, refAbv, dirMode, bFilter);
> }
>
> // Angular chroma
> void TComPrediction::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, int tuSize, int chFmt)
> {
> - int log2BlkSize = g_convertToBit[tuSize];
> + int sizeIdx = g_convertToBit[tuSize];
> uint32_t tuSize2 = tuSize << 1;
>
> // Create the prediction
> @@ -222,7 +222,7 @@
> }
> }
>
> - primitives.intra_pred[log2BlkSize][dirMode](dst, stride, refLft + tuSize - 1, refAbv + tuSize - 1, dirMode, 0);
> + primitives.intra_pred[sizeIdx][dirMode](dst, stride, refLft + tuSize - 1, refAbv + tuSize - 1, dirMode, 0);
> }
>
> /** Function for checking identical motion.
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComSlice.h
> --- a/source/Lib/TLibCommon/TComSlice.h Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComSlice.h Fri May 23 13:34:51 2014 +0900
> @@ -906,6 +906,8 @@
>
> void setLog2DiffMaxMinCodingBlockSize(int val) { m_log2DiffMaxMinCodingBlockSize = val; }
>
> + int getLog2MaxCodingBlockSize() const { return m_log2MinCodingBlockSize + m_log2DiffMaxMinCodingBlockSize; }
> +
> void setMaxCUSize(uint32_t u) { m_maxCUSize = u; }
>
> uint32_t getMaxCUSize() const { return m_maxCUSize; }
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibCommon/TComYuv.cpp
> --- a/source/Lib/TLibCommon/TComYuv.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibCommon/TComYuv.cpp Fri May 23 13:34:51 2014 +0900
> @@ -186,7 +186,7 @@
>
> void TComYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize)
> {
> - int part = partitionFromSizes(lumaSize, lumaSize);
> + int part = partitionFromSize(lumaSize);
>
> int16_t* dst = dstPicYuv->getLumaAddr(partIdx);
> uint32_t dststride = dstPicYuv->m_width;
> @@ -196,7 +196,7 @@
>
> void TComYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs)
> {
> - int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSizes(lumaSize, lumaSize);
> + int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSize(lumaSize);
>
> if (chromaId == 1)
> {
> @@ -235,7 +235,7 @@
>
> void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partSize)
> {
> - int part = partitionFromSizes(partSize, partSize);
> + int part = partitionFromSize(partSize);
>
> addClipLuma(srcYuv0, srcYuv1, part);
> addClipChroma(srcYuv0, srcYuv1, part);
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncCu.cpp
> --- a/source/Lib/TLibEncoder/TEncCu.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncCu.cpp Fri May 23 13:34:51 2014 +0900
> @@ -571,13 +571,14 @@
> m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
> }
>
> + uint32_t cuSize = outTempCU->getCUSize(0);
> TComSlice* slice = outTempCU->getSlice();
> if (!bInsidePicture)
> {
> uint32_t lpelx = outBestCU->getCUPelX();
> uint32_t tpely = outBestCU->getCUPelY();
> - uint32_t rpelx = lpelx + outBestCU->getCUSize(0);
> - uint32_t bpely = tpely + outBestCU->getCUSize(0);
> + uint32_t rpelx = lpelx + cuSize;
> + uint32_t bpely = tpely + cuSize;
> bInsidePicture = (rpelx <= slice->getSPS()->getPicWidthInLumaSamples() &&
> bpely <= slice->getSPS()->getPicHeightInLumaSamples());
> }
> @@ -592,7 +593,7 @@
>
> if (depth == g_maxCUDepth - g_addCUDepth)
> {
> - if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> + if (cuSize > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> {
> xCheckRDCostIntra(outBestCU, outTempCU, SIZE_NxN);
> }
> @@ -715,13 +716,14 @@
> bool doNotBlockPu = true;
> bool earlyDetectionSkipMode = false;
>
> + uint32_t cuSize = outTempCU->getCUSize(0);
> TComSlice* slice = outTempCU->getSlice();
> if (!bInsidePicture)
> {
> uint32_t lpelx = outBestCU->getCUPelX();
> uint32_t tpely = outBestCU->getCUPelY();
> - uint32_t rpelx = lpelx + outBestCU->getCUSize(0);
> - uint32_t bpely = tpely + outBestCU->getCUSize(0);
> + uint32_t rpelx = lpelx + cuSize;
> + uint32_t bpely = tpely + cuSize;
> bInsidePicture = (rpelx <= slice->getSPS()->getPicWidthInLumaSamples() &&
> bpely <= slice->getSPS()->getPicHeightInLumaSamples());
> }
> @@ -765,7 +767,7 @@
> if (slice->getSliceType() != I_SLICE)
> {
> // 2Nx2N, NxN
> - if (!(outBestCU->getCUSize(0) == 8))
> + if (!(cuSize == 8))
> {
> if (depth == g_maxCUDepth - g_addCUDepth && doNotBlockPu)
> {
> @@ -899,7 +901,7 @@
>
> if (depth == g_maxCUDepth - g_addCUDepth)
> {
> - if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> + if (cuSize > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> {
> xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_NxN);
> outTempCU->initEstData(depth);
> @@ -908,10 +910,10 @@
> }
> // test PCM
> if (slice->getSPS()->getUsePCM()
> - && outTempCU->getCUSize(0) <= (1 << slice->getSPS()->getPCMLog2MaxSize())
> - && outTempCU->getCUSize(0) >= (1 << slice->getSPS()->getPCMLog2MinSize()))
> + && cuSize <= (1 << slice->getSPS()->getPCMLog2MaxSize())
> + && cuSize >= (1 << slice->getSPS()->getPCMLog2MinSize()))
> {
> - uint32_t rawbits = (2 * X265_DEPTH + X265_DEPTH) * outBestCU->getCUSize(0) * outBestCU->getCUSize(0) / 2;
> + uint32_t rawbits = (2 * X265_DEPTH + X265_DEPTH) * cuSize * cuSize / 2;
> uint32_t bestbits = outBestCU->m_totalBits;
> if ((bestbits > rawbits) || (outBestCU->m_totalCost > m_rdCost->calcRdCost(0, rawbits)))
> {
> @@ -1045,6 +1047,7 @@
> uint32_t posy = (externalAddress / pic->getFrameWidthInCU()) * g_maxCUSize + g_rasterToPelY[g_zscanToRaster[internalAddress]];
> uint32_t width = slice->getSPS()->getPicWidthInLumaSamples();
> uint32_t height = slice->getSPS()->getPicHeightInLumaSamples();
> + uint32_t cuSize = cu->getCUSize(absPartIdx);
>
> while (posx >= width || posy >= height)
> {
> @@ -1070,8 +1073,8 @@
> uint32_t uiGranularityWidth = g_maxCUSize;
> posx = cu->getCUPelX() + g_rasterToPelX[g_zscanToRaster[absPartIdx]];
> posy = cu->getCUPelY() + g_rasterToPelY[g_zscanToRaster[absPartIdx]];
> - bool granularityBoundary = ((posx + cu->getCUSize(absPartIdx)) % uiGranularityWidth == 0 || (posx + cu->getCUSize(absPartIdx) == width))
> - && ((posy + cu->getCUSize(absPartIdx)) % uiGranularityWidth == 0 || (posy + cu->getCUSize(absPartIdx) == height));
> + bool granularityBoundary = ((posx + cuSize) % uiGranularityWidth == 0 || (posx + cuSize == width))
> + && ((posy + cuSize) % uiGranularityWidth == 0 || (posy + cuSize == height));
>
> if (granularityBoundary)
> {
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncEntropy.cpp
> --- a/source/Lib/TLibEncoder/TEncEntropy.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncEntropy.cpp Fri May 23 13:34:51 2014 +0900
> @@ -212,7 +212,7 @@
> void TEncEntropy::xEncodeTransform(TComDataCU* cu, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t tuSize, uint32_t trIdx, bool& bCodeDQP)
> {
> const uint32_t subdiv = cu->getTransformIdx(absPartIdx) + cu->getDepth(absPartIdx) > depth;
> - const uint32_t log2TrafoSize = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize()] + 2 - depth;
> + const uint32_t log2TrafoSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> uint32_t hChromaShift = cu->getHorzChromaShift();
> uint32_t vChromaShift = cu->getVertChromaShift();
> uint32_t cbfY = cu->getCbf(absPartIdx, TEXT_LUMA, trIdx);
> @@ -227,12 +227,12 @@
> if ((log2TrafoSize == 2) && !(cu->getChromaFormat() == CHROMA_444))
> {
> uint32_t partNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> - if ((absPartIdx % partNum) == 0)
> + if ((absPartIdx & (partNum - 1)) == 0)
> {
> m_bakAbsPartIdx = absPartIdx;
> m_bakChromaOffset = offsetChroma;
> }
> - else if ((absPartIdx % partNum) == (partNum - 1))
> + else if ((absPartIdx & (partNum - 1)) == (partNum - 1))
> {
> cbfU = cu->getCbf(m_bakAbsPartIdx, TEXT_CHROMA_U, trIdx);
> cbfV = cu->getCbf(m_bakAbsPartIdx, TEXT_CHROMA_V, trIdx);
> @@ -369,9 +369,9 @@
> if ((log2TrafoSize == 2) && !(chFmt == CHROMA_444))
> {
> uint32_t partNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> - if ((absPartIdx % partNum) == (partNum - 1))
> + if ((absPartIdx & (partNum - 1)) == (partNum - 1))
> {
> - uint32_t trWidthC = log2TrafoSize << 1;
> + uint32_t trSizeC = 1 << log2TrafoSize;
> const bool splitIntoSubTUs = (chFmt == CHROMA_422);
>
> uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> @@ -384,10 +384,10 @@
> do
> {
> uint32_t cbf = cu->getCbf(tuIterator.m_absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
> - uint32_t subTUIndex = tuIterator.m_section * trWidthC * trWidthC;
> + uint32_t subTUIndex = tuIterator.m_section * trSizeC * trSizeC;
> if (cbf)
> {
> - m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + m_bakChromaOffset + subTUIndex), tuIterator.m_absPartIdxTURelCU, trWidthC, (TextType)chromaId);
> + m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + m_bakChromaOffset + subTUIndex), tuIterator.m_absPartIdxTURelCU, trSizeC, (TextType)chromaId);
> }
> }
> while (isNextTUSection(&tuIterator));
> @@ -396,10 +396,8 @@
> }
> else
> {
> - uint32_t trWidthC = tuSize >> hChromaShift;
> - uint32_t trHeightC = tuSize >> vChromaShift;
> + uint32_t trSizeC = tuSize >> hChromaShift;
> const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> - trHeightC = splitIntoSubTUs ? trHeightC >> 1 : trHeightC;
> uint32_t curPartNum = cu->getPic()->getNumPartInCU() >> (depth << 1);
> for (uint32_t chromaId = TEXT_CHROMA; chromaId < MAX_NUM_COMPONENT; chromaId++)
> {
> @@ -409,10 +407,10 @@
> do
> {
> uint32_t cbf = cu->getCbf(tuIterator.m_absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs);
> - uint32_t subTUIndex = tuIterator.m_section * trWidthC * trHeightC;
> + uint32_t subTUIndex = tuIterator.m_section * trSizeC * trSizeC;
> if (cbf)
> {
> - m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + offsetChroma + subTUIndex), tuIterator.m_absPartIdxTURelCU, trWidthC, (TextType)chromaId);
> + m_entropyCoderIf->codeCoeffNxN(cu, (coeffChroma + offsetChroma + subTUIndex), tuIterator.m_absPartIdxTURelCU, trSizeC, (TextType)chromaId);
> }
> }
> while (isNextTUSection(&tuIterator));
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncSbac.cpp
> --- a/source/Lib/TLibEncoder/TEncSbac.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSbac.cpp Fri May 23 13:34:51 2014 +0900
> @@ -1930,8 +1930,7 @@
> m_binIf->encodePCMAlignBits();
>
> uint32_t lumaOffset = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> - uint32_t chromaOffset = lumaOffset >> 2;
> - //uint32_t chromaOffset = lumaOffset >> (m_hChromaShift + m_vChromaShift);
> + uint32_t chromaOffset = lumaOffset >> (cu->getHorzChromaShift() + cu->getVertChromaShift());
> uint32_t width;
> uint32_t height;
> uint32_t sampleBits;
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri May 23 13:34:51 2014 +0900
> @@ -160,7 +160,7 @@
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> uint32_t trMode = cu->getTransformIdx(absPartIdx);
> uint32_t subdiv = (trMode > trDepth ? 1 : 0);
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize()] + 2 - fullDepth;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>
> if (cu->getPredictionMode(0) == MODE_INTRA && cu->getPartitionSize(0) == SIZE_NxN && trDepth == 0)
> {
> @@ -245,14 +245,14 @@
>
> uint32_t origTrDepth = trDepth;
>
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize()] + 2 - fullDepth;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> int chFmt = cu->getChromaFormat();
> if ((ttype != TEXT_LUMA) && (trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> {
> X265_CHECK(trDepth > 0, "transform size too small\n");
> trDepth--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> - bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> if (!bFirstQ)
> {
> return;
> @@ -290,7 +290,7 @@
> {
> uint32_t subTUSize = width * width;
> uint32_t partIdxesPerSubTU = cu->getPic()->getNumPartInCU() >> (((cu->getDepth(absPartIdx) + trDepth) << 1) + 1);
> -
> +
> if (cu->getCbf(absPartIdx, ttype, origTrDepth + 1))
> m_entropyCoder->encodeCoeffNxN(cu, coeff, absPartIdx, width, ttype);
> if (cu->getCbf(absPartIdx + partIdxesPerSubTU, ttype, origTrDepth + 1))
> @@ -346,7 +346,7 @@
> m_entropyCoder->encodeIntraDirModeLuma(cu, part * qtNumParts);
> }
> }
> - else if ((absPartIdx % qtNumParts) == 0)
> + else if ((absPartIdx & (qtNumParts - 1)) == 0)
> {
> m_entropyCoder->encodeIntraDirModeLuma(cu, absPartIdx);
> }
> @@ -366,7 +366,7 @@
> {
> uint32_t qtNumParts = cu->getTotalNumPart() >> 2;
> X265_CHECK(trDepth > 0, "unexpected trDepth %d\n", trDepth);
> - if ((absPartIdx % qtNumParts) == 0)
> + if ((absPartIdx & (qtNumParts - 1)) == 0)
> m_entropyCoder->encodeIntraDirModeChroma(cu, absPartIdx);
> }
> }
> @@ -418,12 +418,13 @@
> pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
> pixel* pred = predYuv->getLumaAddr(absPartIdx);
> int16_t* residual = resiYuv->getLumaAddr(absPartIdx);
> - int part = partitionFromSizes(tuSize, tuSize);
> -
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + int part = partitionFromSize(tuSize);
> + int sizeIdx = g_convertToBit[tuSize];
> +
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> - uint32_t numCoeffPerInc = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> - coeff_t* coeff = m_qtTempCoeffY[qtLayer] + numCoeffPerInc * absPartIdx;
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t* coeff = m_qtTempCoeffY[qtLayer] + coeffOffsetY;
>
> int16_t* reconQt = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
>
> @@ -435,10 +436,10 @@
> bool useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
>
> //===== get residual signal =====
> - X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> - X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> - X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> - primitives.calcresidual[(int)g_convertToBit[tuSize]](fenc, pred, residual, stride);
> + X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> + X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> + X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
> //===== transform and quantization =====
> //--- init rate estimation arrays for RDOQ ---
> @@ -462,7 +463,6 @@
> cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
> //--- inverse transform ---
> - int size = g_convertToBit[tuSize];
> if (absSum)
> {
> int scalingListType = 0 + TEXT_LUMA;
> @@ -473,12 +473,12 @@
> {
> int16_t* resiTmp = residual;
> memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> - primitives.blockfill_s[size](resiTmp, stride, 0);
> + primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> }
>
> X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
> //===== reconstruction =====
> - primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> + primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> //===== update distortion =====
> outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
> }
> @@ -494,7 +494,7 @@
> uint32_t chromaId)
> {
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> int chFmt = cu->getChromaFormat();
>
> uint32_t origTrDepth = trDepth;
> @@ -504,8 +504,8 @@
> X265_CHECK(trDepth > 0, "trDepth should be non-zero\n");
> trDepth--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> - bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> - bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx % qpdiv) == 2) : false;
> + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> + bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
> if ((!bFirstQ) && (!bSecondQ))
> {
> return;
> @@ -520,22 +520,22 @@
> int16_t* residual = (chromaId == 1) ? resiYuv->getCbAddr(absPartIdx) : resiYuv->getCrAddr(absPartIdx);
>
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> - uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
> - coeff_t* coeff = (chromaId == 1 ? m_qtTempCoeffCb[qtlayer] : m_qtTempCoeffCr[qtlayer]) + numCoeffPerInc * absPartIdx;
> + uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> + coeff_t* coeff = (chromaId == 1 ? m_qtTempCoeffCb[qtlayer] : m_qtTempCoeffCr[qtlayer]) + coeffOffsetC;
> int16_t* reconQt = (chromaId == 1) ? m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdx) : m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdx);
> uint32_t reconQtStride = m_qtTempShortYuv[qtlayer].m_cwidth;
> uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> pixel* reconIPred = (chromaId == 1) ? cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder) : cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
> uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
> bool useTransformSkipChroma = !!cu->getTransformSkip(absPartIdx, ttype);
> - int part = partitionFromSizes(tuSize, tuSize);
> + int part = partitionFromSize(tuSize);
> + int sizeIdx = g_convertToBit[tuSize];
>
> //===== get residual signal =====
> - X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> - X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> - X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> - int size = g_convertToBit[tuSize];
> - primitives.calcresidual[size](fenc, pred, residual, stride);
> + X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
> + X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
> + X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
> + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
> //===== transform and quantization =====
> {
> @@ -577,14 +577,14 @@
> {
> int16_t* resiTmp = residual;
> memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> - primitives.blockfill_s[size](resiTmp, stride, 0);
> + primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> }
> }
>
> X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual alignment check failure\n");
> X265_CHECK(tuSize <= 32, "tuSize invalud\n");
> //===== reconstruction =====
> - primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> + primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> //===== update distortion =====
> uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
> if (ttype == TEXT_CHROMA_U)
> @@ -612,7 +612,7 @@
> uint64_t& rdCost)
> {
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> bool bCheckFull = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>
> @@ -829,7 +829,7 @@
> TComYuv* reconYuv)
> {
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> bool bCheckFull = (trSizeLog2 <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
> bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
>
> @@ -854,9 +854,8 @@
> pixel* pred = predYuv->getLumaAddr(absPartIdx);
> int16_t* residual = resiYuv->getLumaAddr(absPartIdx);
> pixel* recon = reconYuv->getLumaAddr(absPartIdx);
> -
> - uint32_t numCoeffPerInc = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> - coeff_t* coeff = cu->getCoeffY() + numCoeffPerInc * absPartIdx;
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t* coeff = cu->getCoeffY() + coeffOffsetY;
>
> uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> pixel* reconIPred = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
> @@ -870,10 +869,11 @@
> predIntraLumaAng(lumaPredMode, pred, stride, tuSize);
>
> //===== get residual signal =====
> - X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> - X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment failure\n");
> - X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment failure\n");
> - primitives.calcresidual[(int)g_convertToBit[tuSize]](fenc, pred, residual, stride);
> + X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> + X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
> + X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
> + int sizeIdx = g_convertToBit[tuSize];
> + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
> //===== transform and quantization =====
> uint32_t absSum = 0;
> @@ -888,7 +888,6 @@
> cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
> //--- inverse transform ---
> - int size = g_convertToBit[tuSize];
> if (absSum)
> {
> int scalingListType = 0 + TEXT_LUMA;
> @@ -899,12 +898,12 @@
> {
> int16_t* resiTmp = residual;
> memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> - primitives.blockfill_s[size](resiTmp, stride, 0);
> + primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> }
>
> //Generate Recon
> X265_CHECK(tuSize <= 32, "tuSize is too large\n");
> - int part = partitionFromSizes(tuSize, tuSize);
> + int part = partitionFromSize(tuSize);
> primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
> primitives.blockcpy_pp(tuSize, tuSize, reconIPred, reconIPredStride, recon, stride);
> }
> @@ -939,14 +938,14 @@
>
> if (trMode == trDepth)
> {
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
> //===== copy transform coefficients =====
> - uint32_t numCoeffY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (fullDepth << 1);
> - uint32_t numCoeffIncY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> - coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + (numCoeffIncY * absPartIdx);
> - coeff_t* coeffDestY = cu->getCoeffY() + (numCoeffIncY * absPartIdx);
> + uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> + coeff_t* coeffDestY = cu->getCoeffY() + coeffOffsetY;
> ::memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
>
> //===== copy reconstruction =====
> @@ -964,15 +963,14 @@
>
> void TEncSearch::xStoreIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx)
> {
> - uint32_t fullMode = cu->getDepth(0) + trDepth;
> -
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullMode] + 2;
> + uint32_t fullDepth = cu->getDepth(0) + trDepth;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
> //===== copy transform coefficients =====
> - uint32_t numCoeffY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (fullMode << 1);
> - uint32_t numCoeffIncY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> - coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + (numCoeffIncY * absPartIdx);
> + uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> coeff_t* coeffDstY = m_qtTempTUCoeffY;
>
> ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
> @@ -984,14 +982,13 @@
> void TEncSearch::xLoadIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx)
> {
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> -
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
> //===== copy transform coefficients =====
> - uint32_t numCoeffY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (fullDepth << 1);
> - uint32_t numCoeffIncY = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> - coeff_t* coeffDstY = m_qtTempCoeffY[qtlayer] + (numCoeffIncY * absPartIdx);
> + uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t* coeffDstY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> coeff_t* coeffSrcY = m_qtTempTUCoeffY;
>
> ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
> @@ -1008,25 +1005,27 @@
> X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
> }
>
> -void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs)
> +void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
> {
> + assert(chromaId == 1 || chromaId == 2);
> +
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> uint32_t trMode = cu->getTransformIdx(absPartIdx);
>
> if (trMode == trDepth)
> {
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> int chFmt = cu->getChromaFormat();
>
> bool bChromaSame = false;
> if (trSizeLog2 == 2 && !(chFmt == CHROMA_444))
> {
> - X265_CHECK(trDepth > 0, "trDepth is invalid\n");
> + X265_CHECK(trDepth > 0, "invalid trDepth\n");
> trDepth--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> - bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> - bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx % qpdiv) == 2) : false;
> + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> + bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
> if ((!bFirstQ) && (!bSecondQ))
> {
> return;
> @@ -1037,35 +1036,37 @@
> uint32_t height = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> height = splitIntoSubTUs ? height >> 1 : height;
> uint32_t numCoeffC = width * height;
> -
> - uint32_t numCoeffIncC = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> ((cu->getSlice()->getSPS()->getMaxCUDepth() << 1) + (m_hChromaShift + m_vChromaShift));
> - if (stateU0V1Both2 == 1 || stateU0V1Both2 == 3)
> + uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +
> + if (chromaId == 1)
> {
> - coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC * absPartIdx);
> + coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> coeff_t* coeffDstU = m_qtTempTUCoeffCb;
> ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
> }
> - if (stateU0V1Both2 == 2 || stateU0V1Both2 == 3)
> + if (chromaId == 2)
> {
> - coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC * absPartIdx);
> + coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> coeff_t* coeffDstV = m_qtTempTUCoeffCr;
> ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
> }
>
> //===== copy reconstruction =====
> uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
> - m_qtTempShortYuv[qtlayer].copyPartToPartYuvChroma(&m_qtTempTransformSkipYuv, absPartIdx, lumaSize, stateU0V1Both2, splitIntoSubTUs);
> + m_qtTempShortYuv[qtlayer].copyPartToPartYuvChroma(&m_qtTempTransformSkipYuv, absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
> }
> }
>
> -void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs)
> +void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
> {
> + assert(chromaId == 1 || chromaId == 2);
> +
> uint32_t fullDepth = cu->getDepth(0) + trDepth;
> uint32_t trMode = cu->getTransformIdx(absPartIdx);
>
> if (trMode == trDepth)
> {
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> int chFmt = cu->getChromaFormat();
>
> @@ -1075,8 +1076,8 @@
> X265_CHECK(trDepth > 0, "invalid trDepth\n");
> trDepth--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> - bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> - bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx % qpdiv) == 2) : false;
> + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> + bool bSecondQ = (chFmt == CHROMA_422) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
> if ((!bFirstQ) && (!bSecondQ))
> {
> return;
> @@ -1085,45 +1086,44 @@
> }
>
> //===== copy transform coefficients =====
> - uint32_t trWidth = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> - uint32_t trHeight = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> - trHeight = splitIntoSubTUs ? trHeight >> 1 : trHeight;
> - uint32_t numCoeffC = trWidth * trHeight;
> -
> - uint32_t numCoeffIncC = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize()) >> ((cu->getSlice()->getSPS()->getMaxCUDepth() << 1) + (m_hChromaShift + m_vChromaShift));
> -
> - if (stateU0V1Both2 == 1 || stateU0V1Both2 == 3)
> + uint32_t trWidthC = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> + uint32_t trHeightC = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> + trHeightC = splitIntoSubTUs ? trHeightC >> 1 : trHeightC;
> + uint32_t numCoeffC = trWidthC * trHeightC;
> + uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +
> + if (chromaId == 1)
> {
> - coeff_t* coeffDstU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC * absPartIdx);
> + coeff_t* coeffDstU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> coeff_t* coeffSrcU = m_qtTempTUCoeffCb;
> ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
> }
> - if (stateU0V1Both2 == 2 || stateU0V1Both2 == 3)
> + if (chromaId == 2)
> {
> - coeff_t* coeffDstV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC * absPartIdx);
> + coeff_t* coeffDstV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> coeff_t* coeffSrcV = m_qtTempTUCoeffCr;
> ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
> }
>
> //===== copy reconstruction =====
> uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
> - m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, stateU0V1Both2, splitIntoSubTUs);
> + m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
>
> uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
> uint32_t reconQtStride = m_qtTempShortYuv[qtlayer].m_cwidth;
> uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
>
> - if (stateU0V1Both2 == 1 || stateU0V1Both2 == 3)
> + if (chromaId == 1)
> {
> pixel* reconIPred = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
> int16_t* reconQt = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdx);
> - primitives.blockcpy_ps(trWidth, trHeight, reconIPred, reconIPredStride, reconQt, reconQtStride);
> + primitives.blockcpy_ps(trWidthC, trHeightC, reconIPred, reconIPredStride, reconQt, reconQtStride);
> }
> - if (stateU0V1Both2 == 2 || stateU0V1Both2 == 3)
> + if (chromaId == 2)
> {
> pixel* reconIPred = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
> int16_t* reconQt = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdx);
> - primitives.blockcpy_ps(trWidth, trHeight, reconIPred, reconIPredStride, reconQt, reconQtStride);
> + primitives.blockcpy_ps(trWidthC, trHeightC, reconIPred, reconIPredStride, reconQt, reconQtStride);
> }
> }
> }
> @@ -1132,7 +1132,7 @@
> {
> uint32_t depth = cu->getDepth(0);
> uint32_t fullDepth = depth + trDepth;
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
>
> uint32_t actualTrDepth = trDepth;
>
> @@ -1179,7 +1179,7 @@
> if (trMode == trDepth)
> {
> int chFmt = cu->getChromaFormat();
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> uint32_t actualTrDepth = trDepth;
> if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -1188,7 +1188,7 @@
> actualTrDepth--;
> trSizeCLog2++;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> - bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> if (!bFirstQ)
> {
> return;
> @@ -1288,7 +1288,7 @@
>
> if (bestModeId == firstCheckId)
> {
> - xStoreIntraResultChromaQT(cu, trDepth, absPartIdxC, (TextType)chromaId, splitIntoSubTUs);
> + xStoreIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
> m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
> }
> }
> @@ -1300,7 +1300,7 @@
>
> if (bestModeId == firstCheckId)
> {
> - xLoadIntraResultChromaQT(cu, trDepth, absPartIdxC, (TextType)chromaId, splitIntoSubTUs);
> + xLoadIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
> cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
> @@ -1358,7 +1358,7 @@
> if (trMode == trDepth)
> {
> int chFmt = cu->getChromaFormat();
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
> bool bChromaSame = false;
> @@ -1367,7 +1367,7 @@
> X265_CHECK(trDepth > 0, "invalid trDepth\n");
> trDepth--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
> - if ((absPartIdx % qpdiv) != 0)
> + if ((absPartIdx & (qpdiv - 1)) != 0)
> {
> return;
> }
> @@ -1379,12 +1379,12 @@
> uint32_t width = cu->getCUSize(absPartIdx) >> (trDepth + m_hChromaShift);
> uint32_t height = cu->getCUSize(absPartIdx) >> (trDepth + m_vChromaShift);
> uint32_t numCoeffC = width * height;
> - uint32_t numCoeffIncC = ((cu->getSlice()->getSPS()->getMaxCUSize() >> m_hChromaShift) * (cu->getSlice()->getSPS()->getMaxCUSize() >> m_vChromaShift)) >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -
> - coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffIncC * absPartIdx);
> - coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffIncC * absPartIdx);
> - coeff_t* coeffDstU = cu->getCoeffCb() + (numCoeffIncC * absPartIdx);
> - coeff_t* coeffDstV = cu->getCoeffCr() + (numCoeffIncC * absPartIdx);
> + uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> +
> + coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> + coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> + coeff_t* coeffDstU = cu->getCoeffCb() + coeffOffsetC;
> + coeff_t* coeffDstV = cu->getCoeffCr() + coeffOffsetC;
> ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
> ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
>
> @@ -1415,7 +1415,7 @@
> if (trMode == trDepth)
> {
> int chFmt = cu->getChromaFormat();
> - uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> fullDepth] + 2;
> + uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
> uint32_t origTrDepth = trDepth;
> uint32_t actualTrDepth = trDepth;
> if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -1423,7 +1423,7 @@
> X265_CHECK(trDepth > 0, "invalid trDepth\n");
> actualTrDepth--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + actualTrDepth) << 1);
> - bool bFirstQ = ((absPartIdx % qpdiv) == 0);
> + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
> if (!bFirstQ)
> {
> return;
> @@ -1433,6 +1433,7 @@
> uint32_t tuSize = cu->getCUSize(0) >> (actualTrDepth + m_hChromaShift);
> uint32_t stride = fencYuv->getCStride();
> const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> + int sizeIdx = g_convertToBit[tuSize];
>
> for (int chromaId = TEXT_CHROMA; chromaId < MAX_NUM_COMPONENT; chromaId++)
> {
> @@ -1450,8 +1451,8 @@
> pixel* pred = (chromaId == 1) ? predYuv->getCbAddr(absPartIdxC) : predYuv->getCrAddr(absPartIdxC);
> int16_t* residual = (chromaId == 1) ? resiYuv->getCbAddr(absPartIdxC) : resiYuv->getCrAddr(absPartIdxC);
> pixel* recon = (chromaId == 1) ? reconYuv->getCbAddr(absPartIdxC) : reconYuv->getCrAddr(absPartIdxC);
> - uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
> - coeff_t* coeff = (chromaId == 1 ? cu->getCoeffCb() : cu->getCoeffCr()) + numCoeffPerInc * absPartIdxC;
> + uint32_t coeffOffsetC = absPartIdxC << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
> + coeff_t* coeff = (chromaId == 1 ? cu->getCoeffCb() : cu->getCoeffCr()) + coeffOffsetC;
> uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
> pixel* reconIPred = (chromaId == 1) ? cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder) : cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
> uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
> @@ -1474,11 +1475,10 @@
> predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, tuSize, chFmt);
>
> //===== get residual signal =====
> - X265_CHECK(!((uint32_t)(size_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> - X265_CHECK(!((uint32_t)(size_t)pred & (tuSize - 1)), "pred alignment failure\n");
> - X265_CHECK(!((uint32_t)(size_t)residual & (tuSize - 1)), "residual alignment failure\n");
> - int size = g_convertToBit[tuSize];
> - primitives.calcresidual[size](fenc, pred, residual, stride);
> + X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
> + X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
> + X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
> + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
> //--- transform and quantization ---
> uint32_t absSum = 0;
> @@ -1513,7 +1513,7 @@
> {
> int16_t* resiTmp = residual;
> memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
> - primitives.blockfill_s[size](resiTmp, stride, 0);
> + primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
> }
>
> //===== reconstruction =====
> @@ -1521,7 +1521,7 @@
> X265_CHECK(tuSize <= 32, "tuSize out of range\n");
>
> // use square primitive
> - int part = partitionFromSizes(tuSize, tuSize);
> + int part = partitionFromSize(tuSize);
> primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);
> primitives.chroma[CHROMA_444].copy_pp[part](reconIPred, reconIPredStride, recon, stride);
> }
> @@ -1565,7 +1565,7 @@
> uint32_t overallDistY = 0;
> uint32_t candNum;
> uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
> - uint32_t tuSizeIdx = g_convertToBit[tuSize]; // log2(tuSize) - 2
> + uint32_t sizeIdx = g_convertToBit[tuSize]; // log2(tuSize) - 2
> static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
>
> //===== loop over partitions =====
> @@ -1581,7 +1581,7 @@
> pixel* fenc = fencYuv->getLumaAddr(pu, tuSize);
> uint32_t stride = predYuv->getStride();
> uint32_t rdModeList[FAST_UDI_MAX_RDMODE_NUM];
> - int numModesForFullRD = intraModeNumFast[tuSizeIdx];
> + int numModesForFullRD = intraModeNumFast[sizeIdx];
>
> bool doFastSearch = (numModesForFullRD != numModesAvailable);
> if (doFastSearch)
> @@ -1629,6 +1629,7 @@
> scaleTuSize = 32;
> scaleStride = 32;
> costShift = 2;
> + sizeIdx = 5 - 2; // g_convertToBit[scaleTuSize];
>
> // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
> above = aboveScale;
> @@ -1637,11 +1638,10 @@
> leftFiltered = leftScale;
> }
>
> - int log2SizeMinus2 = g_convertToBit[scaleTuSize];
> - pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> + pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>
> // DC
> - primitives.intra_pred[log2SizeMinus2][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> + primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> modeCosts[DC_IDX] = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
>
> pixel *abovePlanar = above;
> @@ -1654,13 +1654,13 @@
> }
>
> // PLANAR
> - primitives.intra_pred[log2SizeMinus2][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
> + primitives.intra_pred[sizeIdx][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
> modeCosts[PLANAR_IDX] = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
>
> // Transpose NxN
> - primitives.transpose[log2SizeMinus2](buf_trans, fenc, scaleStride);
> -
> - primitives.intra_pred_allangs[log2SizeMinus2](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
> + primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
> +
> + primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
>
> for (uint32_t mode = 2; mode < numModesAvailable; mode++)
> {
> @@ -1786,7 +1786,7 @@
> if (pu != numPU - 1)
> {
> uint32_t zorder = cu->getZorderIdxInCU() + partOffset;
> - int part = partitionFromSizes(tuSize, tuSize);
> + int part = partitionFromSize(tuSize);
> pixel* dst = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
> uint32_t dststride = cu->getPic()->getPicYuvRec()->getStride();
> pixel* src = reconYuv->getLumaAddr(partOffset);
> @@ -1844,6 +1844,8 @@
> scaleTuSize = 32;
> costShift = 2;
> }
> + int sizeIdx = g_convertToBit[scaleTuSize];
> + pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>
> TComPattern::initAdiPatternChroma(cu, absPartIdx, trDepth, m_predBuf, 1);
> TComPattern::initAdiPatternChroma(cu, absPartIdx, trDepth, m_predBuf, 2);
> @@ -1866,8 +1868,6 @@
>
> //===== get prediction signal =====
> predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, scaleTuSize, chFmt);
> - int log2SizeMinus2 = g_convertToBit[scaleTuSize];
> - pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> cost += sa8d(fenc, stride, pred, stride) << costShift;
> }
>
> @@ -1980,7 +1980,7 @@
>
> if (!isLastSection(&tuIterator))
> {
> - uint32_t compWidth = (cu->getCUSize(0) >> m_hChromaShift) >> initTrDepth;
> + uint32_t compWidth = (cu->getCUSize(0) >> m_hChromaShift) >> initTrDepth;
> uint32_t compHeight = (cu->getCUSize(0) >> m_vChromaShift) >> initTrDepth;
> uint32_t zorder = cu->getZorderIdxInCU() + tuIterator.m_partOffset;
> pixel* dst = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
> @@ -2662,8 +2662,7 @@
> uint32_t bits = 0, bestBits = 0;
> uint32_t distortion = 0, bdist = 0;
>
> - uint32_t width = cu->getCUSize(0);
> - uint32_t height = cu->getCUSize(0);
> + uint32_t cuSize = cu->getCUSize(0);
>
> // No residual coding : SKIP mode
> if (bSkipRes)
> @@ -2672,10 +2671,10 @@
>
> predYuv->copyToPartYuv(outReconYuv, 0);
> // Luma
> - int part = partitionFromSizes(width, height);
> + int part = partitionFromSize(cuSize);
> distortion = primitives.sse_pp[part](fencYuv->getLumaAddr(), fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
> // Chroma
> - part = partitionFromSizes(width >> m_hChromaShift, height >> m_vChromaShift);
> + part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
> distortion += m_rdCost->scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(), fencYuv->getCStride(), outReconYuv->getCbAddr(), outReconYuv->getCStride()));
> distortion += m_rdCost->scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(), fencYuv->getCStride(), outReconYuv->getCrAddr(), outReconYuv->getCStride()));
>
> @@ -2695,9 +2694,9 @@
> cu->m_totalDistortion = distortion;
> if (m_rdCost->psyRdEnabled())
> {
> - int size = g_convertToBit[cu->getCUSize(0)];
> + int size = g_convertToBit[cuSize];
> uint32_t psyRdCost = m_rdCost->psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(),
> - outReconYuv->getLumaAddr(), outReconYuv->getStride());
> + outReconYuv->getLumaAddr(), outReconYuv->getStride());
> cu->m_totalCost = m_rdCost->calcPsyRdCost(cu->m_totalDistortion, cu->m_totalBits, psyRdCost);
> }
> else
> @@ -2718,7 +2717,7 @@
> bits = 0;
> distortion = 0;
>
> - outResiYuv->subtract(fencYuv, predYuv, width);
> + outResiYuv->subtract(fencYuv, predYuv, cuSize);
> m_rdGoOnSbacCoder->load(m_rdSbacCoders[cu->getDepth(0)][CI_CURR_BEST]);
> xEstimateResidualQT(cu, 0, outResiYuv, cu->getDepth(0), cost, bits, distortion, &zeroDistortion, curUseRDOQ);
>
> @@ -2739,9 +2738,9 @@
> ::memset(cu->getCbf(TEXT_LUMA), 0, qpartnum * sizeof(uint8_t));
> ::memset(cu->getCbf(TEXT_CHROMA_U), 0, qpartnum * sizeof(uint8_t));
> ::memset(cu->getCbf(TEXT_CHROMA_V), 0, qpartnum * sizeof(uint8_t));
> - ::memset(cu->getCoeffY(), 0, width * height * sizeof(coeff_t));
> - ::memset(cu->getCoeffCb(), 0, width * height * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> - ::memset(cu->getCoeffCr(), 0, width * height * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> + ::memset(cu->getCoeffY(), 0, cuSize * cuSize * sizeof(coeff_t));
> + ::memset(cu->getCoeffCb(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> + ::memset(cu->getCoeffCr(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
> cu->setTransformSkipSubParts(0, 0, 0, 0, cu->getDepth(0));
> }
> else
> @@ -2771,7 +2770,7 @@
>
> if (cu->getQtRootCbf(0))
> {
> - outReconYuv->addClip(predYuv, outBestResiYuv, width);
> + outReconYuv->addClip(predYuv, outBestResiYuv, cuSize);
> }
> else
> {
> @@ -2779,16 +2778,16 @@
> }
>
> // update with clipped distortion and cost (qp estimation loop uses unclipped values)
> - int part = partitionFromSizes(width, height);
> + int part = partitionFromSize(cuSize);
> bdist = primitives.sse_pp[part](fencYuv->getLumaAddr(), fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
> - part = partitionFromSizes(width >> cu->getHorzChromaShift(), height >> cu->getVertChromaShift());
> + part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
> bdist += m_rdCost->scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(), fencYuv->getCStride(), outReconYuv->getCbAddr(), outReconYuv->getCStride()));
> bdist += m_rdCost->scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(), fencYuv->getCStride(), outReconYuv->getCrAddr(), outReconYuv->getCStride()));
> if (m_rdCost->psyRdEnabled())
> {
> - int size = g_convertToBit[cu->getCUSize(0)];
> + int size = g_convertToBit[cuSize];
> uint32_t psyRdCost = m_rdCost->psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(),
> - outReconYuv->getLumaAddr(), outReconYuv->getStride());
> + outReconYuv->getLumaAddr(), outReconYuv->getStride());
> bcost = m_rdCost->calcPsyRdCost(bdist, bestBits, psyRdCost);
> }
> else
> @@ -2847,8 +2846,7 @@
> {
> X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "invalid depth\n");
> const uint32_t trMode = depth - cu->getDepth(0);
> - const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> - uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> + const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> const uint32_t setCbf = 1 << trMode;
> int chFmt = cu->getChromaFormat();
>
> @@ -2861,34 +2859,35 @@
> const bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
> X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
> - bool bCodeChroma = true;
> - uint32_t trModeC = trMode;
> - if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> - {
> - trSizeCLog2++;
> - trModeC--;
> - uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> - bCodeChroma = ((absPartIdx % qpdiv) == 0);
> - }
> -
> - const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> - uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
> -
> // code full block
> uint32_t absSumY = 0, absSumU = 0, absSumV = 0;
> int lastPosY = -1, lastPosU = -1, lastPosV = -1;
> if (bCheckFull)
> {
> - const uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> -
> - coeff_t *coeffCurY = cu->getCoeffY() + (numCoeffPerAbsPartIdxIncrement * absPartIdx);
> - coeff_t *coeffCurU = cu->getCoeffCb() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> - coeff_t *coeffCurV = cu->getCoeffCr() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> -
> - int trWidth = 0, trHeight = 0, trWidthC = 0, trHeightC = 0;
> -
> - trWidth = trHeight = 1 << trSizeLog2;
> - trWidthC = trHeightC = 1 << trSizeCLog2;
> + uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> + bool bCodeChroma = true;
> + uint32_t trModeC = trMode;
> + if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> + {
> + trSizeCLog2++;
> + trModeC--;
> + uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> + }
> +
> + const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> + uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
> +
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> + coeff_t *coeffCurY = cu->getCoeffY() + coeffOffsetY;
> + coeff_t *coeffCurU = cu->getCoeffCb() + coeffOffsetC;
> + coeff_t *coeffCurV = cu->getCoeffCr() + coeffOffsetC;
> +
> + uint32_t trSize = 1 << trSizeLog2;
> + uint32_t trSizeC = 1 << trSizeCLog2;
> + uint32_t sizeIdx = trSizeLog2 - 2;
> + uint32_t sizeIdxC = trSizeCLog2 - 2;
> cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
>
> cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
> @@ -2897,7 +2896,7 @@
> m_trQuant->selectLambda(TEXT_LUMA);
>
> absSumY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> - trWidth, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
> + trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
>
> cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -2909,13 +2908,12 @@
>
> int scalingListType = 3 + TEXT_LUMA;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, resiYuv->m_width, coeffCurY, trWidth, scalingListType, false, lastPosY); //this is for inter mode only
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, resiYuv->m_width, coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
> }
> else
> {
> int16_t *ptr = resiYuv->getLumaAddr(absPartIdx);
> - X265_CHECK(trWidth == trHeight, "square transform expected\n");
> - primitives.blockfill_s[(int)g_convertToBit[trWidth]](ptr, resiYuv->m_width, 0);
> + primitives.blockfill_s[sizeIdx](ptr, resiYuv->m_width, 0);
> }
> cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -2924,13 +2922,10 @@
> TComTURecurse tuIterator;
> initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> - uint32_t widthC = trWidthC;
> - uint32_t heightC = trWidthC;
> -
> do
> {
> uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> - uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> + uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -2941,12 +2936,12 @@
> m_trQuant->selectLambda(TEXT_CHROMA);
>
> absSumU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> - trWidthC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
> + trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
>
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> absSumV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> - trWidthC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
> + trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
>
> cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -2960,13 +2955,12 @@
>
> int scalingListType = 3 + TEXT_CHROMA_U;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset, trWidthC, scalingListType, false, lastPosU);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset, trSizeC, scalingListType, false, lastPosU);
> }
> else
> {
> int16_t *ptr = resiYuv->getCbAddr(absPartIdxC);
> - X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> - primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, resiYuv->m_cwidth, 0);
> + primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);
> }
> if (absSumV)
> {
> @@ -2976,13 +2970,12 @@
>
> int scalingListType = 3 + TEXT_CHROMA_V;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset, trWidthC, scalingListType, false, lastPosV);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset, trSizeC, scalingListType, false, lastPosV);
> }
> else
> {
> int16_t *ptr = resiYuv->getCrAddr(absPartIdxC);
> - X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> - primitives.blockfill_s[(int)g_convertToBit[trWidthC]](ptr, resiYuv->m_cwidth, 0);
> + primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);
> }
> cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3040,8 +3033,7 @@
> {
> X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "depth not matching\n");
> const uint32_t trMode = depth - cu->getDepth(0);
> - const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> - uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> + const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> const uint32_t subTUDepth = trMode + 1;
> const uint32_t setCbf = 1 << trMode;
> int chFmt = cu->getChromaFormat();
> @@ -3055,6 +3047,7 @@
> const bool bCheckSplit = (trSizeLog2 > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
> X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
> + uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> bool bCodeChroma = true;
> uint32_t trModeC = trMode;
> if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -3062,7 +3055,7 @@
> trSizeCLog2++;
> trModeC--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> - bCodeChroma = ((absPartIdx % qpdiv) == 0);
> + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> }
>
> // code full block
> @@ -3080,22 +3073,20 @@
> uint32_t bestsubTUCBF[MAX_NUM_COMPONENT][2];
> m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
>
> - int trWidth = 0, trHeight = 0, trWidthC = 0, trHeightC = 0;
> -
> - trWidth = trHeight = 1 << trSizeLog2;
> - trWidthC = 1 << trSizeCLog2;
> - trHeightC = (chFmt == CHROMA_422) ? (trWidthC << 1) : trWidthC;
> + uint32_t trSize = 1 << trSizeLog2;
> const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
>
> // code full block
> if (bCheckFull)
> {
> - const uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> + uint32_t trSizeC = 1 << trSizeCLog2;
> const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> - coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx);
> - coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> - coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> + coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> + coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> + coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
>
> cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
> bool checkTransformSkip = cu->getSlice()->getPPS()->getUseTransformSkip() && !cu->getCUTransquantBypass(0);
> @@ -3106,21 +3097,21 @@
>
> if (m_cfg->bEnableRDOQ && curuseRDOQ)
> {
> - m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trWidth, TEXT_LUMA);
> + m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSize, TEXT_LUMA);
> }
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> m_trQuant->selectLambda(TEXT_LUMA);
>
> absSum[TEXT_LUMA][0] = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> - trWidth, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
> + trSize, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
>
> cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> m_entropyCoder->resetBits();
> - m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
> if (absSum[TEXT_LUMA][0])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
> singleBitsComp[TEXT_LUMA][0] = m_entropyCoder->getNumberOfWrittenBits();
>
> uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
> @@ -3130,20 +3121,17 @@
> TComTURecurse tuIterator;
> initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> - uint32_t widthC = trWidthC;
> - uint32_t heightC = splitIntoSubTUs ? (trHeightC >> 1) : trHeightC;
> -
> do
> {
> uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> - uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> + uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> if (m_cfg->bEnableRDOQ && curuseRDOQ)
> {
> - m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, widthC, TEXT_CHROMA);
> + m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSizeC, TEXT_CHROMA);
> }
> //Cb transform
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3152,24 +3140,24 @@
> m_trQuant->selectLambda(TEXT_CHROMA);
>
> absSum[TEXT_CHROMA_U][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> - widthC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
> + trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
> //Cr transform
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> absSum[TEXT_CHROMA_V][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> - widthC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
> + trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
>
> cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> - m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_U, trMode, true);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_U, trMode, true);
> if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_U);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_U);
> singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsPrev;
>
> - m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_V, trMode, true);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_V, trMode, true);
> if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_V);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_V);
> uint32_t newBits = m_entropyCoder->getNumberOfWrittenBits();
> singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = newBits - (singleBitsPrev + singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
>
> @@ -3187,8 +3175,8 @@
> minCost[TEXT_CHROMA_V][subTUIndex] = MAX_INT64;
> }
>
> - int partSize = partitionFromSizes(trWidth, trHeight);
> - uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trWidth);
> + int partSize = partitionFromSize(trSize);
> + uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);
>
> if (outZeroDist)
> {
> @@ -3203,7 +3191,7 @@
> int scalingListType = 3 + TEXT_LUMA;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE, coeffCurY, trWidth, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE, coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
>
> const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx), MAX_CU_SIZE);
> if (cu->isLosslessCoded(0))
> @@ -3250,8 +3238,8 @@
> {
> int16_t *ptr = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
> X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
> - X265_CHECK(trWidth == trHeight, "not square block\n");
> - primitives.blockfill_s[(int)g_convertToBit[trWidth]](ptr, MAX_CU_SIZE, 0);
> + int sizeIdx = trSizeLog2 - 2;
> + primitives.blockfill_s[sizeIdx](ptr, MAX_CU_SIZE, 0);
> }
> cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> @@ -3262,18 +3250,15 @@
> TComTURecurse tuIterator;
> initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> - uint32_t widthC = trWidthC;
> - uint32_t heightC = splitIntoSubTUs ? (trHeightC >> 1) : trHeightC;
> -
> - int partSizeC = partitionFromSizes(widthC, heightC);
> - const uint32_t numSamplesChroma = widthC * heightC;
> + int partSizeC = partitionFromSize(trSizeC);
> + const uint32_t numSamplesChroma = trSizeC * trSizeC;
>
> do
> {
> uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> - uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> -
> - distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, widthC));
> + uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
> +
> + distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
>
> if (outZeroDist)
> {
> @@ -3289,7 +3274,7 @@
> int scalingListType = 3 + TEXT_CHROMA_U;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,
> - widthC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
> + trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
> m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3339,11 +3324,11 @@
> {
> int16_t *ptr = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
> - X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> - primitives.blockfill_s[(int)g_convertToBit[widthC]](ptr, stride, 0);
> + int sizeIdxC = trSizeCLog2 - 2;
> + primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
> }
>
> - distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, widthC));
> + distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
> if (outZeroDist)
> {
> *outZeroDist += distV;
> @@ -3357,7 +3342,7 @@
> int scalingListType = 3 + TEXT_CHROMA_V;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,
> - widthC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
> + trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
> m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3407,8 +3392,8 @@
> {
> int16_t *ptr = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
> - X265_CHECK(widthC == heightC, "square chroma transform expected\n");
> - primitives.blockfill_s[(int)g_convertToBit[widthC]](ptr, stride, 0);
> + int sizeIdxC = trSizeCLog2 - 2;
> + primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
> }
>
> cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3430,9 +3415,9 @@
> memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numSamplesLuma);
>
> int16_t bestResiY[32 * 32];
> - for (int i = 0; i < trHeight; ++i)
> + for (int i = 0; i < trSize; ++i)
> {
> - memcpy(bestResiY + i * trWidth, curResiY + i * MAX_CU_SIZE, sizeof(int16_t) * trWidth);
> + memcpy(bestResiY + i * trSize, curResiY + i * MAX_CU_SIZE, sizeof(int16_t) * trSize);
> }
>
> m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
> @@ -3441,21 +3426,21 @@
>
> if (m_cfg->bEnableRDOQTS)
> {
> - m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trWidth, TEXT_LUMA);
> + m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSize, TEXT_LUMA);
> }
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
>
> m_trQuant->selectLambda(TEXT_LUMA);
> absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> - trWidth, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
> + trSize, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
> cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> if (absSumTransformSkipY)
> {
> m_entropyCoder->resetBits();
> - m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
> const uint32_t skipSingleBitsY = m_entropyCoder->getNumberOfWrittenBits();
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> @@ -3464,7 +3449,7 @@
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
>
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE, coeffCurY, trWidth, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE, coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
>
> nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
> m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx),
> @@ -3477,9 +3462,9 @@
> {
> cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
> memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numSamplesLuma);
> - for (int i = 0; i < trHeight; ++i)
> + for (int i = 0; i < trSize; ++i)
> {
> - memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i * trWidth], sizeof(int16_t) * trWidth);
> + memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i * trSize], sizeof(int16_t) * trSize);
> }
> }
> else
> @@ -3503,16 +3488,13 @@
> TComTURecurse tuIterator;
> initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
>
> - uint32_t widthC = trWidthC;
> - uint32_t heightC = splitIntoSubTUs ? (trHeightC >> 1) : trHeightC;
> -
> - int partSizeC = partitionFromSizes(widthC, heightC);
> - const uint32_t numSamplesChroma = widthC * heightC;
> + int partSizeC = partitionFromSize(trSizeC);
> + const uint32_t numSamplesChroma = trSizeC * trSizeC;
>
> do
> {
> uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
> - uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
> + uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
>
> int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
> int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
> @@ -3523,10 +3505,10 @@
> memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numSamplesChroma);
>
> int16_t bestResiU[32 * 32], bestResiV[32 * 32];
> - for (int i = 0; i < heightC; ++i)
> + for (int i = 0; i < trSizeC; ++i)
> {
> - memcpy(&bestResiU[i * widthC], curResiU + i * stride, sizeof(int16_t) * widthC);
> - memcpy(&bestResiV[i * widthC], curResiV + i * stride, sizeof(int16_t) * widthC);
> + memcpy(&bestResiU[i * trSizeC], curResiU + i * stride, sizeof(int16_t) * trSizeC);
> + memcpy(&bestResiV[i * trSizeC], curResiV + i * stride, sizeof(int16_t) * trSizeC);
> }
>
> cu->setTransformSkipPartRange(1, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3534,7 +3516,7 @@
>
> if (m_cfg->bEnableRDOQTS)
> {
> - m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, widthC, TEXT_CHROMA);
> + m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, trSizeC, TEXT_CHROMA);
> }
>
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3542,11 +3524,11 @@
> m_trQuant->selectLambda(TEXT_CHROMA);
>
> absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
> - widthC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
> + trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
> - widthC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
> + trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
>
> cu->setCbfPartRange(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> cu->setCbfPartRange(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> @@ -3556,8 +3538,8 @@
>
> if (absSumTransformSkipU)
> {
> - m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_U, trMode, true);
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_U);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_U, trMode, true);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_U);
> singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits();
>
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> @@ -3566,7 +3548,7 @@
> int scalingListType = 3 + TEXT_CHROMA_U;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,
> - widthC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
> + trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
> m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
> m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3579,9 +3561,9 @@
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numSamplesChroma);
> - for (int i = 0; i < heightC; ++i)
> + for (int i = 0; i < trSizeC; ++i)
> {
> - memcpy(curResiU + i * stride, &bestResiU[i * widthC], sizeof(int16_t) * widthC);
> + memcpy(curResiU + i * stride, &bestResiU[i * trSizeC], sizeof(int16_t) * trSizeC);
> }
> }
> else
> @@ -3593,8 +3575,8 @@
>
> if (absSumTransformSkipV)
> {
> - m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, widthC, heightC, TEXT_CHROMA_V, trMode, true);
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, widthC, TEXT_CHROMA_V);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdxC, tuIterator.m_absPartIdxStep, trSizeC, trSizeC, TEXT_CHROMA_V, trMode, true);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUBufferOffset, absPartIdxC, trSizeC, TEXT_CHROMA_V);
> singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section];
>
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> @@ -3603,7 +3585,7 @@
> int scalingListType = 3 + TEXT_CHROMA_V;
> X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,
> - widthC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
> + trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
> m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
> m_qtTempShortYuv[qtlayer].m_cwidth);
> @@ -3616,9 +3598,9 @@
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numSamplesChroma);
> - for (int i = 0; i < heightC; ++i)
> + for (int i = 0; i < trSizeC; ++i)
> {
> - memcpy(curResiV + i * stride, &bestResiV[i * widthC], sizeof(int16_t) * widthC);
> + memcpy(curResiV + i * stride, &bestResiV[i * trSizeC], sizeof(int16_t) * trSizeC);
> }
> }
> else
> @@ -3651,36 +3633,37 @@
> offsetSubTUCBFs(cu, TEXT_CHROMA_V, trMode, absPartIdx);
> }
>
> - m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, trMode, true);
> - m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, trMode, true);
> + uint32_t trHeightC = (chFmt == CHROMA_422) ? (trSizeC << 1) : trSizeC;
> + m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trSizeC, trHeightC, TEXT_CHROMA_U, trMode, true);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdx, absPartIdxStep, trSizeC, trHeightC, TEXT_CHROMA_V, trMode, true);
> }
>
> - m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
> if (absSum[TEXT_LUMA][0])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
>
> if (bCodeChroma)
> {
> if (!splitIntoSubTUs)
> {
> if (absSum[TEXT_CHROMA_U][0])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trWidthC, TEXT_CHROMA_U);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trSizeC, TEXT_CHROMA_U);
> if (absSum[TEXT_CHROMA_V][0])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trWidthC, TEXT_CHROMA_V);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trSizeC, TEXT_CHROMA_V);
> }
> else
> {
> - uint32_t subTUSize = trWidthC * trWidthC;
> + uint32_t subTUSize = trSizeC * trSizeC;
> uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
>
> if (absSum[TEXT_CHROMA_U][0])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trWidthC, TEXT_CHROMA_U);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, trSizeC, TEXT_CHROMA_U);
> if (absSum[TEXT_CHROMA_U][1])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, trWidthC, TEXT_CHROMA_U);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, trSizeC, TEXT_CHROMA_U);
> if (absSum[TEXT_CHROMA_V][0])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trWidthC, TEXT_CHROMA_V);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, trSizeC, TEXT_CHROMA_V);
> if (absSum[TEXT_CHROMA_V][1])
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, trWidthC, TEXT_CHROMA_V);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, trSizeC, TEXT_CHROMA_V);
> }
> }
>
> @@ -3842,7 +3825,7 @@
> const uint32_t curTrMode = depth - cu->getDepth(0);
> const uint32_t trMode = cu->getTransformIdx(absPartIdx);
> const bool bSubdiv = curTrMode != trMode;
> - const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> + const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> int chFmt = cu->getChromaFormat();
> const bool splitIntoSubTUs = (chFmt == CHROMA_422);
> @@ -3855,8 +3838,7 @@
> X265_CHECK(cu->getPredictionMode(absPartIdx) != MODE_INTRA, "xEncodeResidualQT() with intra block\n");
>
> bool mCodeAll = true;
> - uint32_t trWidth = 1 << trSizeLog2;
> - uint32_t trHeight = trWidth;
> + uint32_t trSize = 1 << trSizeLog2;
> uint32_t trWidthC = 1 << trSizeCLog2;
> uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
>
> @@ -3891,13 +3873,11 @@
> if (!bSubdiv)
> {
> //Luma
> - const uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> - coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + numCoeffPerAbsPartIdxIncrement * absPartIdx;
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t *coeffCurY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
>
> //Chroma
> - coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> - coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> bool bCodeChroma = true;
> uint32_t trModeC = trMode;
> if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
> @@ -3905,21 +3885,24 @@
> trSizeCLog2++;
> trModeC--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((depth - 1) << 1);
> - bCodeChroma = ((absPartIdx % qpdiv) == 0);
> + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> }
>
> if (bSubdivAndCbf)
> {
> - m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trWidth, trHeight, TEXT_LUMA, trMode, true);
> + m_entropyCoder->encodeQtCbf(cu, absPartIdx, 0, trSize, trSize, TEXT_LUMA, trMode, true);
> }
> else
> {
> if (ttype == TEXT_LUMA && cu->getCbf(absPartIdx, TEXT_LUMA, trMode))
> {
> - m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trWidth, TEXT_LUMA);
> + m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, trSize, TEXT_LUMA);
> }
> if (bCodeChroma)
> {
> + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> + coeff_t *coeffCurU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> + coeff_t *coeffCurV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> uint32_t trSizeC = 1 << trSizeCLog2;
>
> if (!splitIntoSubTUs)
> @@ -3977,10 +3960,10 @@
> if (curTrMode == trMode)
> {
> int chFmt = cu->getChromaFormat();
> - const uint32_t trSizeLog2 = g_convertToBit[cu->getSlice()->getSPS()->getMaxCUSize() >> depth] + 2;
> - uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> + const uint32_t trSizeLog2 = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
> const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>
> + uint32_t trSizeCLog2 = trSizeLog2 - m_hChromaShift;
> bool bCodeChroma = true;
> bool bChromaSame = false;
> uint32_t trModeC = trMode;
> @@ -3989,15 +3972,14 @@
> trSizeCLog2++;
> trModeC--;
> uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
> - bCodeChroma = ((absPartIdx % qpdiv) == 0);
> + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
> bChromaSame = true;
> }
>
> if (bSpatial)
> {
> - uint32_t trWidth = 1 << trSizeLog2;
> - uint32_t trHeight = 1 << trSizeLog2;
> - m_qtTempShortYuv[qtlayer].copyPartToPartLuma(resiYuv, absPartIdx, trWidth, trHeight);
> + uint32_t trSize = 1 << trSizeLog2;
> + m_qtTempShortYuv[qtlayer].copyPartToPartLuma(resiYuv, absPartIdx, trSize, trSize);
>
> if (bCodeChroma)
> {
> @@ -4006,21 +3988,20 @@
> }
> else
> {
> - uint32_t numCoeffPerAbsPartIdxIncrement = cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1);
> - uint32_t numCoeffY = (1 << (trSizeLog2 << 1));
> - coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + numCoeffPerAbsPartIdxIncrement * absPartIdx;
> - coeff_t* coeffDstY = cu->getCoeffY() + numCoeffPerAbsPartIdxIncrement * absPartIdx;
> + uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
> + uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
> + coeff_t* coeffSrcY = m_qtTempCoeffY[qtlayer] + coeffOffsetY;
> + coeff_t* coeffDstY = cu->getCoeffY() + coeffOffsetY;
> ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
> if (bCodeChroma)
> {
> - uint32_t trWidthC = 1 << trSizeCLog2;
> - uint32_t trHeightC = (chFmt == CHROMA_422) ? (trWidthC << 1) : trWidthC;
> - uint32_t numCoeffC = trWidthC * trHeightC;
> -
> - coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> - coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> - coeff_t* coeffDstU = cu->getCoeffCb() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> - coeff_t* coeffDstV = cu->getCoeffCr() + (numCoeffPerAbsPartIdxIncrement * absPartIdx >> (m_hChromaShift + m_vChromaShift));
> + uint32_t numCoeffC = 1 << (trSizeCLog2 * 2 + (chFmt == CHROMA_422));
> + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> +
> + coeff_t* coeffSrcU = m_qtTempCoeffCb[qtlayer] + coeffOffsetC;
> + coeff_t* coeffSrcV = m_qtTempCoeffCr[qtlayer] + coeffOffsetC;
> + coeff_t* coeffDstU = cu->getCoeffCb() + coeffOffsetC;
> + coeff_t* coeffDstV = cu->getCoeffCr() + coeffOffsetC;
> ::memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
> ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
> }
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/Lib/TLibEncoder/TEncSearch.h
> --- a/source/Lib/TLibEncoder/TEncSearch.h Thu May 22 21:46:21 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSearch.h Fri May 23 13:34:51 2014 +0900
> @@ -229,8 +229,8 @@
>
> void xStoreIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx);
> void xLoadIntraResultQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx);
> - void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs);
> - void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t stateU0V1Both2, const bool splitIntoSubTUs);
> + void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
> + void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
>
> // --------------------------------------------------------------------------------------------
> // Inter search (AMP)
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/pixel.cpp Fri May 23 13:34:51 2014 +0900
> @@ -1151,11 +1151,11 @@
> p.calcrecon[BLOCK_32x32] = calcRecons<32>;
> p.calcrecon[BLOCK_64x64] = NULL;
>
> - p.transpose[0] = transpose<4>;
> - p.transpose[1] = transpose<8>;
> - p.transpose[2] = transpose<16>;
> - p.transpose[3] = transpose<32>;
> - p.transpose[4] = transpose<64>;
> + p.transpose[BLOCK_4x4] = transpose<4>;
> + p.transpose[BLOCK_8x8] = transpose<8>;
> + p.transpose[BLOCK_16x16] = transpose<16>;
> + p.transpose[BLOCK_32x32] = transpose<32>;
> + p.transpose[BLOCK_64x64] = transpose<64>;
>
> p.weight_pp = weight_pp_c;
> p.weight_sp = weight_sp_c;
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/primitives.cpp
> --- a/source/common/primitives.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/primitives.cpp Fri May 23 13:34:51 2014 +0900
> @@ -29,7 +29,7 @@
> namespace x265 {
> // x265 private namespace
>
> -uint8_t lumaPartitioneMapTable[] =
> +extern const uint8_t lumaPartitionMapTable[] =
> {
> // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64
> LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4
> @@ -50,6 +50,11 @@
> 255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64
> };
>
> +extern const uint8_t lumaSquarePartitionMapTable[] =
> +{
> + LUMA_4x4, LUMA_8x8, 255, LUMA_16x16, 255, 255, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_64x64
> +};
> +
> /* the "authoritative" set of encoder primitives */
> EncoderPrimitives primitives;
>
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/primitives.h
> --- a/source/common/primitives.h Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/primitives.h Fri May 23 13:34:51 2014 +0900
> @@ -103,14 +103,23 @@
> inline int partitionFromSizes(int width, int height)
> {
> X265_CHECK(((width | height) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n");
> - extern uint8_t lumaPartitioneMapTable[];
> + extern const uint8_t lumaPartitionMapTable[];
> int w = (width >> 2) - 1;
> int h = (height >> 2) - 1;
> - int part = (int)lumaPartitioneMapTable[(w << 4) + h];
> + int part = (int)lumaPartitionMapTable[(w << 4) + h];
> X265_CHECK(part != 255, "Invalid block width %d height %d\n", width, height);
> return part;
> }
>
> +inline int partitionFromSize(int size)
> +{
> + X265_CHECK((size & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block size\n");
> + extern const uint8_t lumaSquarePartitionMapTable[];
> + int part = (int)lumaSquarePartitionMapTable[(size >> 2) - 1];
> + X265_CHECK(part != 255, "Invalid block size %d\n", size);
> + return part;
> +}
> +
> typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
> typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
> typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/shortyuv.cpp
> --- a/source/common/shortyuv.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/shortyuv.cpp Fri May 23 13:34:51 2014 +0900
> @@ -84,7 +84,7 @@
>
> void ShortYuv::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t partSize)
> {
> - int part = partitionFromSizes(partSize, partSize);
> + int part = partitionFromSize(partSize);
>
> pixel* srcY0 = srcYuv0->getLumaAddr();
> pixel* srcY1 = srcYuv1->getLumaAddr();
> @@ -136,7 +136,7 @@
>
> void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
> {
> - int part = partitionFromSizes(lumaSize, lumaSize);
> + int part = partitionFromSize(lumaSize);
>
> part = ((part == 0) && (m_csp == CHROMA_422)) ? 1 : part;
> int16_t* srcU = getCbAddr(partIdx);
> @@ -158,7 +158,7 @@
>
> void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
> {
> - int part = partitionFromSizes(lumaSize, lumaSize);
> + int part = partitionFromSize(lumaSize);
> int16_t* srcU = getCbAddr(partIdx);
> int16_t* srcV = getCrAddr(partIdx);
> pixel* dstU = dstPicYuv->getCbAddr(partIdx);
> @@ -181,7 +181,7 @@
>
> void ShortYuv::copyPartToPartShortChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId)
> {
> - int part = partitionFromSizes(lumaSize, lumaSize);
> + int part = partitionFromSize(lumaSize);
>
> if (chromaId == 0)
> {
> @@ -214,7 +214,9 @@
>
> void ShortYuv::copyPartToPartYuvChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs)
> {
> - int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSizes(lumaSize, lumaSize);
> + assert(chromaId == 1 || chromaId == 2);
> +
> + int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSize(lumaSize);
>
> if (chromaId == 1)
> {
> @@ -224,7 +226,7 @@
> uint32_t dstStride = dstPicYuv->getCStride();
> primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
> }
> - else if (chromaId == 2)
> + else
> {
> int16_t* srcV = getCrAddr(partIdx);
> pixel* dstV = dstPicYuv->getCrAddr(partIdx);
> @@ -232,16 +234,4 @@
> uint32_t dstStride = dstPicYuv->getCStride();
> primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
> }
> - else
> - {
> - int16_t* srcU = getCbAddr(partIdx);
> - int16_t* srcV = getCrAddr(partIdx);
> - pixel* dstU = dstPicYuv->getCbAddr(partIdx);
> - pixel* dstV = dstPicYuv->getCrAddr(partIdx);
> -
> - uint32_t srcStride = m_cwidth;
> - uint32_t dstStride = dstPicYuv->getCStride();
> - primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
> - primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
> - }
> }
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/common/vec/blockcopy-sse3.cpp
> --- a/source/common/vec/blockcopy-sse3.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/common/vec/blockcopy-sse3.cpp Fri May 23 13:34:51 2014 +0900
> @@ -30,7 +30,7 @@
> #if HIGH_BIT_DEPTH
> void blockcopy_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride)
> {
> - if ((bx & 7) || (((size_t)dst | (size_t)src | sstride | dstride) & 15))
> + if ((bx & 7) || (((intptr_t)dst | (intptr_t)src | sstride | dstride) & 15))
> {
> // slow path, irregular memory alignments or sizes
> for (int y = 0; y < by; y++)
> @@ -60,7 +60,7 @@
> #else // if HIGH_BIT_DEPTH
> void blockcopy_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride)
> {
> - size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;
> + intptr_t aligncheck = (intptr_t)dst | (intptr_t)src | bx | sstride | dstride;
>
> if (!(aligncheck & 15))
> {
> @@ -91,7 +91,7 @@
>
> void blockcopy_ps(int bx, int by, pixel *dst, intptr_t dstride, int16_t *src, intptr_t sstride)
> {
> - size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;
> + intptr_t aligncheck = (intptr_t)dst | (intptr_t)src | bx | sstride | dstride;
>
> if (!(aligncheck & 15))
> {
> @@ -134,7 +134,7 @@
>
> void pixeladd_ss(int bx, int by, int16_t *dst, intptr_t dstride, int16_t *src0, int16_t *src1, intptr_t sstride0, intptr_t sstride1)
> {
> - size_t aligncheck = (size_t)dst | (size_t)src0 | sstride0 | sstride1 | dstride;
> + intptr_t aligncheck = (intptr_t)dst | (intptr_t)src0 | sstride0 | sstride1 | dstride;
>
> if (!(aligncheck & 15) && !(bx & 7))
> {
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/encoder/compress.cpp
> --- a/source/encoder/compress.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/encoder/compress.cpp Fri May 23 13:34:51 2014 +0900
> @@ -138,8 +138,8 @@
> leftFiltered = leftScale;
> }
>
> - int log2SizeMinus2 = g_convertToBit[scaleTuSize];
> - pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> + int sizeIdx = g_convertToBit[scaleTuSize];
> + pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
>
> uint32_t preds[3];
> cu->getIntraDirLumaPredictor(partOffset, preds);
> @@ -148,7 +148,7 @@
> uint32_t rbits = m_search->xModeBitsRemIntra(cu, partOffset, depth, preds, mpms);
>
> // DC
> - primitives.intra_pred[log2SizeMinus2][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> + primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
> bsad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
> bmode = mode = DC_IDX;
> bbits = !(mpms & ((uint64_t)1 << mode)) ? rbits : m_search->xModeBitsIntra(cu, mode, partOffset, depth);
> @@ -164,7 +164,7 @@
> }
>
> // PLANAR
> - primitives.intra_pred[log2SizeMinus2][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
> + primitives.intra_pred[sizeIdx][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
> sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
> mode = PLANAR_IDX;
> bits = !(mpms & ((uint64_t)1 << mode)) ? rbits : m_search->xModeBitsIntra(cu, mode, partOffset, depth);
> @@ -172,9 +172,9 @@
> COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
>
> // Transpose NxN
> - primitives.transpose[log2SizeMinus2](buf_trans, fenc, scaleStride);
> + primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
>
> - primitives.intra_pred_allangs[log2SizeMinus2](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
> + primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
>
> for (mode = 2; mode < 35; mode++)
> {
> @@ -211,9 +211,9 @@
> outTempCU->m_totalBits = 0;
> if (m_search->predInterSearch(outTempCU, outPredYuv, bUseMRG, false))
> {
> - int part = g_convertToBit[outTempCU->getCUSize(0)];
> - uint32_t distortion = primitives.sa8d[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> - outPredYuv->getLumaAddr(), outPredYuv->getStride());
> + int sizeIdx = g_convertToBit[outTempCU->getCUSize(0)];
> + uint32_t distortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> + outPredYuv->getLumaAddr(), outPredYuv->getStride());
> outTempCU->m_totalDistortion = distortion;
> outTempCU->m_totalCost = m_rdCost->calcRdSADCost(distortion, outTempCU->m_totalBits);
> }
> @@ -243,7 +243,7 @@
> outBestCU->setPredModeSubParts(MODE_INTER, 0, depth);
> outBestCU->setMergeFlag(0, true);
>
> - int part = g_convertToBit[outTempCU->getCUSize(0)];
> + int sizeIdx = g_convertToBit[outTempCU->getCUSize(0)];
> int bestMergeCand = -1;
>
> for (uint32_t mergeCand = 0; mergeCand < maxNumMergeCand; ++mergeCand)
> @@ -262,8 +262,8 @@
> m_search->motionCompensation(outTempCU, m_tmpPredYuv[depth], REF_PIC_LIST_X, 0, true, false);
> uint32_t bitsCand = getTUBits(mergeCand, maxNumMergeCand);
> outTempCU->m_totalBits = bitsCand;
> - outTempCU->m_totalDistortion = primitives.sa8d[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> - m_tmpPredYuv[depth]->getLumaAddr(), m_tmpPredYuv[depth]->getStride());
> + outTempCU->m_totalDistortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> + m_tmpPredYuv[depth]->getLumaAddr(), m_tmpPredYuv[depth]->getStride());
> outTempCU->m_totalCost = m_rdCost->calcRdSADCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
>
> if (outTempCU->m_totalCost < outBestCU->m_totalCost)
> @@ -866,7 +866,7 @@
> uint32_t src2stride = m_bestPredYuv[0]->getStride();
> uint32_t src1stride = m_origYuv[0]->getStride();
> uint32_t dststride = m_tmpResiYuv[depth]->m_width;
> - int part = partitionFromSizes(cu->getCUSize(0), cu->getCUSize(0));
> + int part = partitionFromSize(cu->getCUSize(0));
> primitives.luma_sub_ps[part](dst, dststride, src1, src2, src1stride, src2stride);
>
> src2 = m_bestPredYuv[0]->getCbAddr(absPartIdx);
> @@ -925,7 +925,7 @@
>
> //Generate Recon
> TComPicYuv* rec = pic->getPicYuvRec();
> - int part = partitionFromSizes(cu->getCUSize(0), cu->getCUSize(0));
> + int part = partitionFromSize(cu->getCUSize(0));
> pixel* src = m_bestPredYuv[0]->getLumaAddr(absPartIdx);
> pixel* dst = rec->getLumaAddr(cu->getAddr(), absPartIdx);
> uint32_t srcstride = m_bestPredYuv[0]->getStride();
> diff -r 5134e76aa729 -r 085be1ffd4a9 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp Thu May 22 21:46:21 2014 -0500
> +++ b/source/encoder/slicetype.cpp Fri May 23 13:34:51 2014 +0900
> @@ -1604,7 +1604,7 @@
> }
> if (!fenc->bIntraCalculated)
> {
> - int nLog2SizeMinus2 = g_convertToBit[cuSize]; // partition size
> + int sizeIdx = g_convertToBit[cuSize]; // partition size
>
> pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
> pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
> @@ -1643,16 +1643,16 @@
> int predsize = cuSize * cuSize;
>
> // generate 35 intra predictions into tmp
> - primitives.intra_pred[nLog2SizeMinus2][DC_IDX](predictions, cuSize, left0, above0, 0, (cuSize <= 16));
> + primitives.intra_pred[sizeIdx][DC_IDX](predictions, cuSize, left0, above0, 0, (cuSize <= 16));
> pixel *above = (cuSize >= 8) ? above1 : above0;
> pixel *left = (cuSize >= 8) ? left1 : left0;
> - primitives.intra_pred[nLog2SizeMinus2][PLANAR_IDX](predictions + predsize, cuSize, left, above, 0, 0);
> - primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
> + primitives.intra_pred[sizeIdx][PLANAR_IDX](predictions + predsize, cuSize, left, above, 0, 0);
> + primitives.intra_pred_allangs[sizeIdx](predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
>
> // calculate 35 satd costs, keep least cost
> ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
> - primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, FENC_STRIDE);
> - pixelcmp_t satd = primitives.satd[partitionFromSizes(cuSize, cuSize)];
> + primitives.transpose[sizeIdx](buf_trans, me.fenc, FENC_STRIDE);
> + pixelcmp_t satd = primitives.satd[partitionFromSize(cuSize)];
> int icost = me.COST_MAX, cost;
> for (uint32_t mode = 0; mode < 35; mode++)
> {
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list