[x265] [PATCH] analysis: Intra picture estimation information sharing
Steve Borho
steve at borho.org
Mon Sep 15 12:40:08 CEST 2014
On 09/15, gopu at multicorewareinc.com wrote:
> # HG changeset patch
> # User Gopu Govindaswamy <gopu at multicorewareinc.com>
> # Date 1410770251 -19800
> # Mon Sep 15 14:07:31 2014 +0530
> # Node ID 9db768fa41ad927c66c1dc4ae446953862052ce4
> # Parent 184e56afa951815f4e295b4fcce094ee03361a2e
> analysis: Intra picture estimation information sharing
>
> when --analysis-mode=save - the encoder runs a full encode and dump the
> best split and mode decisions into x265_analysis.dat(default file name if file
> name is not provided) file
> when --analysis-mode=load - the encoder reads the best split and mode decisions
> from x265_analysis.dat and bypass the actual split and mode decisions, and
> therefore perform a much faster encode
>
> diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.cpp
> --- a/source/Lib/TLibCommon/TComRom.cpp Fri Sep 12 12:02:46 2014 +0530
> +++ b/source/Lib/TLibCommon/TComRom.cpp Mon Sep 15 14:07:31 2014 +0530
> @@ -505,5 +505,19 @@
> 0x38,
> };
>
> + /* Contains how much to increment shared depth buffer for different ctu sizes to get next best depth.
> + * here,
> + * depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8
> + * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 2, 3.
> + * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3.
> + * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */
The comment should be whitespace-aligned with the array, and its
lines 2 & 3 should be combined.
> +const uint32_t g_depthInc[3][4] =
> +{
> + { 16, 4, 0, 0},
> + { 64, 16, 4, 1},
> + {256, 64, 16, 4}
> +};
> +
> }
> //! \}
> diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.h
> --- a/source/Lib/TLibCommon/TComRom.h Fri Sep 12 12:02:46 2014 +0530
> +++ b/source/Lib/TLibCommon/TComRom.h Mon Sep 15 14:07:31 2014 +0530
> @@ -155,6 +155,8 @@
> // Intra tables
> extern const uint8_t g_intraFilterFlags[35];
>
> +extern const uint32_t g_depthInc[3][4];
> +
> }
>
> #endif //ifndef X265_TCOMROM_H
> diff -r 184e56afa951 -r 9db768fa41ad source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp Fri Sep 12 12:02:46 2014 +0530
> +++ b/source/encoder/analysis.cpp Mon Sep 15 14:07:31 2014 +0530
> @@ -311,14 +311,24 @@
> uint32_t numPartition = cu->getTotalNumPart();
> if (m_bestCU[0]->m_slice->m_sliceType == I_SLICE)
> {
> - compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData);
> - if (m_param->analysisMode == 1)
> + if (m_param->analysisMode == 2)
Our code should always use the X265_ANALYSIS_LOAD / X265_ANALYSIS_SAVE
macros, except when checking != 0.
> {
> - memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart());
> - memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart());
> - memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart());
> - m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr();
> - m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()] = cu->m_pic->m_POC;
> + sharedCompressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData,
> + &m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions],
> + &m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions],
> + &m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions]);
Pointer checking needs to be done at some point, probably at the frame
level. If the user doesn't allocate a buffer, we shouldn't crash.
We should probably also be setting the analysis pointers to NULL in the
input picture structure prior to returning from x265_encoder_encode() so
they do not accidentally re-use the same buffers for more than one
picture. In short, we need to be a lot more defensive about API abuses.
> + }
> + else
> + {
> + compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData);
> + if (m_param->analysisMode == 1)
> + {
> + memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart());
> + memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart());
> + memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart());
> + m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr();
> + m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()] = cu->m_pic->m_POC;
> + }
> }
> if (m_param->bLogCuStats || m_param->rc.bStatWrite)
> {
> @@ -533,7 +543,142 @@
> #endif
> }
>
> -void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu)
> +void Analysis::sharedCompressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes)
> +{
> + Frame* pic = outBestCU->m_pic;
> +
> + // if current depth == shared depth then skip further splitting.
> + bool bSubBranch = true;
> +
> + if (depth == 0)
We normally write this check as: if (!depth)
> + {
> + // offset to next best depth in sharedDepth buffer
> + m_zorder = 0;
> +
> + // index to g_depthInc array to increment m_zorder offset to next depth
> + m_ctuToDepthIndex = m_param->maxCUSize / 22;
This math is pretty magical. My guess is there's already a table
somewhere that does this more cleanly. Does this code work with
--ctu 16?
> + // get original YUV data from picture
> + m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
> + }
> + else
> + m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
> +
> + Slice* slice = outTempCU->m_slice;
> + int32_t cu_split_flag = !(cu->flags & CU::LEAF);
> + int32_t cu_unsplit_flag = !(cu->flags & CU::SPLIT_MANDATORY);
It looks like this function is recursively encoding the entire I slice
CTU. If that is the case the name should reflect that, perhaps
compressSharedIntraCTU.
> + if (cu_unsplit_flag && ((m_zorder == outBestCU->getZorderIdxInCU()) && (depth == sharedDepth[m_zorder])))
> + {
> + m_quant.setQPforQuant(outTempCU);
> + checkIntra(outBestCU, outTempCU, (PartSize)sharedPartSizes[m_zorder], cu, &sharedModes[m_zorder]);
> +
> + if (!(depth == g_maxCUDepth))
> + {
> + m_entropyCoder->resetBits();
> + m_entropyCoder->codeSplitFlag(outBestCU, 0, depth);
> + outBestCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits();
> + }
> + if (m_rdCost.m_psyRd)
> + outBestCU->m_totalPsyCost = m_rdCost.calcPsyRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits, outBestCU->m_psyEnergy);
> + else
> + outBestCU->m_totalRDCost = m_rdCost.calcRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits);
How applicable is psy-rd for I slices in the shared re-use case? Does
it influence splits or something? If it's not being used, we should
save the cycles.
Should we be measuring cost at all in the reuse case?
> + bSubBranch = false;
> +
> + // increment m_zorder offset to point to next best depth in sharedDepth buffer
> + m_zorder += g_depthInc[m_ctuToDepthIndex][sharedDepth[m_zorder]];
> + }
> +
> + // copy original YUV samples in lossless mode
> + if (outBestCU->isLosslessCoded(0))
> + fillOrigYUVBuffer(outBestCU, m_origYuv[depth]);
> +
> + // further split
> + if (cu_split_flag && bSubBranch)
> + {
> + uint32_t nextDepth = depth + 1;
> + TComDataCU* subBestPartCU = m_bestCU[nextDepth];
> + TComDataCU* subTempPartCU = m_tempCU[nextDepth];
> + for (uint32_t partUnitIdx = 0; partUnitIdx < 4; partUnitIdx++)
> + {
> + CU *child_cu = cuPicsym->m_CULocalData + cu->childIdx + partUnitIdx;
> +
> + if (child_cu->flags & CU::PRESENT)
> + {
> + int32_t qp = outTempCU->getQP(0);
> + subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> + subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init.
> + if (0 == partUnitIdx) //initialize RD with previous depth buffer
> + m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> + else
> + m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
we normally code this as:
if (partUnitIdx) // initialize RD with previous depth buffer
m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
else
m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> +
> + sharedCompressIntraCU(subBestPartCU, subTempPartCU, nextDepth, cuPicsym, child_cu, sharedDepth, sharedPartSizes, sharedModes);
> + outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); // Keep best part data to current temporary data.
> +
> + // check if cost == MAX_INT64 then current depth != sharedDepth so, current CU is not best CU
> + // set the cost to MAX_INT64 - 1 to mark it as not best CU
> + if (m_rdCost.m_psyRd && subBestPartCU->m_totalPsyCost == MAX_INT64)
> + outTempCU->m_totalPsyCost = MAX_INT64 - 1;
> + else if(subBestPartCU->m_totalRDCost == MAX_INT64)
> + outTempCU->m_totalRDCost = MAX_INT64 - 1;
> +
> + copyYuv2Tmp(subBestPartCU->getTotalNumPart() * partUnitIdx, nextDepth);
> + }
> + else
> + {
> + subBestPartCU->copyToPic(nextDepth);
> + outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth);
> +
> + // increment m_zorder offset to point to next best depth in sharedDepth buffer
> + m_zorder += g_depthInc[m_ctuToDepthIndex][sharedDepth[m_zorder]];
> + }
> + }
> + if (cu->flags & CU::PRESENT)
> + {
> + m_entropyCoder->resetBits();
> + m_entropyCoder->codeSplitFlag(outTempCU, 0, depth);
> + outTempCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits
> + }
> +
> + // check if cost is greater than (MAX_INT64 - 1)
> + if (m_rdCost.m_psyRd && outTempCU->m_totalPsyCost >= MAX_INT64)
> + outTempCU->m_totalPsyCost = m_rdCost.calcPsyRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits, outTempCU->m_psyEnergy);
> + else if (outTempCU->m_totalRDCost >= MAX_INT64)
> + outTempCU->m_totalRDCost = m_rdCost.calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
Unrelated to this patch, but now that psy-rd is stable, we can do away
with the separate cost variables. We should always be measuring rd cost
or psy-rd cost, there's no reason to keep both variables.
> + if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
> + {
> + bool hasResidual = false;
> + for (uint32_t blkIdx = 0; blkIdx < outTempCU->getTotalNumPart(); blkIdx++)
> + {
> + if (outTempCU->getCbf(blkIdx, TEXT_LUMA) || outTempCU->getCbf(blkIdx, TEXT_CHROMA_U) ||
> + outTempCU->getCbf(blkIdx, TEXT_CHROMA_V))
> + {
> + hasResidual = true;
> + break;
> + }
> + }
> +
> + uint32_t targetPartIdx = 0;
> + if (hasResidual)
> + {
> + bool foundNonZeroCbf = false;
> + outTempCU->setQPSubCUs(outTempCU->getRefQP(targetPartIdx), outTempCU, 0, depth, foundNonZeroCbf);
> + X265_CHECK(foundNonZeroCbf, "expected to find non-zero CBF\n");
> + }
> + else
> + outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
> + }
> + m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
> + checkBestMode(outBestCU, outTempCU, depth);
> + }
> + outBestCU->copyToPic(depth);
> + copyYuv2Pic(pic, outBestCU->getAddr(), outBestCU->getZorderIdxInCU(), depth);
> +}
> +
> +void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes)
> {
> //PPAScopeEvent(CheckRDCostIntra + depth);
> uint32_t depth = g_log2Size[m_param->maxCUSize] - cu->log2CUSize;
> @@ -544,7 +689,10 @@
> uint32_t tuDepthRange[2];
> outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
>
> - estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
> + if (sharedModes)
> + sharedIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes);
> + else
> + estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
>
> estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
>
> diff -r 184e56afa951 -r 9db768fa41ad source/encoder/analysis.h
> --- a/source/encoder/analysis.h Fri Sep 12 12:02:46 2014 +0530
> +++ b/source/encoder/analysis.h Mon Sep 15 14:07:31 2014 +0530
> @@ -100,6 +100,9 @@
> StatisticLog m_sliceTypeLog[3];
> StatisticLog* m_log;
>
> + uint32_t m_zorder;
> + uint32_t m_ctuToDepthIndex;
it seems like these should be derivable from existing CU fields, or
passed on the stack to sharedCompressIntraCU()
> +
> Analysis();
> bool create(uint32_t totalDepth, uint32_t maxWidth);
> void destroy();
> @@ -110,7 +113,8 @@
>
> /* Warning: The interface for these functions will undergo significant changes as a major refactor is under progress */
> void compressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu);
> - void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu);
> + void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes=NULL);
I don't generally like default args. Please update all callers instead.
> + void sharedCompressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes);
>
> void compressInterCU_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComDataCU* cu, uint32_t depth, TComDataCU* cuPicsym, CU *cu_t,
> int bInsidePicture, uint32_t partitionIndex, uint32_t minDepth);
> diff -r 184e56afa951 -r 9db768fa41ad source/encoder/search.cpp
> --- a/source/encoder/search.cpp Fri Sep 12 12:02:46 2014 +0530
> +++ b/source/encoder/search.cpp Mon Sep 15 14:07:31 2014 +0530
> @@ -1484,6 +1484,75 @@
> x265_emms();
> }
>
> +void Search::sharedIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes)
> +{
> + uint32_t depth = cu->getDepth(0);
> + uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
> + uint32_t numPU = 1 << (2 * initTrDepth);
> + uint32_t log2TrSize = cu->getLog2CUSize(0) - initTrDepth;
> + uint32_t qNumParts = cu->getTotalNumPart() >> 2;
> + uint32_t overallDistY = 0;
> + static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
this array is unused
> +
> + // loop over partitions
> + uint32_t partOffset = 0;
> + uint32_t puDistY;
> + uint64_t puCost;
> + for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
> + {
> + uint32_t bestPUMode = sharedModes[pu];
> + uint32_t bestPUDistY = 0;
these two variables both seem a bit redundant
> + cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
> +
> + // set context models
> + m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> +
> + // determine residual for partition
> + puCost = 0;
> + puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, depthRange);
> +
> + bestPUDistY = puDistY;
> + xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
> +
> + // update overall distortion
> + overallDistY += bestPUDistY;
> +
> + if (pu != numPU - 1)
> + {
> + uint32_t zorder = cu->getZorderIdxInCU() + partOffset;
> + pixel* dst = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
> + uint32_t dststride = cu->m_pic->getPicYuvRec()->getStride();
> + pixel* src = reconYuv->getLumaAddr(partOffset);
> + uint32_t srcstride = reconYuv->getStride();
> + primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
> + }
> +
> + // update PU data
> + cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
is this call redundant?
> + cu->copyToPic((uint8_t)depth, pu, initTrDepth);
> + }
> +
> + if (numPU > 1)
> + {
> + // set Cbf for all blocks
> + uint32_t combCbfY = 0;
> + uint32_t partIdx = 0;
> + for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
> + combCbfY |= cu->getCbf(partIdx, TEXT_LUMA, 1);
> +
> + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
> + cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
> +
Whitespace: the blank line above should be removed.
> + }
> +
> + // reset context models
> + m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
> +
> + // set distortion (rate and r-d costs are determined later)
> + cu->m_totalDistortion = overallDistY;
cu->m_totalDistortion could be updated within the loop directly
> +}
> +
> void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv)
> {
> uint32_t depth = cu->getDepth(0);
> diff -r 184e56afa951 -r 9db768fa41ad source/encoder/search.h
> --- a/source/encoder/search.h Fri Sep 12 12:02:46 2014 +0530
> +++ b/source/encoder/search.h Mon Sep 15 14:07:31 2014 +0530
> @@ -109,6 +109,7 @@
> bool initSearch(x265_param *param, ScalingList& scalingList);
>
> void estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
> + void sharedIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes);
> void estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
>
> // estimation inter prediction (non-skip)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list