[x265] [PATCH] analysis: Intra picture estimation information sharing
gopu at multicorewareinc.com
gopu at multicorewareinc.com
Tue Sep 16 10:48:30 CEST 2014
# HG changeset patch
# User Gopu Govindaswamy <gopu at multicorewareinc.com>
# Date 1410857300 -19800
# Tue Sep 16 14:18:20 2014 +0530
# Node ID 61dc8322e6c0af444ba591755c299b945e1e423a
# Parent 1de67321275e70d510f0df3d5b7d4b9d391a1e66
analysis: Intra picture estimation information sharing
when --analysis-mode=save - the encoder runs a full encode and dumps the
best split and mode decisions into the x265_analysis.dat file (the default
file name, used if no file name is provided)
when --analysis-mode=load - the encoder reads the best split and mode decisions
from x265_analysis.dat and bypasses the actual split and mode decisions, and
therefore performs a much faster encode
diff -r 1de67321275e -r 61dc8322e6c0 source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h Mon Sep 15 15:00:13 2014 +0200
+++ b/source/Lib/TLibCommon/CommonDef.h Tue Sep 16 14:18:20 2014 +0530
@@ -100,4 +100,6 @@
#define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422)
#define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
+#define CTU_TO_DEPTH_INDEX 22 // divisor: maxCUSize / 22 (integer division) maps CTU size 16, 32, 64 to row 0, 1, 2 of g_depthInc
+
#endif // ifndef X265_COMMONDEF_H
diff -r 1de67321275e -r 61dc8322e6c0 source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Mon Sep 15 15:00:13 2014 +0200
+++ b/source/Lib/TLibCommon/TComRom.cpp Tue Sep 16 14:18:20 2014 +0530
@@ -505,5 +505,18 @@
0x38,
};
+ /* g_depthInc[row][depth]: amount added to the zOrder offset into the shared depth buffer after a CU at 'depth'
+ * row = maxCUSize / CTU_TO_DEPTH_INDEX, i.e. 0, 1, 2 for ctu = 16, 32, 64; column = CU depth counted from the CTU
+ * if ctu = 64, depth buffer size is 256 per CTU and a depth 0 CU steps over all 256 entries
+ * if ctu = 32, depth buffer size is 64 per CTU; if ctu = 16, depth buffer size is 16 per CTU
+ * each deeper level steps a quarter of the previous one; NOTE(review): the 0 entries look like unused depths - confirm */
+
+const uint32_t g_depthInc[3][4] =
+{
+ { 16, 4, 0, 0},
+ { 64, 16, 4, 1},
+ {256, 64, 16, 4}
+};
+
}
//! \}
diff -r 1de67321275e -r 61dc8322e6c0 source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h Mon Sep 15 15:00:13 2014 +0200
+++ b/source/Lib/TLibCommon/TComRom.h Tue Sep 16 14:18:20 2014 +0530
@@ -155,6 +155,8 @@
// Intra tables
extern const uint8_t g_intraFilterFlags[35];
+extern const uint32_t g_depthInc[3][4];
+
}
#endif //ifndef X265_TCOMROM_H
diff -r 1de67321275e -r 61dc8322e6c0 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Mon Sep 15 15:00:13 2014 +0200
+++ b/source/encoder/analysis.cpp Tue Sep 16 14:18:20 2014 +0530
@@ -311,14 +311,25 @@
uint32_t numPartition = cu->getTotalNumPart();
if (m_bestCU[0]->m_slice->m_sliceType == I_SLICE)
{
- compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData);
- if (m_param->analysisMode == 1)
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_bestCU[0]->m_pic->m_intraData)
{
- memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart());
- memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart());
- memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart());
- m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr();
- m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()] = cu->m_pic->m_POC;
+ uint32_t zOrder = 0;
+ compressSharedIntraCTU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData,
+ &m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions],
+ &m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions],
+ &m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], zOrder);
+ }
+ else
+ {
+ compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData);
+ if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_bestCU[0]->m_pic->m_intraData)
+ {
+ memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart());
+ memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart());
+ memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart());
+ m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr();
+ m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()] = cu->m_pic->m_POC;
+ }
}
if (m_param->bLogCuStats || m_param->rc.bStatWrite)
{
@@ -424,9 +435,9 @@
if (cu_unsplit_flag)
{
m_quant.setQPforQuant(outTempCU);
- checkIntra(outBestCU, outTempCU, SIZE_2Nx2N, cu);
+ checkIntra(outBestCU, outTempCU, SIZE_2Nx2N, cu, NULL);
if (depth == g_maxCUDepth)
- checkIntra(outBestCU, outTempCU, SIZE_NxN, cu);
+ checkIntra(outBestCU, outTempCU, SIZE_NxN, cu, NULL);
else
{
m_entropyCoder->resetBits();
@@ -533,7 +544,141 @@
#endif
}
-void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu)
+void Analysis::compressSharedIntraCTU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes, uint32_t &zOrder)
+{
+ Frame* pic = outBestCU->m_pic;
+ // Replays saved intra decisions: sharedDepth/sharedPartSizes/sharedModes are read at zOrder instead of being searched.
+ // cleared once the saved depth matches the current depth, which stops any further splitting of this CU
+ bool bSubBranch = true;
+
+ // row of g_depthInc for this CTU size; the selected entry is what advances zOrder past a finished CU
+ int32_t ctuToDepthIndex = m_param->maxCUSize / CTU_TO_DEPTH_INDEX;
+
+ if (!depth)
+ m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
+ else
+ m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
+
+ Slice* slice = outTempCU->m_slice;
+ int32_t cu_split_flag = !(cu->flags & CU::LEAF);
+ int32_t cu_unsplit_flag = !(cu->flags & CU::SPLIT_MANDATORY);
+ // the saved decision applies only when zOrder points at this CU and the saved depth equals the current depth
+ if (cu_unsplit_flag && ((zOrder == outBestCU->getZorderIdxInCU()) && (depth == sharedDepth[zOrder])))
+ {
+ m_quant.setQPforQuant(outTempCU);
+ checkIntra(outBestCU, outTempCU, (PartSize)sharedPartSizes[zOrder], cu, &sharedModes[zOrder]);
+
+ if (!(depth == g_maxCUDepth))
+ {
+ m_entropyCoder->resetBits();
+ m_entropyCoder->codeSplitFlag(outBestCU, 0, depth);
+ outBestCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits();
+ }
+
+ // a cost of 0 is used as a marker: this CU was taken from the shared CU data and is the best CU
+ outBestCU->m_totalRDCost = 0;
+ bSubBranch = false;
+
+ // advance zOrder by the g_depthInc step for the saved depth so it points at the next saved decision
+ zOrder += g_depthInc[ctuToDepthIndex][sharedDepth[zOrder]];
+ }
+
+ // copy original YUV samples in lossless mode
+ if (outBestCU->isLosslessCoded(0))
+ fillOrigYUVBuffer(outBestCU, m_origYuv[depth]);
+
+ // saved decision not applied at this depth and the CU is not a leaf: recurse into the four sub-CUs
+ if (cu_split_flag && bSubBranch)
+ {
+ uint32_t nextDepth = depth + 1;
+ TComDataCU* subBestPartCU = m_bestCU[nextDepth];
+ TComDataCU* subTempPartCU = m_tempCU[nextDepth];
+ for (uint32_t partUnitIdx = 0; partUnitIdx < 4; partUnitIdx++)
+ {
+ CU *child_cu = cuPicsym->m_CULocalData + cu->childIdx + partUnitIdx;
+
+ if (child_cu->flags & CU::PRESENT)
+ {
+ int32_t qp = outTempCU->getQP(0);
+ subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear or initialize the sub partition data
+ subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear or initialize the sub partition data
+
+ if (partUnitIdx) // sub-CUs after the first continue from the CI_NEXT_BEST state of nextDepth
+ m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
+ else
+ m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+
+ // a non-zero cost (1) marks the sub-CU as not taken from the shared data by default
+ subTempPartCU->m_totalRDCost = 1;
+
+ compressSharedIntraCTU(subBestPartCU, subTempPartCU, nextDepth, cuPicsym, child_cu, sharedDepth, sharedPartSizes, sharedModes, zOrder);
+ outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); // copy the best sub-CU data into the current temporary CU
+
+ if(!subBestPartCU->m_totalRDCost) // a cost of 0 marks a CU taken from the shared data
+ outTempCU->m_totalRDCost = 0; // propagate the marker so the later best-mode check uses outTempCU as best CU
+
+ copyYuv2Tmp(subBestPartCU->getTotalNumPart() * partUnitIdx, nextDepth);
+ }
+ else
+ {
+ subBestPartCU->copyToPic(nextDepth);
+ outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth);
+
+ // sub-CU without the CU::PRESENT flag: still advance zOrder by the g_depthInc step for nextDepth
+ zOrder += g_depthInc[ctuToDepthIndex][nextDepth];
+ }
+ }
+
+ if (cu->flags & CU::PRESENT)
+ {
+ m_entropyCoder->resetBits();
+ m_entropyCoder->codeSplitFlag(outTempCU, 0, depth);
+ outTempCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits
+ }
+ if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
+ {
+ bool hasResidual = false;
+ for (uint32_t blkIdx = 0; blkIdx < outTempCU->getTotalNumPart(); blkIdx++)
+ {
+ if (outTempCU->getCbf(blkIdx, TEXT_LUMA) || outTempCU->getCbf(blkIdx, TEXT_CHROMA_U) ||
+ outTempCU->getCbf(blkIdx, TEXT_CHROMA_V))
+ {
+ hasResidual = true;
+ break;
+ }
+ }
+
+ uint32_t targetPartIdx = 0;
+ if (hasResidual)
+ {
+ bool foundNonZeroCbf = false;
+ outTempCU->setQPSubCUs(outTempCU->getRefQP(targetPartIdx), outTempCU, 0, depth, foundNonZeroCbf);
+ X265_CHECK(foundNonZeroCbf, "expected to find non-zero CBF\n");
+ }
+ else
+ outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // no residual: set the QP to the reference QP
+ }
+ m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+ checkBestMode(outBestCU, outTempCU, depth);
+ }
+ outBestCU->copyToPic(depth);
+ copyYuv2Pic(pic, outBestCU->getAddr(), outBestCU->getZorderIdxInCU(), depth);
+
+#if CHECKED_BUILD || _DEBUG
+ X265_CHECK(outBestCU->getPartitionSize(0) != SIZE_NONE, "no best partition size\n");
+ X265_CHECK(outBestCU->getPredictionMode(0) != MODE_NONE, "no best partition mode\n");
+ if (m_rdCost.m_psyRd)
+ {
+ X265_CHECK(outBestCU->m_totalPsyCost != MAX_INT64, "no best partition cost\n");
+ }
+ else
+ {
+ X265_CHECK(outBestCU->m_totalRDCost != MAX_INT64, "no best partition cost\n");
+ }
+#endif
+}
+
+void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes)
{
//PPAScopeEvent(CheckRDCostIntra + depth);
uint32_t depth = g_log2Size[m_param->maxCUSize] - cu->log2CUSize;
@@ -544,7 +689,10 @@
uint32_t tuDepthRange[2];
outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
- estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
+ if (sharedModes)
+ sharedEstIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes);
+ else
+ estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
diff -r 1de67321275e -r 61dc8322e6c0 source/encoder/analysis.h
--- a/source/encoder/analysis.h Mon Sep 15 15:00:13 2014 +0200
+++ b/source/encoder/analysis.h Tue Sep 16 14:18:20 2014 +0530
@@ -110,7 +110,8 @@
/* Warning: The interface for these functions will undergo significant changes as a major refactor is under progress */
void compressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu);
- void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu);
+ void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes);
+ void compressSharedIntraCTU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes, uint32_t &zOrder);
void compressInterCU_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComDataCU* cu, uint32_t depth, TComDataCU* cuPicsym, CU *cu_t,
int bInsidePicture, uint32_t partitionIndex, uint32_t minDepth);
diff -r 1de67321275e -r 61dc8322e6c0 source/encoder/search.cpp
--- a/source/encoder/search.cpp Mon Sep 15 15:00:13 2014 +0200
+++ b/source/encoder/search.cpp Tue Sep 16 14:18:20 2014 +0530
@@ -1393,6 +1393,61 @@
x265_emms();
}
+void Search::sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes)
+{
+ uint32_t depth = cu->getDepth(0);
+ uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
+ uint32_t numPU = 1 << (2 * initTrDepth);
+ uint32_t log2TrSize = cu->getLog2CUSize(0) - initTrDepth;
+ uint32_t qNumParts = cu->getTotalNumPart() >> 2;
+ // Codes each luma PU with the intra mode supplied in sharedModes[pu]; no mode search is performed in this function.
+ // loop over partitions (1 PU for SIZE_2Nx2N, otherwise 4)
+ uint32_t partOffset = 0;
+ uint64_t puCost = 0;
+ uint32_t bits = 0;
+ uint32_t dststride = cu->m_pic->getPicYuvRec()->getStride();
+ uint32_t srcstride = reconYuv->getStride();
+
+ for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
+ {
+ cu->setLumaIntraDirSubParts(sharedModes[pu], partOffset, depth + initTrDepth);
+ // NOTE(review): sharedModes is indexed by pu but the CU data by partOffset; they coincide only if qNumParts == 1 - confirm
+ // load the CI_CURR_BEST context models of this depth into the entropy coder
+ m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+
+ // update overall distortion (rate and r-d costs are determined later)
+ cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, bits, depthRange);
+ xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
+ // all but the last PU: copy the reconstructed luma block into the picture's recon buffer
+ if (pu != numPU - 1)
+ {
+ uint32_t zorder = cu->getZorderIdxInCU() + partOffset;
+ pixel* dst = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
+ pixel* src = reconYuv->getLumaAddr(partOffset);
+ primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+ }
+
+ // update PU data: set the shared mode again and copy this PU's data to the picture
+ cu->setLumaIntraDirSubParts(sharedModes[pu], partOffset, depth + initTrDepth);
+ cu->copyToPic((uint8_t)depth, pu, initTrDepth);
+ }
+
+ if (numPU > 1)
+ {
+ // combine the luma CBF of the four parts and OR the result into every partition of the CU
+ uint32_t combCbfY = 0;
+ uint32_t partIdx = 0;
+ for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
+ combCbfY |= cu->getCbf(partIdx, TEXT_LUMA, 1);
+
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
+ }
+
+ // reset context models to the CI_CURR_BEST state of this depth
+ m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+}
+
void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv)
{
uint32_t depth = cu->getDepth(0);
diff -r 1de67321275e -r 61dc8322e6c0 source/encoder/search.h
--- a/source/encoder/search.h Mon Sep 15 15:00:13 2014 +0200
+++ b/source/encoder/search.h Tue Sep 16 14:18:20 2014 +0530
@@ -80,6 +80,7 @@
bool initSearch(x265_param *param, ScalingList& scalingList);
void estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
+ void sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes);
void estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
// estimation inter prediction (non-skip)
More information about the x265-devel
mailing list