[x265] [PATCH] analysis: Intra picture estimation information sharing

gopu at multicorewareinc.com gopu at multicorewareinc.com
Mon Sep 15 10:38:19 CEST 2014


# HG changeset patch
# User Gopu Govindaswamy <gopu at multicorewareinc.com>
# Date 1410770251 -19800
#      Mon Sep 15 14:07:31 2014 +0530
# Node ID 9db768fa41ad927c66c1dc4ae446953862052ce4
# Parent  184e56afa951815f4e295b4fcce094ee03361a2e
analysis: Intra picture estimation information sharing

when --analysis-mode=save - the encoder runs a full encode and dumps the
best split and mode decisions into the x265_analysis.dat file (the default
file name if no file name is provided)
when --analysis-mode=load - the encoder reads the best split and mode decisions
from x265_analysis.dat and bypasses the actual split and mode decisions,
thereby performing a much faster encode

diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp	Fri Sep 12 12:02:46 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.cpp	Mon Sep 15 14:07:31 2014 +0530
@@ -505,5 +505,19 @@
     0x38, 
 };
 
+    /* Step, in shared depth buffer entries, from one coded CU to the next: g_depthInc[row][depth].
+     * row is 0, 1, 2 for ctu = 16, 32, 64 and depth is the value stored in the shared depth
+     * buffer, where depth 0 is the whole ctu and each deeper level covers a quarter as many entries.
+     * if ctu = 64, depth buffer size is 256 and depths 0, 1, 2, 3 step by 256, 64, 16, 4.
+     * if ctu = 32, depth buffer size is 64 and depths 0, 1, 2 step by 64, 16, 4.
+     * if ctu = 16, depth buffer size is 16 and depths 0, 1 step by 16, 4 (remaining entries look unused - TODO confirm) */
+
+const uint32_t g_depthInc[3][4] =
+{
+    { 16,  4,  0, 0},
+    { 64, 16,  4, 1},
+    {256, 64, 16, 4}
+};
+
 }
 //! \}
diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h	Fri Sep 12 12:02:46 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h	Mon Sep 15 14:07:31 2014 +0530
@@ -155,6 +155,8 @@
 // Intra tables
 extern const uint8_t g_intraFilterFlags[35];
 
+extern const uint32_t g_depthInc[3][4];
+
 }
 
 #endif  //ifndef X265_TCOMROM_H
diff -r 184e56afa951 -r 9db768fa41ad source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Fri Sep 12 12:02:46 2014 +0530
+++ b/source/encoder/analysis.cpp	Mon Sep 15 14:07:31 2014 +0530
@@ -311,14 +311,24 @@
     uint32_t numPartition = cu->getTotalNumPart();
     if (m_bestCU[0]->m_slice->m_sliceType == I_SLICE)
     {
-        compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData);
-        if (m_param->analysisMode == 1)
+        if (m_param->analysisMode == 2)
         {
-            memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart());
-            memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart());
-            memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart());
-            m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr();
-            m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()]    = cu->m_pic->m_POC;
+            sharedCompressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData, 
+                &m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions],
+                &m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions],
+                &m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions]);
+        }
+        else
+        {
+            compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData);
+            if (m_param->analysisMode == 1)
+            {
+                memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart());
+                memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart());
+                memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart());
+                m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr();
+                m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()]    = cu->m_pic->m_POC;
+            }
         }
         if (m_param->bLogCuStats || m_param->rc.bStatWrite)
         {
@@ -533,7 +543,142 @@
 #endif
 }
 
-void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu)
+void Analysis::sharedCompressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes)
+{
+    Frame* pic = outBestCU->m_pic;
+
+    // Intra CU recursion that replays shared decisions: a CU is coded (and not split further) once depth matches sharedDepth.
+    bool bSubBranch = true; // cleared below when this CU is coded at the shared depth
+
+    if (depth == 0)
+    {
+        // top-level call: reset the running offset into the sharedDepth/sharedPartSizes/sharedModes buffers
+        m_zorder = 0;
+
+        // row of g_depthInc used to advance m_zorder; integer division by 22 gives 0, 1, 2 for maxCUSize 16, 32, 64
+        m_ctuToDepthIndex = m_param->maxCUSize / 22;
+
+        // get original YUV data from picture
+        m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU());
+    }
+    else
+        m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
+
+    Slice* slice = outTempCU->m_slice;
+    int32_t cu_split_flag = !(cu->flags & CU::LEAF); // nonzero unless CU::LEAF is set
+    int32_t cu_unsplit_flag = !(cu->flags & CU::SPLIT_MANDATORY); // nonzero unless CU::SPLIT_MANDATORY is set
+
+    if (cu_unsplit_flag && ((m_zorder == outBestCU->getZorderIdxInCU()) && (depth == sharedDepth[m_zorder])))
+    {
+        m_quant.setQPforQuant(outTempCU);
+        checkIntra(outBestCU, outTempCU, (PartSize)sharedPartSizes[m_zorder], cu, &sharedModes[m_zorder]);
+
+        if (!(depth == g_maxCUDepth))
+        {
+            m_entropyCoder->resetBits();
+            m_entropyCoder->codeSplitFlag(outBestCU, 0, depth);
+            outBestCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits();
+        }
+        if (m_rdCost.m_psyRd)
+            outBestCU->m_totalPsyCost = m_rdCost.calcPsyRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits, outBestCU->m_psyEnergy);
+        else
+            outBestCU->m_totalRDCost  = m_rdCost.calcRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits);
+
+        bSubBranch = false;
+
+        // advance m_zorder by the g_depthInc step for this depth so it indexes the next entry in the sharedDepth buffer
+        m_zorder += g_depthInc[m_ctuToDepthIndex][sharedDepth[m_zorder]];
+    }
+
+    // copy original YUV samples in lossless mode
+    if (outBestCU->isLosslessCoded(0))
+        fillOrigYUVBuffer(outBestCU, m_origYuv[depth]);
+
+    // recurse into the four sub-CUs unless this CU was already coded at the shared depth above
+    if (cu_split_flag && bSubBranch)
+    {
+        uint32_t    nextDepth     = depth + 1;
+        TComDataCU* subBestPartCU = m_bestCU[nextDepth];
+        TComDataCU* subTempPartCU = m_tempCU[nextDepth];
+        for (uint32_t partUnitIdx = 0; partUnitIdx < 4; partUnitIdx++)
+        {
+            CU *child_cu = cuPicsym->m_CULocalData + cu->childIdx + partUnitIdx;
+
+            if (child_cu->flags & CU::PRESENT)
+            {
+                int32_t qp = outTempCU->getQP(0);
+                subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear or initialize sub-partition data
+                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear or initialize sub-partition data
+                if (0 == partUnitIdx) // first sub-CU: load the CI_CURR_BEST state of the current depth
+                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+                else
+                    m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]);
+
+                sharedCompressIntraCU(subBestPartCU, subTempPartCU, nextDepth, cuPicsym, child_cu, sharedDepth, sharedPartSizes, sharedModes);
+                outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); // keep the sub-CU's best data in the current temporary CU
+
+                // a sub-CU cost still equal to MAX_INT64 means its depth != sharedDepth, so the current CU is not the best CU;
+                // set the cost to MAX_INT64 - 1 to mark it as not best CU
+                if (m_rdCost.m_psyRd && subBestPartCU->m_totalPsyCost == MAX_INT64)
+                    outTempCU->m_totalPsyCost = MAX_INT64 - 1;
+                else if(subBestPartCU->m_totalRDCost == MAX_INT64)
+                    outTempCU->m_totalRDCost = MAX_INT64 - 1;
+
+                copyYuv2Tmp(subBestPartCU->getTotalNumPart() * partUnitIdx, nextDepth);
+            }
+            else
+            {
+                subBestPartCU->copyToPic(nextDepth);
+                outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth);
+
+                // sub-CU is not PRESENT: still advance m_zorder by the g_depthInc step to the next entry in the sharedDepth buffer
+                m_zorder += g_depthInc[m_ctuToDepthIndex][sharedDepth[m_zorder]];
+            }
+        }
+        if (cu->flags & CU::PRESENT)
+        {
+            m_entropyCoder->resetBits();
+            m_entropyCoder->codeSplitFlag(outTempCU, 0, depth);
+            outTempCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits
+        }
+
+        // compute the cost only if it is still >= MAX_INT64, i.e. it was not marked as not best (MAX_INT64 - 1) above
+        if (m_rdCost.m_psyRd && outTempCU->m_totalPsyCost >= MAX_INT64)
+            outTempCU->m_totalPsyCost = m_rdCost.calcPsyRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits, outTempCU->m_psyEnergy);
+        else if (outTempCU->m_totalRDCost >= MAX_INT64)
+            outTempCU->m_totalRDCost = m_rdCost.calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
+
+        if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
+        {
+            bool hasResidual = false;
+            for (uint32_t blkIdx = 0; blkIdx < outTempCU->getTotalNumPart(); blkIdx++)
+            {
+                if (outTempCU->getCbf(blkIdx, TEXT_LUMA) || outTempCU->getCbf(blkIdx, TEXT_CHROMA_U) ||
+                    outTempCU->getCbf(blkIdx, TEXT_CHROMA_V))
+                {
+                    hasResidual = true;
+                    break;
+                }
+            }
+
+            uint32_t targetPartIdx = 0;
+            if (hasResidual)
+            {
+                bool foundNonZeroCbf = false;
+                outTempCU->setQPSubCUs(outTempCU->getRefQP(targetPartIdx), outTempCU, 0, depth, foundNonZeroCbf);
+                X265_CHECK(foundNonZeroCbf, "expected to find non-zero CBF\n");
+            }
+            else
+                outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
+        }
+        m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+        checkBestMode(outBestCU, outTempCU, depth);
+    }
+    outBestCU->copyToPic(depth);
+    copyYuv2Pic(pic, outBestCU->getAddr(), outBestCU->getZorderIdxInCU(), depth);
+}
+
+void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes)
 {
     //PPAScopeEvent(CheckRDCostIntra + depth);
     uint32_t depth = g_log2Size[m_param->maxCUSize] - cu->log2CUSize;
@@ -544,7 +689,10 @@
     uint32_t tuDepthRange[2];
     outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);
 
-    estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
+    if (sharedModes)
+        sharedIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes);
+    else
+        estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange);
 
     estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
 
diff -r 184e56afa951 -r 9db768fa41ad source/encoder/analysis.h
--- a/source/encoder/analysis.h	Fri Sep 12 12:02:46 2014 +0530
+++ b/source/encoder/analysis.h	Mon Sep 15 14:07:31 2014 +0530
@@ -100,6 +100,9 @@
     StatisticLog  m_sliceTypeLog[3];
     StatisticLog* m_log;
 
+    uint32_t      m_zorder;
+    uint32_t      m_ctuToDepthIndex;
+
     Analysis();
     bool create(uint32_t totalDepth, uint32_t maxWidth);
     void destroy();
@@ -110,7 +113,8 @@
 
     /* Warning: The interface for these functions will undergo significant changes as a major refactor is under progress */
     void compressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu);
-    void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu);
+    void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes=NULL);
+    void sharedCompressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes);
 
     void compressInterCU_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComDataCU* cu, uint32_t depth, TComDataCU* cuPicsym, CU *cu_t,
                                int bInsidePicture, uint32_t partitionIndex, uint32_t minDepth);
diff -r 184e56afa951 -r 9db768fa41ad source/encoder/search.cpp
--- a/source/encoder/search.cpp	Fri Sep 12 12:02:46 2014 +0530
+++ b/source/encoder/search.cpp	Mon Sep 15 14:07:31 2014 +0530
@@ -1484,6 +1484,75 @@
     x265_emms();
 }
 
+void Search::sharedIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes)
+{
+    uint32_t depth        = cu->getDepth(0);
+    uint32_t initTrDepth  = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
+    uint32_t numPU        = 1 << (2 * initTrDepth); // 1 PU for SIZE_2Nx2N, otherwise 4
+    uint32_t log2TrSize   = cu->getLog2CUSize(0) - initTrDepth;
+    uint32_t qNumParts    = cu->getTotalNumPart() >> 2; // a quarter of the CU's partitions, the partOffset step per PU
+    uint32_t overallDistY = 0;
+    static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64 - NOTE(review): not referenced in this function
+
+    // loop over partitions, coding each PU with the luma mode supplied in sharedModes (no mode search is done here)
+    uint32_t partOffset = 0;
+    uint32_t puDistY;
+    uint64_t puCost;
+    for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts)
+    {
+        uint32_t bestPUMode = sharedModes[pu]; // NOTE(review): indexed by pu, not partOffset - same entry only when qNumParts == 1; confirm
+        uint32_t bestPUDistY = 0;
+
+        cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
+
+        // set context models
+        m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+
+        // determine residual for partition
+        puCost = 0;
+        puDistY = xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, depthRange);
+
+        bestPUDistY = puDistY;
+        xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv);
+
+        // update overall distortion
+        overallDistY += bestPUDistY;
+
+        if (pu != numPU - 1) // every PU except the last: copy its luma from reconYuv into the reconstructed picture
+        {
+            uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
+            pixel*   dst         = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
+            uint32_t dststride   = cu->m_pic->getPicYuvRec()->getStride();
+            pixel*   src         = reconYuv->getLumaAddr(partOffset);
+            uint32_t srcstride   = reconYuv->getStride();
+            primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+        }
+
+        // update PU data
+        cu->setLumaIntraDirSubParts(bestPUMode, partOffset, depth + initTrDepth);
+        cu->copyToPic((uint8_t)depth, pu, initTrDepth);
+    }
+
+    if (numPU > 1)
+    {
+        // combine the luma Cbf of the four PUs and OR the result into every partition of the CU
+        uint32_t combCbfY = 0;
+        uint32_t partIdx  = 0;
+        for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
+            combCbfY |= cu->getCbf(partIdx, TEXT_LUMA,     1);
+
+        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+            cu->getCbf(TEXT_LUMA)[offs] |= combCbfY;
+
+    }
+
+    // reset context models
+    m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+
+    // set distortion (rate and r-d costs are determined later)
+    cu->m_totalDistortion = overallDistY;
+}
+
 void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv)
 {
     uint32_t depth   = cu->getDepth(0);
diff -r 184e56afa951 -r 9db768fa41ad source/encoder/search.h
--- a/source/encoder/search.h	Fri Sep 12 12:02:46 2014 +0530
+++ b/source/encoder/search.h	Mon Sep 15 14:07:31 2014 +0530
@@ -109,6 +109,7 @@
     bool initSearch(x265_param *param, ScalingList& scalingList);
 
     void estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]);
+    void sharedIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes);
     void estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv);
 
     // estimation inter prediction (non-skip)


More information about the x265-devel mailing list