[x265] [PATCH 2 of 2] implement QP-based RD refinement

Kavitha Sampath kavitha at multicorewareinc.com
Sun Dec 27 12:22:30 CET 2015


# HG changeset patch
# User Kavitha Sampath <kavitha at multicorewareinc.com>
# Date 1450765749 -19800
#      Tue Dec 22 11:59:09 2015 +0530
# Node ID 4321eec92de66588c35ac8523e76858a7f80294d
# Parent  a161c580095a1ae08813912c0dbc9838695a7800
implement QP-based RD refinement

After CU analysis, re-evaluate the R-D cost of the best partition mode
over a range of QPs around the chosen QP and keep the QP that gives the
lowest cost, so that the rounding effect of quantization is exploited.
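For clarity, the search strategy is summarised in the sketch below. It is
illustrative only: refineQP() and evaluateCost() do not exist in the patch;
evaluateCost() stands in for recodeCU() plus reading md.bestMode->rdCost.
The +/-2 step, the QP_MIN..QP_MAX_SPEC bounds and the failure threshold
mirror the behaviour of qprdRefine() in this patch.

#include <cstdint>
#include <functional>

static const int QP_MIN = 0;
static const int QP_MAX_SPEC = 51;

/* Search QPs around 'qp' for the one with the lowest RD cost. 'origCost' is
 * the cost cached for this CU during analysis; 'evaluateCost' re-encodes the
 * CU at a candidate QP and returns its RD cost. */
int refineQP(int qp, uint64_t origCost, const std::function<uint64_t(int)>& evaluateCost)
{
    int bestQP = qp;
    uint64_t bestCost = origCost;

    /* probe upward (dir = +2), then downward (dir = -2) from the analysis QP */
    for (int dir = 2; dir >= -2; dir -= 4)
    {
        const int threshold = 1;  /* allow one non-improving step per direction */
        int failures = 0;
        uint64_t prevCost = origCost;

        for (int modQP = qp + dir; modQP >= QP_MIN && modQP <= QP_MAX_SPEC; modQP += dir)
        {
            uint64_t cost = evaluateCost(modQP);

            if (cost < bestCost)
            {
                bestCost = cost;
                bestQP = modQP;
            }

            failures = (cost < prevCost) ? 0 : failures + 1;
            if (failures > threshold)
                break;            /* two consecutive non-improving steps: stop */

            prevCost = cost;
        }
    }
    return bestQP;                /* the CU is finally re-coded at this QP */
}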

diff -r a161c580095a -r 4321eec92de6 source/common/cudata.cpp
--- a/source/common/cudata.cpp	Fri Dec 11 14:53:49 2015 +0530
+++ b/source/common/cudata.cpp	Tue Dec 22 11:59:09 2015 +0530
@@ -480,7 +480,7 @@
 }
 
 /* The reverse of copyToPic, called only by encodeResidue */
-void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp)
+void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp)
 {
     m_encData       = ctu.m_encData;
     m_slice         = ctu.m_slice;
@@ -491,7 +491,8 @@
     m_numPartitions = cuGeom.numPartitions;
 
     /* copy out all prediction info for this part */
-    m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
+    if (copyQp) m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
+
     m_partCopy(m_log2CUSize,   ctu.m_log2CUSize + m_absIdxInCTU);
     m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
     m_partCopy(m_tqBypass,     ctu.m_tqBypass + m_absIdxInCTU);
diff -r a161c580095a -r 4321eec92de6 source/common/cudata.h
--- a/source/common/cudata.h	Fri Dec 11 14:53:49 2015 +0530
+++ b/source/common/cudata.h	Tue Dec 22 11:59:09 2015 +0530
@@ -222,7 +222,7 @@
     void     copyToPic(uint32_t depth) const;
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp = true);
     void     updatePic(uint32_t depth) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
diff -r a161c580095a -r 4321eec92de6 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Fri Dec 11 14:53:49 2015 +0530
+++ b/source/encoder/analysis.cpp	Tue Dec 22 11:59:09 2015 +0530
@@ -83,6 +83,12 @@
     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
     m_bChromaSa8d = m_param->rdLevel >= 3;
 
+    int costArrSize = 1;
+    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
+    for (uint32_t i = 1; i <= maxDQPDepth; i++)
+        costArrSize += (1 << (i * 2));
+    cacheCost = X265_MALLOC(uint64_t, costArrSize);
+
     int csp = m_param->internalCsp;
     uint32_t cuSize = g_maxCUSize;
 
@@ -119,6 +125,7 @@
             m_modeDepth[i].pred[j].reconYuv.destroy();
         }
     }
+    X265_FREE(cacheCost);
 }
 
 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
@@ -201,6 +208,9 @@
         }
     }
 
+    if (m_param->bEnableRdRefine)
+        qprdRefine(ctu, cuGeom, qp, qp);
+
     return *m_modeDepth[0].bestMode;
 }
 
@@ -229,6 +239,61 @@
     }
 }
 
+void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
+{
+    uint32_t depth = cuGeom.depth;
+    ModeDepth& md = m_modeDepth[depth];
+    md.bestMode = NULL;
+
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
+
+    int bestCUQP = qp;
+    int lambdaQP = lqp;
+
+    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
+
+    if (doQPRefine)
+    {
+        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
+
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
+        bestCUCost = origCUCost = cacheCost[cuIdx];
+
+        for (int dir = 2; dir >= -2; dir -= 4)
+        {
+            int threshold = 1;
+            int failure = 0;
+            cuPrevCost = origCUCost;
+
+            int modCUQP = qp + dir;
+            while (modCUQP >= QP_MIN && modCUQP <= QP_MAX_SPEC)
+            {
+                recodeCU(parentCTU, cuGeom, modCUQP, qp);
+                cuCost = md.bestMode->rdCost;
+
+                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
+                if (cuCost < cuPrevCost)
+                    failure = 0;
+                else
+                    failure++;
+
+                if (failure > threshold)
+                    break;
+
+                cuPrevCost = cuCost;
+                modCUQP += dir;
+            }
+        }
+        lambdaQP = bestCUQP;
+    }
+
+    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
+
+    /* Copy best data to encData CTU and recon */
+    md.bestMode->cu.copyToPic(depth);
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
+}
+
 void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
@@ -334,6 +399,12 @@
         checkBestMode(*splitPred, depth);
     }
 
+    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
+    {
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
+        cacheCost[cuIdx] = md.bestMode->rdCost;
+    }
+
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
@@ -1611,6 +1682,12 @@
     if (mightSplit && !foundSkip)
         checkBestMode(md.pred[PRED_SPLIT], depth);
 
+    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
+    {
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
+        cacheCost[cuIdx] = md.bestMode->rdCost;
+    }
+
        /* determine which motion references the parent CU should search */
     SplitData splitCUData;
     if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
@@ -1647,6 +1724,110 @@
     return splitCUData;
 }
 
+void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
+{
+    uint32_t depth = cuGeom.depth;
+    ModeDepth& md = m_modeDepth[depth];
+    md.bestMode = NULL;
+
+    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
+    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
+
+    if (bDecidedDepth)
+    {
+        setLambdaFromQP(parentCTU, qp, lqp);
+
+        Mode& mode = md.pred[0];
+        md.bestMode = &mode;
+        mode.cu.initSubCU(parentCTU, cuGeom, qp);
+        PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
+        if (parentCTU.isIntra(cuGeom.absPartIdx))
+        {
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            checkIntra(mode, cuGeom, size);
+        }
+        else
+        {
+            mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
+            for (int part = 0; part < (int)parentCTU.getNumPartInter(cuGeom.absPartIdx); part++)
+            {
+                PredictionUnit pu(mode.cu, cuGeom, part);
+                motionCompensation(mode.cu, pu, mode.predYuv, true, true);
+            }
+
+            if (parentCTU.isSkipped(cuGeom.absPartIdx))
+                encodeResAndCalcRdSkipCU(mode);
+            else
+                encodeResAndCalcRdInterCU(mode, cuGeom);
+
+            /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
+            bool mergeInter2Nx2N = size == SIZE_2Nx2N && parentCTU.m_mergeFlag[cuGeom.absPartIdx];
+            if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
+                checkDQP(mode, cuGeom);
+        }
+
+        if (m_bTryLossless)
+            tryLossless(cuGeom);
+
+        if (mightSplit)
+            addSplitFlagCost(*md.bestMode, cuGeom.depth);
+    }
+    else
+    {
+        Mode* splitPred = &md.pred[PRED_SPLIT];
+        md.bestMode = splitPred;
+        splitPred->initCosts();
+        CUData* splitCU = &splitPred->cu;
+        splitCU->initSubCU(parentCTU, cuGeom, qp);
+
+        uint32_t nextDepth = depth + 1;
+        ModeDepth& nd = m_modeDepth[nextDepth];
+        invalidateContexts(nextDepth);
+        Entropy* nextContext = &m_rqt[depth].cur;
+        int nextQP = qp;
+
+        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+            {
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
+                m_rqt[nextDepth].cur.load(*nextContext);
+
+                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
+                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
+
+                qprdRefine(parentCTU, childGeom, nextQP, lqp);
+
+                // Save best CU and pred data for this sub CU
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                splitPred->addSubCosts(*nd.bestMode);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
+                nextContext = &nd.bestMode->contexts;
+            }
+            else
+            {
+                splitCU->setEmptyPart(childGeom, subPartIdx);
+                // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP
+                memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
+            }
+        }
+        nextContext->store(splitPred->contexts);
+        if (mightNotSplit)
+            addSplitFlagCost(*splitPred, cuGeom.depth);
+        else
+            updateModeCost(*splitPred);
+
+        checkDQPForSplitPred(*splitPred, cuGeom);
+
+        /* Copy best data to encData CTU and recon */
+        md.bestMode->cu.copyToPic(depth);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
+    }
+}
+
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
 void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
 {
diff -r a161c580095a -r 4321eec92de6 source/encoder/analysis.h
--- a/source/encoder/analysis.h	Fri Dec 11 14:53:49 2015 +0530
+++ b/source/encoder/analysis.h	Tue Dec 22 11:59:09 2015 +0530
@@ -123,6 +123,10 @@
     int32_t*             m_reuseRef;
     uint32_t*            m_reuseBestMergeCand;
     uint32_t m_splitRefIdx[4];
+    uint64_t* cacheCost;
+
+    /* refine RD based on QP for rd-levels 5 and 6 */
+    void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
 
     /* full analysis for an I-slice CU */
     void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
@@ -132,6 +136,8 @@
     SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
     SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
 
+    void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
+
     /* measure merge and skip */
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
     void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);

