[x265-commits] [x265] analysis: rename m_qp to m_aqQP for clarity
Steve Borho
steve at borho.org
Mon Apr 27 19:32:43 CEST 2015
details: http://hg.videolan.org/x265/rev/5429e35d132d
branches:
changeset: 10286:5429e35d132d
user: Steve Borho <steve at borho.org>
date: Fri Apr 24 16:51:38 2015 -0500
description:
analysis: rename m_qp to m_aqQP for clarity
Subject: [x265] analysis: simplify CTU QP init loops
details: http://hg.videolan.org/x265/rev/bfd57a0c0875
branches:
changeset: 10287:bfd57a0c0875
user: Steve Borho <steve at borho.org>
date: Fri Apr 24 15:14:54 2015 -0500
description:
analysis: simplify CTU QP init loops
Subject: [x265] analysis: keep per-CU AQ QPs in cuGeom index order, simplify arguments
details: http://hg.videolan.org/x265/rev/5644bbd23e71
branches:
changeset: 10288:5644bbd23e71
user: Steve Borho <steve at borho.org>
date: Fri Apr 24 16:05:12 2015 -0500
description:
analysis: keep per-CU AQ QPs in cuGeom index order, simplify arguments
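As I read this change together with the diff at the bottom of this mail
(which adds CUGeom::index), the per-depth m_qp[depth][partIdx] arrays give
way to one flat array keyed by each CU's absolute geometry index, so callers
stop passing depth/partIdx pairs around. A minimal sketch of that lookup;
the array bound and the helper name aqQPFor are placeholders, not from the
patch:

struct CUGeom { unsigned depth, index; };  // trimmed to the two fields used here

enum { MAX_GEOMS = 1 + 4 + 16 + 64 };      // placeholder bound covering depths 0..3
static int m_aqQP[MAX_GEOMS];              // AQ QPs stored in cuGeom index order

/* every caller, at every depth, does the same one-line lookup */
static int aqQPFor(const CUGeom& geom) { return m_aqQP[geom.index]; }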
Subject: [x265] analysis: hoist all adaptive-quant work to recursive callers
details: http://hg.videolan.org/x265/rev/0d99d6436375
branches:
changeset: 10289:0d99d6436375
user: Steve Borho <steve at borho.org>
date: Fri Apr 24 16:14:22 2015 -0500
description:
analysis: hoist all adaptive-quant work to recursive callers
Note that the depth-first code will probably need to reset lambda after
returning from recursion (even before any of these patches)
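A minimal sketch of the lambda-reset concern in that note, with placeholder
types and a stand-in lambda state (the real code works on CUData/CUGeom and
configures RD lambdas plus quant): after the recursion loop the last child's
lambda is still active, so the parent must re-apply its own QP before costing
the split-versus-no-split decision at its depth.

struct Geom { int depth, index, childOffset; };

struct Analyzer
{
    static const int MAX_DQP_DEPTH = 2;   // stand-in for pps->maxCuDQPDepth
    int m_aqQP[1 + 4 + 16 + 64];          // AQ QPs in geom-index order
    int m_curLambdaQP;                    // stand-in for the RD lambda/quant state

    void setLambdaFromQP(int qp) { m_curLambdaQP = qp; }

    void compress(const Geom* geoms, const Geom& cu, int qp)
    {
        setLambdaFromQP(qp);              // lambda/quant for this CU

        if (cu.depth < MAX_DQP_DEPTH)
        {
            for (int i = 0; i < 4; i++)
            {
                const Geom& child = geoms[cu.index + cu.childOffset + i];
                compress(geoms, child, m_aqQP[child.index]);   // child swaps lambda
            }
            setLambdaFromQP(qp);          // reset to this depth's lambda before
        }                                 // comparing split vs. merged cost
    }
};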
Subject: [x265] analysis: configure slave quant QP prior to pmode intra RDO for RD 0..4
details: http://hg.videolan.org/x265/rev/6a0a37c01cff
branches:
changeset: 10290:6a0a37c01cff
user: Steve Borho <steve at borho.org>
date: Sat Apr 25 00:39:48 2015 -0500
description:
analysis: configure slave quant QP prior to pmode intra RDO for RD 0..4
Related to #127, though that bug report uses preset veryslow with --rd 5.
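For readers not deep in --pmode: each worker (slave) thread has its own
lambda and quant state, and as I read it, at rdLevel 0..4 that state was not
seeded from the master before the distributed intra task ran, so intra RDO
could quantize with a stale QP. A toy sketch of the shape of the fix; Worker
and seedSlaveForPmode are illustrative names, only setLambdaFromQP
corresponds to a function touched by the patch:

struct Worker
{
    int lambdaQP;
    int quantQP;

    Worker() : lambdaQP(-1), quantQP(-1) {}

    void setLambdaFromQP(int qp)
    {
        lambdaQP = qp;           // RD lambdas for this QP
        quantQP  = qp;           // quant seeded in the same step (see the next patch)
    }
};

/* Before: only rdLevel >= 5 configured the slave's quant state, so intra RDO
 * at rdLevel 0..4 ran with quantQP still at its stale default.
 * After: every slave is seeded unconditionally before it takes a mode task. */
void seedSlaveForPmode(Worker& slave, const Worker& master, int cuQP)
{
    if (&slave != &master)
        slave.setLambdaFromQP(cuQP);
}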
Subject: [x265] analysis: always configure quant QP directly after setting RD lambda
details: http://hg.videolan.org/x265/rev/68a13226d586
branches:
changeset: 10291:68a13226d586
user: Steve Borho <steve at borho.org>
date: Sat Apr 25 01:39:55 2015 -0500
description:
analysis: always configure quant QP directly after setting RD lambda
Basically, everywhere we adjust or assign a QP we now configure the quant QP
immediately. This removes a great many ad-hoc calls to setQPforQuant() and
should make it impossible to miss configuring quant properly.
This patch fixes a layering violation where the frame encoder was setting the
RDO lambdas directly, but only when delta-QP was not enabled.
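The contract this series settles on, as far as I can tell from the diff
below: setLambdaFromQP() takes the CTU plus a QP, clips the QP to the spec
range (the x265_clip3(QP_MIN, QP_MAX_SPEC, ...) visible in the removed
hunks), programs the RD lambdas, configures quant via setQPforQuant(ctu, qp)
in the same step, and returns the clipped QP for the caller to store. A
self-contained sketch of that pairing; every type and body here is a
stand-in, not the real x265 code:

struct Quant  { int qp;       void setQPforQuant(int q) { qp = q; } };
struct RDCost { int lambdaQP; void setQP(int q)         { lambdaQP = q; /* derive lambdas */ } };

struct Search
{
    enum { QP_MIN = 0, QP_MAX_SPEC = 51 };   // same clamp the diff applies via x265_clip3

    RDCost m_rdCost;
    Quant  m_quant;

    /* one call adjusts everything QP-dependent and reports the clipped QP */
    int setLambdaFromQP(int qpy)
    {
        if (qpy < QP_MIN)      qpy = QP_MIN;
        if (qpy > QP_MAX_SPEC) qpy = QP_MAX_SPEC;
        m_rdCost.setQP(qpy);                 // RD lambdas
        m_quant.setQPforQuant(qpy);          // quant never lags behind lambda
        return qpy;                          // caller stores this clipped value
    }
};

Callers then read like the compressCTU() hunk below, e.g.
m_aqQP[0] = setLambdaFromQP(ctu, calculateQpforCuSize(ctu, cuGeom)) followed
by ctu.setQPSubParts((int8_t)m_aqQP[0], 0, 0).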
diffstat:
source/common/cudata.cpp | 4 +-
source/common/cudata.h | 2 +-
source/common/quant.cpp | 13 +-
source/common/quant.h | 2 +-
source/encoder/analysis.cpp | 144 +++++++++++++++------------------------
source/encoder/analysis.h | 12 +-
source/encoder/frameencoder.cpp | 3 +-
source/encoder/rdcost.h | 3 +-
source/encoder/search.cpp | 20 ++--
source/encoder/search.h | 2 +-
10 files changed, 88 insertions(+), 117 deletions(-)
diffs (truncated from 596 to 300 lines):
diff -r 4a7176bab742 -r 68a13226d586 source/common/cudata.cpp
--- a/source/common/cudata.cpp Fri Apr 24 16:07:42 2015 -0500
+++ b/source/common/cudata.cpp Sat Apr 25 01:39:55 2015 -0500
@@ -2027,6 +2027,7 @@ void CUData::calcCTUGeoms(uint32_t ctuWi
uint32_t blockSize = 1 << log2CUSize;
uint32_t sbWidth = 1 << (g_log2Size[maxCUSize] - log2CUSize);
int32_t lastLevelFlag = log2CUSize == g_log2Size[minCUSize];
+
for (uint32_t sbY = 0; sbY < sbWidth; sbY++)
{
for (uint32_t sbX = 0; sbX < sbWidth; sbX++)
@@ -2049,7 +2050,8 @@ void CUData::calcCTUGeoms(uint32_t ctuWi
cu->childOffset = childIdx - cuIdx;
cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4;
cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
- cu->depth = g_log2Size[maxCUSize] - log2CUSize;
+ cu->depth = (uint16_t)(g_log2Size[maxCUSize] - log2CUSize);
+ cu->index = (uint16_t)cuIdx;
cu->flags = 0;
CU_SET_FLAG(cu->flags, CUGeom::PRESENT, presentFlag);
diff -r 4a7176bab742 -r 68a13226d586 source/common/cudata.h
--- a/source/common/cudata.h Fri Apr 24 16:07:42 2015 -0500
+++ b/source/common/cudata.h Sat Apr 25 01:39:55 2015 -0500
@@ -85,8 +85,8 @@ struct CUGeom
uint32_t childOffset; // offset of the first child CU from current CU
uint32_t absPartIdx; // Part index of this CU in terms of 4x4 blocks.
uint32_t numPartitions; // Number of 4x4 blocks in the CU
- uint32_t depth; // depth of this CU relative from CTU
uint32_t flags; // CU flags.
+ uint16_t depth, index; // depth of this CU relative from CTU, absolute index
};
struct MVField
diff -r 4a7176bab742 -r 68a13226d586 source/common/quant.cpp
--- a/source/common/quant.cpp Fri Apr 24 16:07:42 2015 -0500
+++ b/source/common/quant.cpp Sat Apr 25 01:39:55 2015 -0500
@@ -225,16 +225,15 @@ Quant::~Quant()
X265_FREE(m_fencShortBuf);
}
-void Quant::setQPforQuant(const CUData& cu)
+void Quant::setQPforQuant(const CUData& ctu, int qp)
{
- m_tqBypass = !!cu.m_tqBypass[0];
+ m_tqBypass = !!ctu.m_tqBypass[0];
if (m_tqBypass)
return;
- m_nr = m_frameNr ? &m_frameNr[cu.m_encData->m_frameEncoderID] : NULL;
- int qpy = cu.m_qp[0];
- m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
- setChromaQP(qpy + cu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, cu.m_chromaFormat);
- setChromaQP(qpy + cu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, cu.m_chromaFormat);
+ m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
+ m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
+ setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+ setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
}
void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
diff -r 4a7176bab742 -r 68a13226d586 source/common/quant.h
--- a/source/common/quant.h Fri Apr 24 16:07:42 2015 -0500
+++ b/source/common/quant.h Sat Apr 25 01:39:55 2015 -0500
@@ -103,7 +103,7 @@ public:
bool allocNoiseReduction(const x265_param& param);
/* CU setup */
- void setQPforQuant(const CUData& cu);
+ void setQPforQuant(const CUData& ctu, int qp);
uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
diff -r 4a7176bab742 -r 68a13226d586 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Fri Apr 24 16:07:42 2015 -0500
+++ b/source/encoder/analysis.cpp Sat Apr 25 01:39:55 2015 -0500
@@ -75,8 +75,6 @@ Analysis::Analysis()
m_reuseInterDataCTU = NULL;
m_reuseRef = NULL;
m_reuseBestMergeCand = NULL;
- for (int i = 0; i < NUM_CU_DEPTH; i++)
- m_qp[i] = NULL;
}
bool Analysis::create(ThreadLocalData *tld)
@@ -103,12 +101,9 @@ bool Analysis::create(ThreadLocalData *t
ok &= md.pred[j].reconYuv.create(cuSize, csp);
md.pred[j].fencYuv = &md.fencYuv;
}
- CHECKED_MALLOC(m_qp[depth], int, (size_t)1 << (depth << 1));
}
return ok;
-fail:
- return false;
}
void Analysis::destroy()
@@ -123,7 +118,17 @@ void Analysis::destroy()
m_modeDepth[i].pred[j].predYuv.destroy();
m_modeDepth[i].pred[j].reconYuv.destroy();
}
- X265_FREE(m_qp[i]);
+ }
+}
+
+void Analysis::initAqQPs(uint32_t depth, const CUData& ctu, const CUGeom* rootGeom)
+{
+ for (int d0 = 0; d0 < 4; d0++)
+ {
+ m_aqQP[rootGeom->index + d0] = calculateQpforCuSize(ctu, rootGeom[d0]);
+
+ if (m_slice->m_pps->maxCuDQPDepth > depth)
+ initAqQPs(depth + 1, ctu, &rootGeom[d0] + rootGeom[d0].childOffset);
}
}
@@ -141,37 +146,16 @@ Mode& Analysis::compressCTU(CUData& ctu,
if (m_slice->m_pps->bUseDQP)
{
- CUGeom *curCUGeom = (CUGeom *)&cuGeom;
- CUGeom *parentGeom = (CUGeom *)&cuGeom;
+ m_aqQP[0] = setLambdaFromQP(ctu, calculateQpforCuSize(ctu, cuGeom));
- /* TODO: In future, we should extend this to 8x8 QGs as well, since that's the minimum size
- allowed by the HEVC standard. The AQ offset calculation will need to be at 8x8 granularity.
- And this messy section will need to be reworked */
-
- m_qp[0][0] = calculateQpforCuSize(ctu, *curCUGeom);
- curCUGeom = curCUGeom + curCUGeom->childOffset;
- parentGeom = curCUGeom;
- if (m_slice->m_pps->maxCuDQPDepth >= 1)
- {
- for (int i = 0; i < 4; i++)
- {
- m_qp[1][i] = calculateQpforCuSize(ctu, *(parentGeom + i));
- if (m_slice->m_pps->maxCuDQPDepth == 2)
- {
- curCUGeom = parentGeom + i + (parentGeom + i)->childOffset;
- for (int j = 0; j < 4; j++)
- m_qp[2][i * 4 + j] = calculateQpforCuSize(ctu, *(curCUGeom + j));
- }
- }
- }
- setLambdaFromQP(*m_slice, m_qp[0][0]);
- m_qp[0][0] = x265_clip3(QP_MIN, QP_MAX_SPEC, m_qp[0][0]);
- ctu.setQPSubParts((int8_t)m_qp[0][0], 0, 0);
+ if (m_slice->m_pps->maxCuDQPDepth)
+ initAqQPs(1, ctu, &cuGeom + 1);
}
else
- m_qp[0][0] = m_slice->m_sliceQp;
+ /* adaptive quant disabled, CTU QP is always slice QP, and within spec range */
+ m_aqQP[0] = setLambdaFromQP(ctu, m_slice->m_sliceQp);
- m_quant.setQPforQuant(ctu);
+ ctu.setQPSubParts((int8_t)m_aqQP[0], 0, 0);
m_rqt[0].cur.load(initialContext);
m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
@@ -194,7 +178,7 @@ Mode& Analysis::compressCTU(CUData& ctu,
uint32_t zOrder = 0;
if (m_slice->m_sliceType == I_SLICE)
{
- compressIntraCU(ctu, cuGeom, zOrder, m_qp[0][0], 0);
+ compressIntraCU(ctu, cuGeom, zOrder, m_aqQP[0]);
if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
{
CUData* bestCU = &m_modeDepth[0].bestMode->cu;
@@ -212,18 +196,18 @@ Mode& Analysis::compressCTU(CUData& ctu,
* they are available for intra predictions */
m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
- compressInterCU_rd0_4(ctu, cuGeom, m_qp[0][0], 0);
+ compressInterCU_rd0_4(ctu, cuGeom, m_aqQP[0]);
/* generate residual for entire CTU at once and copy to reconPic */
encodeResidue(ctu, cuGeom);
}
else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
- compressInterCU_dist(ctu, cuGeom, m_qp[0][0], 0);
+ compressInterCU_dist(ctu, cuGeom, m_aqQP[0]);
else if (m_param->rdLevel <= 4)
- compressInterCU_rd0_4(ctu, cuGeom, m_qp[0][0], 0);
+ compressInterCU_rd0_4(ctu, cuGeom, m_aqQP[0]);
else
{
- compressInterCU_rd5_6(ctu, cuGeom, zOrder, m_qp[0][0], 0);
+ compressInterCU_rd5_6(ctu, cuGeom, zOrder, m_aqQP[0]);
if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
{
CUData* bestCU = &m_modeDepth[0].bestMode->cu;
@@ -245,24 +229,28 @@ void Analysis::tryLossless(const CUGeom&
return;
else if (md.bestMode->cu.isIntra(0))
{
+ m_quant.m_tqBypass = true;
md.pred[PRED_LOSSLESS].initCosts();
md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
+ m_quant.m_tqBypass = false;
}
else
{
+ m_quant.m_tqBypass = true;
md.pred[PRED_LOSSLESS].initCosts();
md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
+ m_quant.m_tqBypass = false;
}
}
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp, uint32_t partIdx)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
{
uint32_t depth = cuGeom.depth;
ModeDepth& md = m_modeDepth[depth];
@@ -271,13 +259,6 @@ void Analysis::compressIntraCU(const CUD
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
- if (m_slice->m_pps->bUseDQP && depth && depth <= m_slice->m_pps->maxCuDQPDepth)
- {
- qp = m_qp[depth][partIdx];
- setLambdaFromQP(*m_slice, qp);
- qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
- }
-
if (m_param->analysisMode == X265_ANALYSIS_LOAD)
{
uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
@@ -290,7 +271,6 @@ void Analysis::compressIntraCU(const CUD
PartSize size = (PartSize)reusePartSizes[zOrder];
Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
mode.cu.initSubCU(parentCTU, cuGeom, qp);
- m_quant.setQPforQuant(mode.cu);
checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
checkBestMode(mode, depth);
@@ -308,7 +288,6 @@ void Analysis::compressIntraCU(const CUD
else if (mightNotSplit)
{
md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
- m_quant.setQPforQuant(md.pred[PRED_INTRA].cu);
checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
checkBestMode(md.pred[PRED_INTRA], depth);
@@ -337,6 +316,7 @@ void Analysis::compressIntraCU(const CUD
ModeDepth& nd = m_modeDepth[nextDepth];
invalidateContexts(nextDepth);
Entropy* nextContext = &m_rqt[depth].cur;
+ int32_t nextQP = qp;
for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
{
@@ -345,7 +325,11 @@ void Analysis::compressIntraCU(const CUD
{
m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
m_rqt[nextDepth].cur.load(*nextContext);
- compressIntraCU(parentCTU, childGeom, zOrder, qp, partIdx * 4 + subPartIdx);
+
+ if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
+ nextQP = setLambdaFromQP(parentCTU, m_aqQP[childGeom.index]);
+
+ compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
// Save best CU and pred data for this sub CU
splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
@@ -413,14 +397,9 @@ void Analysis::processPmode(PMODE& pmode
{
slave.m_slice = m_slice;
slave.m_frame = m_frame;
- slave.setLambdaFromQP(*m_slice, m_rdCost.m_qp);
+ slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
slave.invalidateContexts(0);
-
- if (m_param->rdLevel >= 5)
- {
- slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
- slave.m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
- }
+ slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
}
/* perform Mode task, repeat until no more work is available */
@@ -431,8 +410,6 @@ void Analysis::processPmode(PMODE& pmode
switch (pmode.modes[task])
{
case PRED_INTRA:
- if (&slave != this)
- slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
if (m_param->rdLevel > 2)
slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
@@ -533,7 +510,7 @@ void Analysis::processPmode(PMODE& pmode