[x265] [PATCH 3 of 6 REV2] analysis: keep per-CU AQ QPs in cuGeom index order, simplify arguments

Sun Apr 26 19:21:04 CEST 2015

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1429909512 18000
#      Fri Apr 24 16:05:12 2015 -0500
# Node ID 5644bbd23e71996651f4ed558e0260201a91f70d
# Parent  bfd57a0c0875e219d902ff3af6f4a0ddaa16b125
analysis: keep per-CU AQ QPs in cuGeom index order, simplify arguments

diff -r bfd57a0c0875 -r 5644bbd23e71 source/common/cudata.cpp

--- a/source/common/cudata.cpp	Fri Apr 24 15:14:54 2015 -0500
+++ b/source/common/cudata.cpp	Fri Apr 24 16:05:12 2015 -0500
@@ -2027,6 +2027,7 @@
         uint32_t blockSize = 1 << log2CUSize;
         uint32_t sbWidth   = 1 << (g_log2Size[maxCUSize] - log2CUSize);
         int32_t lastLevelFlag = log2CUSize == g_log2Size[minCUSize];
+
         for (uint32_t sbY = 0; sbY < sbWidth; sbY++)
         {
             for (uint32_t sbX = 0; sbX < sbWidth; sbX++)
@@ -2049,7 +2050,8 @@
                 cu->childOffset = childIdx - cuIdx;
                 cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4;
                 cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
-                cu->depth = g_log2Size[maxCUSize] - log2CUSize;
+                cu->depth = (uint16_t)(g_log2Size[maxCUSize] - log2CUSize);
+                cu->index = (uint16_t)cuIdx;
 
                 cu->flags = 0;
                 CU_SET_FLAG(cu->flags, CUGeom::PRESENT, presentFlag);
diff -r bfd57a0c0875 -r 5644bbd23e71 source/common/cudata.h
--- a/source/common/cudata.h	Fri Apr 24 15:14:54 2015 -0500
+++ b/source/common/cudata.h	Fri Apr 24 16:05:12 2015 -0500
@@ -85,8 +85,8 @@
     uint32_t childOffset;   // offset of the first child CU from current CU
     uint32_t absPartIdx;    // Part index of this CU in terms of 4x4 blocks.
     uint32_t numPartitions; // Number of 4x4 blocks in the CU
-    uint32_t depth;         // depth of this CU relative from CTU
     uint32_t flags;         // CU flags.
+    uint16_t depth, index;  // depth of this CU relative from CTU, absolute index
 };
 
 struct MVField
diff -r bfd57a0c0875 -r 5644bbd23e71 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Fri Apr 24 15:14:54 2015 -0500
+++ b/source/encoder/analysis.cpp	Fri Apr 24 16:05:12 2015 -0500
@@ -75,8 +75,6 @@
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
     m_reuseBestMergeCand = NULL;
-    for (int i = 0; i < NUM_CU_DEPTH; i++)
-        m_aqQP[i] = NULL;
 }
 
 bool Analysis::create(ThreadLocalData *tld)
@@ -103,12 +101,9 @@
             ok &= md.pred[j].reconYuv.create(cuSize, csp);
             md.pred[j].fencYuv = &md.fencYuv;
         }
-        CHECKED_MALLOC(m_aqQP[depth], int, (size_t)1 << (depth << 1));
     }
 
     return ok;
-fail:
-    return false;
 }
 
 void Analysis::destroy()
@@ -123,7 +118,17 @@
             m_modeDepth[i].pred[j].predYuv.destroy();
             m_modeDepth[i].pred[j].reconYuv.destroy();
         }
-        X265_FREE(m_aqQP[i]);
+    }
+}
+
+void Analysis::initAqQPs(uint32_t depth, const CUData& ctu, const CUGeom* rootGeom)
+{   /* store the per-CU AQ QP of the four CUs starting at rootGeom into m_aqQP[], then recurse into their children */
+    for (int d0 = 0; d0 < 4; d0++)
+    {
+        m_aqQP[rootGeom->index + d0] = calculateQpforCuSize(ctu, rootGeom[d0]); // slot is the first sibling's index plus d0; NOTE(review): relies on the four siblings having consecutive index values - confirm
+        /* descend only while depth is below the PPS maxCuDQPDepth; childOffset is applied relative to rootGeom[d0] */
+        if (m_slice->m_pps->maxCuDQPDepth > depth)
+            initAqQPs(depth + 1, ctu, &rootGeom[d0] + rootGeom[d0].childOffset);
     }
 }
 
@@ -141,32 +146,16 @@
 
     if (m_slice->m_pps->bUseDQP)
     {
-        /* TODO: In future, we could extend this to 8x8 QGs as well, since that's the minimum size 
-         * allowed by the HEVC standard. The AQ offset calculation will need to be at 8x8 granularity.
-         * And this messy section will need to be reworked */
-        m_aqQP[0][0] = calculateQpforCuSize(ctu, cuGeom);
+        m_aqQP[0] = calculateQpforCuSize(ctu, cuGeom);
+        setLambdaFromQP(*m_slice, m_aqQP[0]);
+        m_aqQP[0] = x265_clip3(QP_MIN, QP_MAX_SPEC, m_aqQP[0]);
+        ctu.setQPSubParts((int8_t)m_aqQP[0], 0, 0);
 
-        const CUGeom* rootGeom = &cuGeom + 1;
-        if (m_slice->m_pps->maxCuDQPDepth >= 1)
-        {
-            for (int d0 = 0; d0 < 4; d0++)
-            {
-                m_aqQP[1][d0] = calculateQpforCuSize(ctu, rootGeom[d0]);
-                if (m_slice->m_pps->maxCuDQPDepth == 2)
-                {
-                    const CUGeom* curGeom = &rootGeom[d0] + rootGeom[d0].childOffset;
-                    for (int d1 = 0; d1 < 4; d1++)
-                        m_aqQP[2][d0 * 4 + d1] = calculateQpforCuSize(ctu, curGeom[d1]);
-                }
-            }
-        }
-
-        setLambdaFromQP(*m_slice, m_aqQP[0][0]);
-        m_aqQP[0][0] = x265_clip3(QP_MIN, QP_MAX_SPEC, m_aqQP[0][0]);
-        ctu.setQPSubParts((int8_t)m_aqQP[0][0], 0, 0);
+        if (m_slice->m_pps->maxCuDQPDepth)
+            initAqQPs(1, ctu, &cuGeom + 1);
     }
     else
-        m_aqQP[0][0] = m_slice->m_sliceQp;
+        m_aqQP[0] = m_slice->m_sliceQp;
 
     m_quant.setQPforQuant(ctu);
     m_rqt[0].cur.load(initialContext);
@@ -191,7 +180,7 @@
     uint32_t zOrder = 0;
     if (m_slice->m_sliceType == I_SLICE)
     {
-        compressIntraCU(ctu, cuGeom, zOrder, m_aqQP[0][0], 0);
+        compressIntraCU(ctu, cuGeom, zOrder, m_aqQP[0]);
         if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
         {
             CUData* bestCU = &m_modeDepth[0].bestMode->cu;
@@ -209,18 +198,18 @@
             * they are available for intra predictions */
             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
 
-            compressInterCU_rd0_4(ctu, cuGeom, m_aqQP[0][0], 0);
+            compressInterCU_rd0_4(ctu, cuGeom, m_aqQP[0]);
 
             /* generate residual for entire CTU at once and copy to reconPic */
             encodeResidue(ctu, cuGeom);
         }
         else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
-            compressInterCU_dist(ctu, cuGeom, m_aqQP[0][0], 0);
+            compressInterCU_dist(ctu, cuGeom, m_aqQP[0]);
         else if (m_param->rdLevel <= 4)
-            compressInterCU_rd0_4(ctu, cuGeom, m_aqQP[0][0], 0);
+            compressInterCU_rd0_4(ctu, cuGeom, m_aqQP[0]);
         else
         {
-            compressInterCU_rd5_6(ctu, cuGeom, zOrder, m_aqQP[0][0], 0);
+            compressInterCU_rd5_6(ctu, cuGeom, zOrder, m_aqQP[0]);
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
             {
                 CUData* bestCU = &m_modeDepth[0].bestMode->cu;
@@ -259,7 +248,7 @@
     }
 }
 
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp, uint32_t partIdx)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -270,7 +259,6 @@
 
     if (m_slice->m_pps->bUseDQP && depth && depth <= m_slice->m_pps->maxCuDQPDepth)
     {
-        qp = m_aqQP[depth][partIdx];
         setLambdaFromQP(*m_slice, qp);
         qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     }
@@ -342,7 +330,10 @@
             {
                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressIntraCU(parentCTU, childGeom, zOrder, qp, partIdx * 4 + subPartIdx);
+
+                int32_t nextQP = m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth ?
+                                 m_aqQP[childGeom.index] : qp;
+                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
 
                 // Save best CU and pred data for this sub CU
                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
@@ -530,7 +521,7 @@
     while (task >= 0);
 }
 
-void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, uint32_t partIdx)
+void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     uint32_t cuAddr = parentCTU.m_cuAddr;
@@ -545,7 +536,6 @@
 
     if (m_slice->m_pps->bUseDQP && depth && depth <= m_slice->m_pps->maxCuDQPDepth)
     {
-        qp = m_aqQP[depth][partIdx];
         setLambdaFromQP(*m_slice, qp);
         qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     }
@@ -749,7 +739,9 @@
             {
                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_dist(parentCTU, childGeom, qp, partIdx * 4 + subPartIdx);
+                int32_t nextQP = m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth ?
+                                 m_aqQP[childGeom.index] : qp;
+                compressInterCU_dist(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU
                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
@@ -788,7 +780,7 @@
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
 }
 
-void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, uint32_t partIdx)
+void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     uint32_t cuAddr = parentCTU.m_cuAddr;
@@ -801,7 +793,6 @@
 
     if (m_slice->m_pps->bUseDQP && depth && depth <= m_slice->m_pps->maxCuDQPDepth)
     {
-        qp = m_aqQP[depth][partIdx];
         setLambdaFromQP(*m_slice, qp);
         qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     }
@@ -1028,7 +1019,9 @@
             {
                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_rd0_4(parentCTU, childGeom, qp, partIdx * 4 + subPartIdx);
+                int32_t nextQP = m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth ?
+                                 m_aqQP[childGeom.index] : qp;
+                compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU
                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
@@ -1079,7 +1072,7 @@
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
 }
 
-void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp, uint32_t partIdx)
+void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -1090,7 +1083,6 @@
 
     if (m_slice->m_pps->bUseDQP && depth && depth <= m_slice->m_pps->maxCuDQPDepth)
     {
-        qp = m_aqQP[depth][partIdx];
         setLambdaFromQP(*m_slice, qp);
         qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     }
@@ -1234,7 +1226,9 @@
             {
                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_rd5_6(parentCTU, childGeom, zOrder, qp, partIdx * 4 + subPartIdx);
+                int32_t nextQP = m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth ?
+                                 m_aqQP[childGeom.index] : qp;
+                compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);
 
                 // Save best CU and pred data for this sub CU
                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
diff -r bfd57a0c0875 -r 5644bbd23e71 source/encoder/analysis.h
--- a/source/encoder/analysis.h	Fri Apr 24 15:14:54 2015 -0500
+++ b/source/encoder/analysis.h	Fri Apr 24 16:05:12 2015 -0500
@@ -90,7 +90,7 @@
     void processPmode(PMODE& pmode, Analysis& slave);
 
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
-    int*      m_aqQP[NUM_CU_DEPTH];
+    int       m_aqQP[CUGeom::MAX_GEOMS];
     bool      m_bTryLossless;
     bool      m_bChromaSa8d;
 
@@ -109,13 +109,15 @@
     int32_t*             m_reuseRef;
     uint32_t*            m_reuseBestMergeCand;
 
+    void initAqQPs(uint32_t depth, const CUData& ctu, const CUGeom* rootGeom);
+
     /* full analysis for an I-slice CU */
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qpDepth, uint32_t partIdx);
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
 
     /* full analysis for a P or B slice CU */
-    void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qpDepth, uint32_t partIdx);
-    void compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qpDepth, uint32_t partIdx);
-    void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qpDepth, uint32_t partIdx);
+    void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    void compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
 
     /* measure merge and skip */
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);