[x265] [PATCH 20 of 24] sao: move SAO RDO Decide into encode loop

Min Chen chenm003 at 163.com
Tue Dec 8 00:54:57 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449511601 21600
# Node ID 690f1e3baab270884b3f00bd56006738ad4a5314
# Parent  f023dda04a265ff507746af68c213e61303805f6
sao: move SAO RDO Decide into encode loop
---
 source/encoder/frameencoder.cpp |   77 ++++++++++++++-------
 source/encoder/framefilter.cpp  |   41 ++++++++++-
 source/encoder/framefilter.h    |    1 +
 source/encoder/sao.cpp          |  140 ++++++++++++++++++++++++++++++++++----
 source/encoder/sao.h            |    4 +-
 5 files changed, 217 insertions(+), 46 deletions(-)

diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/frameencoder.cpp	Mon Dec 07 12:06:41 2015 -0600
@@ -1107,7 +1107,14 @@
                 m_frameFilter.m_parallelFilter[row - 1].waitForExit();
 
                 // Processing new group
-                const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) : col);
+                int allowCol = col;
+
+                // avoid race condition on last column
+                if (row >= 2)
+                {
+                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
+                                                              : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
+                }
                 m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
                 m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
             }
@@ -1119,7 +1126,14 @@
                 m_frameFilter.m_parallelFilter[row].waitForExit();
 
                 // Processing last row
-                const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) : col);
+                int allowCol = col;
+
+                // avoid race condition on last column
+                if (row >= 2)
+                {
+                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
+                                                              : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
+                }
                 m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
                 m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
             }
@@ -1183,26 +1197,48 @@
     if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
         rowCoder.finishSlice();
 
+    /* Process the leftover deblock work of the previous row on the current thread */
+    if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1))
+    {
+        /* TODO: Multiple Threading */
+        /* Wait until row - 2 has finished deblocking, in case it is running slower than the current row */
+        if (row >= 2)
+        {
+            int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get();
+            while(prevCol != (int)numCols - 1)
+                prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol);
+        }
+        m_frameFilter.m_parallelFilter[row - 1].waitForExit();
+        m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
+        m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
+    }
+
+    /* trigger row-wise loop filters */
+    if (row == m_numRows - 1)
+    {
+        /* TODO: Early start last row */
+        if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
+        {
+            if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != (int)numCols - 1)
+                x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n");
+
+            // avoid race on last row and last column
+            if (row >= 1)
+            {
+                int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get();
+                while(prevCol != (int)numCols - 1)
+                    prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol);
+            }
+
+            /* NOTE: The last row has not been executed before, so waiting should not be necessary */
+            m_frameFilter.m_parallelFilter[row].waitForExit();
+            m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
+            m_frameFilter.m_parallelFilter[row].processTasks(-1);
+        }
+    }
+
     if (m_param->bEnableWavefront)
     {
-        /* Processing left Deblock block with current threading */
-        if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1))
-        {
-            /* TODO: Multiple Threading */
-            m_frameFilter.m_parallelFilter[row - 1].waitForExit();
-
-            /* Check to avoid previous row process slower than current row */
-            if (row >= 2)
-            {
-                int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get();
-                while(prevCol != (int)numCols)
-                    prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.waitForChange(prevCol);
-            }
-            m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
-            m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
-        }
-
-        /* trigger row-wise loop filters */
         if (row >= m_filterRowDelay)
         {
             enableRowFilter(row - m_filterRowDelay);
@@ -1215,17 +1251,6 @@
 
         if (row == m_numRows - 1)
         {
-            /* TODO: Early start last row */
-            if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
-            {
-                X265_CHECK(m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
-
-                /* NOTE: Last Row not execute before, so didn't need wait */
-                m_frameFilter.m_parallelFilter[row].waitForExit();
-                m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
-                m_frameFilter.m_parallelFilter[row].processTasks(-1);
-            }
-
             for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
                 enableRowFilter(i);
             tryWakeOne();
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/framefilter.cpp	Mon Dec 07 12:06:41 2015 -0600
@@ -115,6 +115,7 @@
 
             m_parallelFilter[row].m_lastCol.set(0);
             m_parallelFilter[row].m_allowedCol.set(0);
+            m_parallelFilter[row].m_lastDeblocked.set(-1);
             m_parallelFilter[row].m_encData = frame->m_encData;
         }
 
@@ -148,6 +149,7 @@
 // NOTE: Single Threading only
 void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
 {
+    SAOParam* saoParam = m_encData->m_saoParam;
     const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
     const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
     PicYuv* reconPic = m_encData->m_reconPic;
@@ -169,7 +171,7 @@
             deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
         }
 
-        if (col > 0)
+        if (col >= 1)
         {
             if (m_param->bEnableLoopFilter)
             {
@@ -178,7 +180,21 @@
             }
 
             if (m_param->bEnableSAO)
+            {
+                // Save SAO bottom row reference pixels
                 copySaoAboveRef(reconPic, cuAddr - 1, col - 1);
+
+                // SAO Decide
+                if (col >= 2)
+                {
+                    // NOTE: Delay by 2 columns to avoid an error in the case below; it is a Deblock sync logic issue with low probability, but it can still occur
+                    //       ... H V |
+                    //       ..S H V |
+                    m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, col - 2, cuAddr - 2);
+                }
+            }
+
+            m_lastDeblocked.set(col - 1);
         }
         m_lastCol.incr();
     }
@@ -194,7 +210,19 @@
         }
 
         if (m_param->bEnableSAO)
+        {
+            // Save SAO bottom row reference pixels
             copySaoAboveRef(reconPic, cuAddr, numCols - 1);
+
+            // SAO Decide
+            // NOTE: minimal condition checks for video that is only 1 CU wide; why would anyone play with that?
+            if (numCols >= 2)
+                m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 2, cuAddr - 1);
+
+            if (numCols >= 1)
+                m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr);
+        }
+        m_lastDeblocked.set(numCols - 1);
     }
 }
 
@@ -218,8 +246,6 @@
     SAOParam* saoParam = encData.m_saoParam;
     if (m_param->bEnableSAO)
     {
-        m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row);
-
         // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
         if (row >= m_saoRowDelay)
             processSao(row - m_saoRowDelay);
@@ -234,7 +260,14 @@
     {
         if (m_param->bEnableSAO)
         {
-            m_parallelFilter[row].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
+            // Merge numNoSao into RootNode (Node0)
+            for(int i = 1; i < m_numRows; i++)
+            {
+                m_parallelFilter[0].m_sao.m_numNoSao[0] += m_parallelFilter[i].m_sao.m_numNoSao[0];
+                m_parallelFilter[0].m_sao.m_numNoSao[1] += m_parallelFilter[i].m_sao.m_numNoSao[1];
+            }
+
+            m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
 
             for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
                 processSao(i);
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/framefilter.h	Mon Dec 07 12:06:41 2015 -0600
@@ -69,6 +69,7 @@
         SAO                 m_sao;
         ThreadSafeInteger   m_lastCol;          /* The column that next to process */
         ThreadSafeInteger   m_allowedCol;       /* The column that processed from Encode pipeline */
+        ThreadSafeInteger   m_lastDeblocked;   /* The last column that has finished all Deblock stages */
 
         ParallelFilter()
             : m_rowAddr(0)
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/sao.cpp	Mon Dec 07 12:06:41 2015 -0600
@@ -90,15 +90,7 @@
     m_tmpL2[0] = NULL;
     m_tmpL2[1] = NULL;
     m_tmpL2[2] = NULL;
-
-    m_depthSaoRate[0][0] = 0;
-    m_depthSaoRate[0][1] = 0;
-    m_depthSaoRate[0][2] = 0;
-    m_depthSaoRate[0][3] = 0;
-    m_depthSaoRate[1][0] = 0;
-    m_depthSaoRate[1][1] = 0;
-    m_depthSaoRate[1][2] = 0;
-    m_depthSaoRate[1][3] = 0;
+    m_depthSaoRate = NULL;
 }
 
 bool SAO::create(x265_param* param, int initCommon)
@@ -130,6 +122,16 @@
     {
         CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
         CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
+
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
 
         CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
         m_clipTable = &(m_clipTableBase[rangeExt]);
@@ -166,11 +168,13 @@
 {
     X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
     X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
+    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
     X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
     X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
 
     m_countPreDblk = root->m_countPreDblk;
     m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
+    m_depthSaoRate = root->m_depthSaoRate;
     m_clipTableBase = root->m_clipTableBase; // Unnecessary
     m_clipTable = root->m_clipTable;
 }
@@ -202,6 +206,7 @@
     {
         X265_FREE_ZERO(m_countPreDblk);
         X265_FREE_ZERO(m_offsetOrgPreDblk);
+        X265_FREE_ZERO(m_depthSaoRate);
         X265_FREE_ZERO(m_clipTableBase);
     }
 }
@@ -262,9 +267,9 @@
     // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
     if (m_param->frameNumThreads == 1)
     {
-        if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
+        if (m_refDepth > 0 && m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE)
             saoParam->bSaoFlag[0] = false;
-        if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
+        if (m_refDepth > 0 && m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
             saoParam->bSaoFlag[1] = false;
     }
 }
@@ -1218,14 +1223,14 @@
 void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
 {
     if (!saoParam->bSaoFlag[0])
-        m_depthSaoRate[0][m_refDepth] = 1.0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
     else
-        m_depthSaoRate[0][m_refDepth] = m_numNoSao[0] / ((double)numctus);
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
 
     if (!saoParam->bSaoFlag[1])
-        m_depthSaoRate[1][m_refDepth] = 1.0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
     else
-        m_depthSaoRate[1][m_refDepth] = m_numNoSao[1] / ((double)numctus);
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
 }
 
 void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY)
@@ -1339,6 +1344,111 @@
     }
 }
 
+void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
+{
+    SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2];
+    double mergeDist[NUM_MERGE_MODE];
+    const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // left, up
+
+    const int addrUp   = rowBaseAddr ? addr - m_numCuInWidth : -1;
+    const int addrLeft = idxX ? addr - 1 : -1;
+
+    m_entropyCoder.load(m_rdContexts.cur);
+    if (allowMerge[0])
+        m_entropyCoder.codeSaoMerge(0);
+    if (allowMerge[1])
+        m_entropyCoder.codeSaoMerge(0);
+    m_entropyCoder.store(m_rdContexts.temp);
+
+    // reset stats Y, Cb, Cr
+    X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
+
+    // TODO: Confirm the address space is contiguous
+    if (m_param->bSaoNonDeblocked)
+    {
+        memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
+        memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
+    }
+    else
+    {
+        memset(m_count, 0, sizeof(m_count));
+        memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
+    }
+
+    saoParam->ctuParam[0][addr].reset();
+    saoParam->ctuParam[1][addr].reset();
+    saoParam->ctuParam[2][addr].reset();
+
+    if (saoParam->bSaoFlag[0])
+        calcSaoStatsCu(addr, 0);
+
+    if (saoParam->bSaoFlag[1])
+    {
+        calcSaoStatsCu(addr, 1);
+        calcSaoStatsCu(addr, 2);
+    }
+
+    saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
+
+    sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
+
+    if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
+    {
+        // Cost of new SAO_params
+        m_entropyCoder.load(m_rdContexts.cur);
+        m_entropyCoder.resetBits();
+        if (allowMerge[0])
+            m_entropyCoder.codeSaoMerge(0);
+        if (allowMerge[1])
+            m_entropyCoder.codeSaoMerge(0);
+        for (int plane = 0; plane < 3; plane++)
+        {
+            if (saoParam->bSaoFlag[plane > 0])
+                m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
+        }
+
+        uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
+        double bestCost = mergeDist[0] + (double)rate;
+        m_entropyCoder.store(m_rdContexts.temp);
+
+        // Cost of Merge
+        for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
+        {
+            if (!allowMerge[mergeIdx])
+                continue;
+
+            m_entropyCoder.load(m_rdContexts.cur);
+            m_entropyCoder.resetBits();
+            if (allowMerge[0])
+                m_entropyCoder.codeSaoMerge(1 - mergeIdx);
+            if (allowMerge[1] && (mergeIdx == 1))
+                m_entropyCoder.codeSaoMerge(1);
+
+            rate = m_entropyCoder.getNumberOfWrittenBits();
+            double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
+            if (mergeCost < bestCost)
+            {
+                SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
+                bestCost = mergeCost;
+                m_entropyCoder.store(m_rdContexts.temp);
+                for (int plane = 0; plane < 3; plane++)
+                {
+                    mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode;
+                    if (saoParam->bSaoFlag[plane > 0])
+                        copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
+                }
+            }
+        }
+
+        if (saoParam->ctuParam[0][addr].typeIdx < 0)
+            m_numNoSao[0]++;
+        if (saoParam->ctuParam[1][addr].typeIdx < 0)
+            m_numNoSao[1]++;
+        m_entropyCoder.load(m_rdContexts.temp);
+        m_entropyCoder.store(m_rdContexts.cur);
+    }
+}
+
 /** rate distortion optimization of SAO unit */
 inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
 {
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/sao.h
--- a/source/encoder/sao.h	Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/sao.h	Mon Dec 07 12:06:41 2015 -0600
@@ -62,6 +62,7 @@
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
     enum { NUM_MERGE_MODE = 3 };
+    enum { SAO_DEPTHRATE_SIZE = 4 };
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
 
@@ -79,7 +80,7 @@
     PerPlane*   m_countPreDblk;
     PerPlane*   m_offsetOrgPreDblk;
 
-    double      m_depthSaoRate[2][4];
+    double*     m_depthSaoRate;
     int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
     int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
 
@@ -149,6 +150,7 @@
 
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+    void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
 
     friend class FrameFilter;
 };



More information about the x265-devel mailing list