[x265] [PATCH 20 of 24] sao: move SAO RDO Decide into encode loop
Min Chen
chenm003 at 163.com
Tue Dec 8 00:54:57 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449511601 21600
# Node ID 690f1e3baab270884b3f00bd56006738ad4a5314
# Parent f023dda04a265ff507746af68c213e61303805f6
sao: move SAO RDO Decide into encode loop
---
source/encoder/frameencoder.cpp | 77 ++++++++++++++-------
source/encoder/framefilter.cpp | 41 ++++++++++-
source/encoder/framefilter.h | 1 +
source/encoder/sao.cpp | 140 ++++++++++++++++++++++++++++++++++----
source/encoder/sao.h | 4 +-
5 files changed, 217 insertions(+), 46 deletions(-)
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:41 2015 -0600
@@ -1107,7 +1107,14 @@
m_frameFilter.m_parallelFilter[row - 1].waitForExit();
// Processing new group
- const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) : col);
+ int allowCol = col;
+
+ // avoid race condition on last column
+ if (row >= 2)
+ {
+ allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
+ : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
+ }
m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
}
@@ -1119,7 +1126,14 @@
m_frameFilter.m_parallelFilter[row].waitForExit();
// Processing last row
- const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) : col);
+ int allowCol = col;
+
+ // avoid race condition on last column
+ if (row >= 2)
+ {
+ allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
+ : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
+ }
m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
}
@@ -1183,26 +1197,48 @@
if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
rowCoder.finishSlice();
+ /* Processing left Deblock block with current threading */
+ if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1))
+ {
+ /* TODO: Multiple Threading */
+ /* Check to avoid previous row process slower than current row */
+ if (row >= 2)
+ {
+ int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get();
+ while(prevCol != (int)numCols - 1)
+ prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol);
+ }
+ m_frameFilter.m_parallelFilter[row - 1].waitForExit();
+ m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
+ m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
+ }
+
+ /* trigger row-wise loop filters */
+ if (row == m_numRows - 1)
+ {
+ /* TODO: Early start last row */
+ if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
+ {
+ if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != (int)numCols - 1)
+ x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n");
+
+ // avoid race on last row and last column
+ if (row >= 1)
+ {
+ int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get();
+ while(prevCol != (int)numCols - 1)
+ prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol);
+ }
+
+ /* NOTE: Last Row not execute before, so didn't need wait */
+ m_frameFilter.m_parallelFilter[row].waitForExit();
+ m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
+ m_frameFilter.m_parallelFilter[row].processTasks(-1);
+ }
+ }
+
if (m_param->bEnableWavefront)
{
- /* Processing left Deblock block with current threading */
- if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1))
- {
- /* TODO: Multiple Threading */
- m_frameFilter.m_parallelFilter[row - 1].waitForExit();
-
- /* Check to avoid previous row process slower than current row */
- if (row >= 2)
- {
- int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get();
- while(prevCol != (int)numCols)
- prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.waitForChange(prevCol);
- }
- m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
- m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
- }
-
- /* trigger row-wise loop filters */
if (row >= m_filterRowDelay)
{
enableRowFilter(row - m_filterRowDelay);
@@ -1215,17 +1251,6 @@
if (row == m_numRows - 1)
{
- /* TODO: Early start last row */
- if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
- {
- X265_CHECK(m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
-
- /* NOTE: Last Row not execute before, so didn't need wait */
- m_frameFilter.m_parallelFilter[row].waitForExit();
- m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
- m_frameFilter.m_parallelFilter[row].processTasks(-1);
- }
-
for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
enableRowFilter(i);
tryWakeOne();
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/framefilter.cpp Mon Dec 07 12:06:41 2015 -0600
@@ -115,6 +115,7 @@
m_parallelFilter[row].m_lastCol.set(0);
m_parallelFilter[row].m_allowedCol.set(0);
+ m_parallelFilter[row].m_lastDeblocked.set(-1);
m_parallelFilter[row].m_encData = frame->m_encData;
}
@@ -148,6 +149,7 @@
// NOTE: Single Threading only
void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
{
+ SAOParam* saoParam = m_encData->m_saoParam;
const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
PicYuv* reconPic = m_encData->m_reconPic;
@@ -169,7 +171,7 @@
deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
}
- if (col > 0)
+ if (col >= 1)
{
if (m_param->bEnableLoopFilter)
{
@@ -178,7 +180,21 @@
}
if (m_param->bEnableSAO)
+ {
+ // Save SAO bottom row reference pixels
copySaoAboveRef(reconPic, cuAddr - 1, col - 1);
+
+ // SAO Decide
+ if (col >= 2)
+ {
+ // NOTE: Delay 2 column to avoid mistake on below case, it is Deblock sync logic issue, less probability but still alive
+ // ... H V |
+ // ..S H V |
+ m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, col - 2, cuAddr - 2);
+ }
+ }
+
+ m_lastDeblocked.set(col - 1);
}
m_lastCol.incr();
}
@@ -194,7 +210,19 @@
}
if (m_param->bEnableSAO)
+ {
+ // Save SAO bottom row reference pixels
copySaoAboveRef(reconPic, cuAddr, numCols - 1);
+
+ // SAO Decide
+ // NOTE: reduce condition check for 1 CU only video, Why someone play with it?
+ if (numCols >= 2)
+ m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 2, cuAddr - 1);
+
+ if (numCols >= 1)
+ m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr);
+ }
+ m_lastDeblocked.set(numCols - 1);
}
}
@@ -218,8 +246,6 @@
SAOParam* saoParam = encData.m_saoParam;
if (m_param->bEnableSAO)
{
- m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row);
-
// NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
if (row >= m_saoRowDelay)
processSao(row - m_saoRowDelay);
@@ -234,7 +260,14 @@
{
if (m_param->bEnableSAO)
{
- m_parallelFilter[row].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
+ // Merge numNoSao into RootNode (Node0)
+ for(int i = 1; i < m_numRows; i++)
+ {
+ m_parallelFilter[0].m_sao.m_numNoSao[0] += m_parallelFilter[i].m_sao.m_numNoSao[0];
+ m_parallelFilter[0].m_sao.m_numNoSao[1] += m_parallelFilter[i].m_sao.m_numNoSao[1];
+ }
+
+ m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
processSao(i);
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/framefilter.h Mon Dec 07 12:06:41 2015 -0600
@@ -69,6 +69,7 @@
SAO m_sao;
ThreadSafeInteger m_lastCol; /* The column that next to process */
ThreadSafeInteger m_allowedCol; /* The column that processed from Encode pipeline */
+ ThreadSafeInteger m_lastDeblocked; /* The column that finished all of Deblock stages */
ParallelFilter()
: m_rowAddr(0)
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/sao.cpp Mon Dec 07 12:06:41 2015 -0600
@@ -90,15 +90,7 @@
m_tmpL2[0] = NULL;
m_tmpL2[1] = NULL;
m_tmpL2[2] = NULL;
-
- m_depthSaoRate[0][0] = 0;
- m_depthSaoRate[0][1] = 0;
- m_depthSaoRate[0][2] = 0;
- m_depthSaoRate[0][3] = 0;
- m_depthSaoRate[1][0] = 0;
- m_depthSaoRate[1][1] = 0;
- m_depthSaoRate[1][2] = 0;
- m_depthSaoRate[1][3] = 0;
+ m_depthSaoRate = NULL;
}
bool SAO::create(x265_param* param, int initCommon)
@@ -130,6 +122,16 @@
{
CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+ CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
+
+ m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
+ m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
+ m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
+ m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
+ m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
+ m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
+ m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
+ m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt);
m_clipTable = &(m_clipTableBase[rangeExt]);
@@ -166,11 +168,13 @@
{
X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
+ X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
m_countPreDblk = root->m_countPreDblk;
m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
+ m_depthSaoRate = root->m_depthSaoRate;
m_clipTableBase = root->m_clipTableBase; // Unnecessary
m_clipTable = root->m_clipTable;
}
@@ -202,6 +206,7 @@
{
X265_FREE_ZERO(m_countPreDblk);
X265_FREE_ZERO(m_offsetOrgPreDblk);
+ X265_FREE_ZERO(m_depthSaoRate);
X265_FREE_ZERO(m_clipTableBase);
}
}
@@ -262,9 +267,9 @@
// NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
if (m_param->frameNumThreads == 1)
{
- if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
+ if (m_refDepth > 0 && m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE)
saoParam->bSaoFlag[0] = false;
- if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
+ if (m_refDepth > 0 && m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
saoParam->bSaoFlag[1] = false;
}
}
@@ -1218,14 +1223,14 @@
void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
{
if (!saoParam->bSaoFlag[0])
- m_depthSaoRate[0][m_refDepth] = 1.0;
+ m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0; // flat index = group * SAO_DEPTHRATE_SIZE + depth; group 0 is stored as 1.0 when bSaoFlag[0] is off
else
- m_depthSaoRate[0][m_refDepth] = m_numNoSao[0] / ((double)numctus);
+ m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus); // otherwise store m_numNoSao[0] as a fraction of numctus
if (!saoParam->bSaoFlag[1])
- m_depthSaoRate[1][m_refDepth] = 1.0;
+ m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0; // group 1 (gated by bSaoFlag[1]) is stored as 1.0 when that flag is off
else
- m_depthSaoRate[1][m_refDepth] = m_numNoSao[1] / ((double)numctus);
+ m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus); // otherwise store m_numNoSao[1] as a fraction of numctus
}
void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY)
@@ -1339,6 +1344,111 @@
}
}
+void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr) // SAO decision for the single CU at `addr` (column idxX): compares new parameters against merge-left / merge-up and leaves the result in saoParam->ctuParam[plane][addr]
+{
+ SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2]; // merge candidates, indexed [plane][mergeIdx] in the merge loop below
+ double mergeDist[NUM_MERGE_MODE]; // [0] is used for the new-parameter cost, [1] for merge-left, [2] for merge-up (see bestCost / mergeCost)
+ const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // [0] = left merge allowed (idxX != 0), [1] = up merge allowed (rowBaseAddr != 0)
+
+ const int addrUp = rowBaseAddr ? addr - m_numCuInWidth : -1; // address one CU row above, or -1 when rowBaseAddr is 0
+ const int addrLeft = idxX ? addr - 1 : -1; // address of the previous CU in the row, or -1 when idxX is 0
+
+ m_entropyCoder.load(m_rdContexts.cur); // start from the saved "cur" coder state
+ if (allowMerge[0])
+ m_entropyCoder.codeSaoMerge(0); // code the left-merge flag as 0
+ if (allowMerge[1])
+ m_entropyCoder.codeSaoMerge(0); // code the up-merge flag as 0
+ m_entropyCoder.store(m_rdContexts.temp); // keep the resulting state in "temp"
+
+ // Reset the statistics for Y, Cb and Cr
+ X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
+
+ // TODO: confirm that the address space is contiguous
+ if (m_param->bSaoNonDeblocked)
+ {
+ memcpy(m_count, m_countPreDblk[addr], sizeof(m_count)); // seed the counters from this CU's pre-deblock entry
+ memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg)); // seed the offset sums from this CU's pre-deblock entry
+ }
+ else
+ {
+ memset(m_count, 0, sizeof(m_count)); // no pre-deblock data is used: start the counters from zero
+ memset(m_offsetOrg, 0, sizeof(m_offsetOrg)); // and the offset sums as well
+ }
+
+ saoParam->ctuParam[0][addr].reset(); // reset this CU's stored parameters for each of the three planes
+ saoParam->ctuParam[1][addr].reset();
+ saoParam->ctuParam[2][addr].reset();
+
+ if (saoParam->bSaoFlag[0]) // bSaoFlag[0] gates plane 0
+ calcSaoStatsCu(addr, 0);
+
+ if (saoParam->bSaoFlag[1]) // bSaoFlag[1] gates planes 1 and 2 together
+ {
+ calcSaoStatsCu(addr, 1);
+ calcSaoStatsCu(addr, 2);
+ }
+
+ saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist); // NOTE(review): presumably evaluates plane 0 and fills its merge candidates / mergeDist - confirm in the helper
+
+ sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist); // NOTE(review): presumably the equivalent step for planes 1 and 2 - confirm in the helper
+
+ if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]) // everything below, including the m_numNoSao counters, is skipped when both flags are off
+ {
+ // Cost of signalling new SAO parameters
+ m_entropyCoder.load(m_rdContexts.cur);
+ m_entropyCoder.resetBits();
+ if (allowMerge[0])
+ m_entropyCoder.codeSaoMerge(0);
+ if (allowMerge[1])
+ m_entropyCoder.codeSaoMerge(0);
+ for (int plane = 0; plane < 3; plane++)
+ {
+ if (saoParam->bSaoFlag[plane > 0]) // index 0 for plane 0, index 1 for planes 1 and 2
+ m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
+ }
+
+ uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
+ double bestCost = mergeDist[0] + (double)rate; // baseline cost: new parameters
+ m_entropyCoder.store(m_rdContexts.temp); // "temp" holds the coder state of the best candidate so far
+
+ // Cost of each merge candidate
+ for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx) // mergeIdx 0 = left, 1 = up
+ {
+ if (!allowMerge[mergeIdx])
+ continue;
+
+ m_entropyCoder.load(m_rdContexts.cur);
+ m_entropyCoder.resetBits();
+ if (allowMerge[0])
+ m_entropyCoder.codeSaoMerge(1 - mergeIdx); // left-merge flag: 1 when testing left, 0 when testing up
+ if (allowMerge[1] && (mergeIdx == 1))
+ m_entropyCoder.codeSaoMerge(1); // up-merge flag is only coded when testing up
+
+ rate = m_entropyCoder.getNumberOfWrittenBits();
+ double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
+ if (mergeCost < bestCost) // strict comparison: on a tie the earlier candidate is kept
+ {
+ SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
+ bestCost = mergeCost;
+ m_entropyCoder.store(m_rdContexts.temp);
+ for (int plane = 0; plane < 3; plane++)
+ {
+ mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode; // the candidate is tagged for every plane, but only copied for planes whose flag is on
+ if (saoParam->bSaoFlag[plane > 0])
+ copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
+ }
+ }
+ }
+
+ if (saoParam->ctuParam[0][addr].typeIdx < 0) // a negative typeIdx is counted in m_numNoSao[0]
+ m_numNoSao[0]++;
+ if (saoParam->ctuParam[1][addr].typeIdx < 0) // plane 1 alone drives m_numNoSao[1]; plane 2 is not checked here
+ m_numNoSao[1]++;
+ m_entropyCoder.load(m_rdContexts.temp); // commit the winning coder state: temp -> cur
+ m_entropyCoder.store(m_rdContexts.cur);
+ }
+}
+
/** rate distortion optimization of SAO unit */
inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
{
diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/sao.h
--- a/source/encoder/sao.h Mon Dec 07 12:06:38 2015 -0600
+++ b/source/encoder/sao.h Mon Dec 07 12:06:41 2015 -0600
@@ -62,6 +62,7 @@
enum { NUM_EDGETYPE = 5 };
enum { NUM_PLANE = 3 };
enum { NUM_MERGE_MODE = 3 };
+ enum { SAO_DEPTHRATE_SIZE = 4 };
static const uint32_t s_eoTable[NUM_EDGETYPE];
@@ -79,7 +80,7 @@
PerPlane* m_countPreDblk;
PerPlane* m_offsetOrgPreDblk;
- double m_depthSaoRate[2][4];
+ double* m_depthSaoRate;
int8_t m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
int8_t m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
@@ -149,6 +150,7 @@
void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+ void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
friend class FrameFilter;
};
More information about the x265-devel
mailing list