[x265] [PATCH 05 of 15] move SAO into class ParallelFilter and modify it to row based
Min Chen
chenm003 at 163.com
Wed Dec 2 18:28:28 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449076352 21600
# Node ID eb20b66eebe7e9de04cec0f98f1c3c43e678fcf5
# Parent 2601513575c7511d109ed906626b126d6e4f29fb
move SAO into class ParallelFilter and modify it to row based
---
source/common/common.h | 1 +
source/encoder/frameencoder.cpp | 36 +++++++-------
source/encoder/framefilter.cpp | 95 +++++++++++++++++++++++++-------------
source/encoder/framefilter.h | 14 +++---
source/encoder/sao.cpp | 81 ++++++++++++++++++++++++---------
source/encoder/sao.h | 7 ++-
6 files changed, 151 insertions(+), 83 deletions(-)
diff -r 2601513575c7 -r eb20b66eebe7 source/common/common.h
--- a/source/common/common.h Wed Dec 02 11:12:29 2015 -0600
+++ b/source/common/common.h Wed Dec 02 11:12:32 2015 -0600
@@ -215,6 +215,7 @@
#define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * (count))
#define X265_FREE(ptr) x265_free(ptr)
+#define X265_FREE_ZERO(ptr) do { x265_free(ptr); (ptr) = NULL; } while (0) /* free and null the pointer; do/while(0) keeps both statements together under an unbraced if/else */
#define CHECKED_MALLOC(var, type, count) \
{ \
var = (type*)x265_malloc(sizeof(type) * (count)); \
diff -r 2601513575c7 -r eb20b66eebe7 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Wed Dec 02 11:12:29 2015 -0600
+++ b/source/encoder/frameencoder.cpp Wed Dec 02 11:12:32 2015 -0600
@@ -1093,7 +1093,7 @@
/* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
- m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
+ m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
/* Deblock with idle threading */
if (m_param->bEnableLoopFilter)
@@ -1103,24 +1103,24 @@
if (row > 0)
{
// Waitting last threading finish
- m_frameFilter.m_pdeblock[row - 1].waitForExit();
+ m_frameFilter.m_parallelFilter[row - 1].waitForExit();
// Processing new group
- const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col);
- m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol);
- m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
+ const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) : col);
+ m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
+ m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
}
// Last Row may start early
if (row == m_numRows - 1)
{
// Waitting last threading finish
- m_frameFilter.m_pdeblock[row].waitForExit();
+ m_frameFilter.m_parallelFilter[row].waitForExit();
// Processing last row
- const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) : col);
- m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol);
- m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1);
+ const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) : col);
+ m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
+ m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
}
}
@@ -1188,17 +1188,17 @@
if (m_param->bEnableLoopFilter & (row > 0))
{
/* TODO: Multiple Threading */
- m_frameFilter.m_pdeblock[row - 1].waitForExit();
+ m_frameFilter.m_parallelFilter[row - 1].waitForExit();
/* Check to avoid previous row process slower than current row */
if (row >= 2)
{
- int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get();
+ int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get();
while(prevCol != (int)numCols)
- prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol);
+ prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.waitForChange(prevCol);
}
- m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
- m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
+ m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
+ m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
}
/* trigger row-wise loop filters */
@@ -1217,12 +1217,12 @@
/* TODO: Early start last row */
if (m_param->bEnableLoopFilter)
{
- X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
+ X265_CHECK(m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
/* NOTE: Last Row not execute before, so didn't need wait */
- m_frameFilter.m_pdeblock[row].waitForExit();
- m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
- m_frameFilter.m_pdeblock[row].processTasks(-1);
+ m_frameFilter.m_parallelFilter[row].waitForExit();
+ m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
+ m_frameFilter.m_parallelFilter[row].processTasks(-1);
}
for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
diff -r 2601513575c7 -r eb20b66eebe7 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp Wed Dec 02 11:12:29 2015 -0600
+++ b/source/encoder/framefilter.cpp Wed Dec 02 11:12:32 2015 -0600
@@ -35,19 +35,22 @@
static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
-uint32_t FrameFilter::ParallelDeblock::numCols = 0;
+uint32_t FrameFilter::ParallelFilter::numCols = 0;
void FrameFilter::destroy()
{
- if (m_param->bEnableSAO)
- m_sao.destroy();
-
X265_FREE(m_ssimBuf);
- if (m_pdeblock)
+ if (m_parallelFilter)
{
- delete[] m_pdeblock;
- m_pdeblock = NULL;
+ if (m_param->bEnableSAO)
+ {
+ for(int row = 0; row < m_numRows; row++)
+ m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
+ }
+
+ delete[] m_parallelFilter;
+ m_parallelFilter = NULL;
}
}
@@ -63,50 +66,65 @@
m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
- if (m_param->bEnableSAO)
- if (!m_sao.create(m_param))
- m_param->bEnableSAO = 0;
-
if (m_param->bEnableSsim)
m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
if (m_param->bEnableLoopFilter)
- m_pdeblock = new ParallelDeblock[numRows];
+ m_parallelFilter = new ParallelFilter[numRows];
- if (m_pdeblock)
+ if (m_parallelFilter)
{
+ if (m_param->bEnableSAO)
+ {
+ for(int row = 0; row < numRows; row++)
+ {
+ if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
+ m_param->bEnableSAO = 0;
+ else
+ {
+ if (row != 0)
+ m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
+ }
+
+ }
+ }
+
for(int row = 0; row < numRows; row++)
{
- m_pdeblock[row].m_rowAddr = row * numCols;
- m_pdeblock[row].m_frameEncoder = m_frameEncoder;
+ m_parallelFilter[row].m_rowAddr = row * numCols;
+ m_parallelFilter[row].m_frameEncoder = m_frameEncoder;
}
}
// Setting maximum columns
- ParallelDeblock::numCols = numCols;
+ ParallelFilter::numCols = numCols;
}
void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
{
m_frame = frame;
- if (m_param->bEnableSAO)
- m_sao.startSlice(frame, initState, qp);
-
- // Reset Deblock Data Struct
- if (m_pdeblock)
+ // Reset Filter Data Struct
+ if (m_parallelFilter)
{
for(int row = 0; row < m_numRows; row++)
{
- m_pdeblock[row].m_lastCol.set(0);
- m_pdeblock[row].m_allowedCol.set(0);
- m_pdeblock[row].m_encData = frame->m_encData;
+ if (m_param->bEnableSAO)
+ m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
+
+ m_parallelFilter[row].m_lastCol.set(0);
+ m_parallelFilter[row].m_allowedCol.set(0);
+ m_parallelFilter[row].m_encData = frame->m_encData;
}
+
+ // Reset SAO global/common statistics
+ if (m_param->bEnableSAO)
+ m_parallelFilter[0].m_sao.resetStats();
}
}
// NOTE: Single Threading only
-void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
+void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
{
const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
@@ -160,11 +178,11 @@
SAOParam* saoParam = encData.m_saoParam;
if (m_param->bEnableSAO)
{
- m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
- m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
- m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
+ m_parallelFilter[row].m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
+ m_parallelFilter[row].m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
+ m_parallelFilter[row].m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
- m_sao.rdoSaoUnitRow(saoParam, row);
+ m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row);
// NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
if (row >= m_saoRowDelay)
@@ -180,7 +198,7 @@
{
if (m_param->bEnableSAO)
{
- m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
+ m_parallelFilter[row].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
processSao(i);
@@ -489,12 +507,23 @@
SAOParam* saoParam = encData.m_saoParam;
if (saoParam->bSaoFlag[0])
- m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
+ {
+ m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
+ if (row != m_numRows - 1)
+ {
+ memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[0], m_parallelFilter[row].m_sao.m_tmpU1[0], sizeof(pixel) * m_param->sourceWidth);
+ }
+ }
if (saoParam->bSaoFlag[1])
{
- m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
- m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
+ m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
+ m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
+ if (row != m_numRows - 1)
+ {
+ memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[1], m_parallelFilter[row].m_sao.m_tmpU1[1], sizeof(pixel) * m_param->sourceWidth);
+ memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[2], m_parallelFilter[row].m_sao.m_tmpU1[2], sizeof(pixel) * m_param->sourceWidth);
+ }
}
if (encData.m_slice->m_pps->bTransquantBypassEnabled)
diff -r 2601513575c7 -r eb20b66eebe7 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h Wed Dec 02 11:12:29 2015 -0600
+++ b/source/encoder/framefilter.h Wed Dec 02 11:12:32 2015 -0600
@@ -51,7 +51,6 @@
int m_vChromaShift;
int m_pad[2];
- SAO m_sao;
int m_numRows;
int m_saoRowDelay;
int m_lastHeight;
@@ -59,41 +58,42 @@
void* m_ssimBuf; /* Temp storage for ssim computation */
#define MAX_PFILTER_CUS (4) /* maximum CUs for every thread */
- class ParallelDeblock : public BondedTaskGroup, public Deblock
+ class ParallelFilter : public BondedTaskGroup, public Deblock
{
public:
static uint32_t numCols;
uint32_t m_rowAddr;
FrameEncoder* m_frameEncoder;
FrameData* m_encData;
+ SAO m_sao;
ThreadSafeInteger m_lastCol; /* The column that next to process */
ThreadSafeInteger m_allowedCol; /* The column that processed from Encode pipeline */
- ParallelDeblock()
+ ParallelFilter()
: m_rowAddr(0)
, m_frameEncoder(NULL)
, m_encData(NULL)
{
}
- ~ParallelDeblock()
+ ~ParallelFilter()
{ }
void processTasks(int workerThreadId);
protected:
- ParallelDeblock operator=(const ParallelDeblock&);
+ ParallelFilter operator=(const ParallelFilter&);
};
- ParallelDeblock* m_pdeblock;
+ ParallelFilter* m_parallelFilter;
FrameFilter()
: m_param(NULL)
, m_frame(NULL)
, m_frameEncoder(NULL)
, m_ssimBuf(NULL)
- , m_pdeblock(NULL)
+ , m_parallelFilter(NULL)
{
}
diff -r 2601513575c7 -r eb20b66eebe7 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Wed Dec 02 11:12:29 2015 -0600
+++ b/source/encoder/sao.cpp Wed Dec 02 11:12:32 2015 -0600
@@ -103,7 +103,7 @@
m_depthSaoRate[1][3] = 0;
}
-bool SAO::create(x265_param* param)
+bool SAO::create(x265_param* param, int initCommon)
{
m_param = param;
m_chromaFormat = param->internalCsp;
@@ -131,12 +131,24 @@
m_tmpU2[i] += 1;
}
- CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
- CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
- CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
+ if (initCommon)
+ {
+ CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
+ CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
+ CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
- CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
- CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+ CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
+ CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+ }
+ else
+ {
+ // these shared (common) pointers are assigned outside this function, by createFromRootNode()
+ m_count = NULL;
+ m_offset = NULL;
+ m_offsetOrg = NULL;
+ m_countPreDblk = NULL;
+ m_offsetOrgPreDblk = NULL;
+ }
m_clipTable = &(m_clipTableBase[rangeExt]);
@@ -155,24 +167,50 @@
return false;
}
-void SAO::destroy()
+void SAO::createFromRootNode(SAO* root)
{
- X265_FREE(m_clipTableBase);
+ X265_CHECK(m_count == NULL, "duplicate initialize on m_count");
+ X265_CHECK(m_offset == NULL, "duplicate initialize on m_offset");
+ X265_CHECK(m_offsetOrg == NULL, "duplicate initialize on m_offsetOrg");
+ X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
+ X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
- X265_FREE(m_tmpL1);
- X265_FREE(m_tmpL2);
+ m_count = root->m_count;
+ m_offset = root->m_offset;
+ m_offsetOrg = root->m_offsetOrg;
+ m_countPreDblk = root->m_countPreDblk;
+ m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
+}
+
+void SAO::destroy(int destoryCommon)
+{
+ X265_FREE_ZERO(m_clipTableBase);
+
+ X265_FREE_ZERO(m_tmpL1);
+ X265_FREE_ZERO(m_tmpL2);
for (int i = 0; i < 3; i++)
{
- if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
- if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
+ if (m_tmpU1[i])
+ {
+ X265_FREE(m_tmpU1[i] - 1);
+ m_tmpU1[i] = NULL;
+ }
+ if (m_tmpU2[i])
+ {
+ X265_FREE(m_tmpU2[i] - 1);
+ m_tmpU2[i] = NULL;
+ }
}
- X265_FREE(m_count);
- X265_FREE(m_offset);
- X265_FREE(m_offsetOrg);
- X265_FREE(m_countPreDblk);
- X265_FREE(m_offsetOrgPreDblk);
+ if (destoryCommon)
+ {
+ X265_FREE(m_count);
+ X265_FREE(m_offset);
+ X265_FREE(m_offsetOrg);
+ X265_FREE(m_countPreDblk);
+ X265_FREE(m_offsetOrgPreDblk);
+ }
}
/* allocate memory for SAO parameters */
@@ -210,8 +248,6 @@
break;
}
- resetStats();
-
m_entropyCoder.load(initState);
m_rdContexts.next.load(initState);
m_rdContexts.cur.load(initState);
@@ -586,15 +622,14 @@
ctuHeight >>= m_vChromaShift;
}
+ int addr = idxY * m_numCuInWidth;
+ pixel* rec = reconPic->getPlaneAddr(plane, addr);
+
if (!idxY)
{
- pixel* rec = reconPic->m_picOrg[plane];
memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
}
- int addr = idxY * m_numCuInWidth;
- pixel* rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr);
-
for (int i = 0; i < ctuHeight + 1; i++)
{
m_tmpL1[i] = rec[0];
diff -r 2601513575c7 -r eb20b66eebe7 source/encoder/sao.h
--- a/source/encoder/sao.h Wed Dec 02 11:12:29 2015 -0600
+++ b/source/encoder/sao.h Wed Dec 02 11:12:32 2015 -0600
@@ -120,8 +120,9 @@
SAO();
- bool create(x265_param* param);
- void destroy();
+ bool create(x265_param* param, int initCommon);
+ void createFromRootNode(SAO *root);
+ void destroy(int destoryCommon);
void allocSaoParam(SAOParam* saoParam) const;
@@ -147,6 +148,8 @@
void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+
+ friend class FrameFilter;
};
}
More information about the x265-devel
mailing list