[x265] [PATCH 05 of 24] move SAO into class ParallelFilter and modify it to row based
Deepthi Nandakumar
deepthi at multicorewareinc.com
Fri Dec 11 11:25:31 CET 2015
Thanks - very nicely done!
On Tue, Dec 8, 2015 at 5:24 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1449511560 21600
> # Node ID c68eec7fb242748363ec985937b20ed1aff73f02
> # Parent 3542d3abd018491d6ad67a79b0e6d05b604d3818
> move SAO into class ParallelFilter and modify it to row based
> ---
> source/common/common.h | 1 +
> source/encoder/frameencoder.cpp | 36 +++++++-------
> source/encoder/framefilter.cpp | 95
> +++++++++++++++++++++++++-------------
> source/encoder/framefilter.h | 14 +++---
> source/encoder/sao.cpp | 81 ++++++++++++++++++++++++---------
> source/encoder/sao.h | 7 ++-
> 6 files changed, 151 insertions(+), 83 deletions(-)
>
> diff -r 3542d3abd018 -r c68eec7fb242 source/common/common.h
> --- a/source/common/common.h Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/common/common.h Mon Dec 07 12:06:00 2015 -0600
> @@ -215,6 +215,7 @@
>
> #define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) *
> (count))
> #define X265_FREE(ptr) x265_free(ptr)
> +#define X265_FREE_ZERO(ptr) x265_free(ptr); (ptr) = NULL
> #define CHECKED_MALLOC(var, type, count) \
> { \
> var = (type*)x265_malloc(sizeof(type) * (count)); \
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:00 2015 -0600
> @@ -1093,7 +1093,7 @@
>
> /* SAO parameter estimation using non-deblocked pixels for CTU
> bottom and right boundary areas */
> if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
> - m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col,
> row);
> +
> m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame,
> col, row);
>
> /* Deblock with idle threading */
> if (m_param->bEnableLoopFilter)
> @@ -1103,24 +1103,24 @@
> if (row > 0)
> {
> // Waitting last threading finish
> - m_frameFilter.m_pdeblock[row - 1].waitForExit();
> + m_frameFilter.m_parallelFilter[row - 1].waitForExit();
>
> // Processing new group
> - const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) :
> col);
> - m_frameFilter.m_pdeblock[row -
> 1].m_allowedCol.set(allowCol);
> - m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
> + const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col)
> : col);
> + m_frameFilter.m_parallelFilter[row -
> 1].m_allowedCol.set(allowCol);
> + m_frameFilter.m_parallelFilter[row -
> 1].tryBondPeers(*this, 1);
> }
>
> // Last Row may start early
> if (row == m_numRows - 1)
> {
> // Waitting last threading finish
> - m_frameFilter.m_pdeblock[row].waitForExit();
> + m_frameFilter.m_parallelFilter[row].waitForExit();
>
> // Processing last row
> - const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) :
> col);
> - m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol);
> - m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1);
> + const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col)
> : col);
> +
> m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
> + m_frameFilter.m_parallelFilter[row].tryBondPeers(*this,
> 1);
> }
> }
>
> @@ -1188,17 +1188,17 @@
> if (m_param->bEnableLoopFilter & (row > 0))
> {
> /* TODO: Multiple Threading */
> - m_frameFilter.m_pdeblock[row - 1].waitForExit();
> + m_frameFilter.m_parallelFilter[row - 1].waitForExit();
>
> /* Check to avoid previous row process slower than current
> row */
> if (row >= 2)
> {
> - int prevCol = m_frameFilter.m_pdeblock[row -
> 2].m_lastCol.get();
> + int prevCol = m_frameFilter.m_parallelFilter[row -
> 2].m_lastCol.get();
> while(prevCol != (int)numCols)
> - prevCol = m_frameFilter.m_pdeblock[row -
> 2].m_lastCol.waitForChange(prevCol);
> + prevCol = m_frameFilter.m_parallelFilter[row -
> 2].m_lastCol.waitForChange(prevCol);
> }
> - m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
> - m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
> + m_frameFilter.m_parallelFilter[row -
> 1].m_allowedCol.set(numCols);
> + m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
> }
>
> /* trigger row-wise loop filters */
> @@ -1217,12 +1217,12 @@
> /* TODO: Early start last row */
> if (m_param->bEnableLoopFilter)
> {
> - X265_CHECK(m_frameFilter.m_pdeblock[row -
> 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
> + X265_CHECK(m_frameFilter.m_parallelFilter[row -
> 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
>
> /* NOTE: Last Row not execute before, so didn't need wait
> */
> - m_frameFilter.m_pdeblock[row].waitForExit();
> - m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
> - m_frameFilter.m_pdeblock[row].processTasks(-1);
> + m_frameFilter.m_parallelFilter[row].waitForExit();
> +
> m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
> + m_frameFilter.m_parallelFilter[row].processTasks(-1);
> }
>
> for (uint32_t i = m_numRows - m_filterRowDelay; i <
> m_numRows; i++)
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/framefilter.cpp Mon Dec 07 12:06:00 2015 -0600
> @@ -35,19 +35,22 @@
> static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride,
> uint32_t width, uint32_t height);
> static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2,
> intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t&
> cnt);
>
> -uint32_t FrameFilter::ParallelDeblock::numCols = 0;
> +uint32_t FrameFilter::ParallelFilter::numCols = 0;
>
> void FrameFilter::destroy()
> {
> - if (m_param->bEnableSAO)
> - m_sao.destroy();
> -
> X265_FREE(m_ssimBuf);
>
> - if (m_pdeblock)
> + if (m_parallelFilter)
> {
> - delete[] m_pdeblock;
> - m_pdeblock = NULL;
> + if (m_param->bEnableSAO)
> + {
> + for(int row = 0; row < m_numRows; row++)
> + m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
> + }
> +
> + delete[] m_parallelFilter;
> + m_parallelFilter = NULL;
> }
> }
>
> @@ -63,50 +66,65 @@
> m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
> m_lastHeight = m_param->sourceHeight % g_maxCUSize ?
> m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
>
> - if (m_param->bEnableSAO)
> - if (!m_sao.create(m_param))
> - m_param->bEnableSAO = 0;
> -
> if (m_param->bEnableSsim)
> m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
>
> if (m_param->bEnableLoopFilter)
> - m_pdeblock = new ParallelDeblock[numRows];
> + m_parallelFilter = new ParallelFilter[numRows];
>
> - if (m_pdeblock)
> + if (m_parallelFilter)
> {
> + if (m_param->bEnableSAO)
> + {
> + for(int row = 0; row < numRows; row++)
> + {
> + if (!m_parallelFilter[row].m_sao.create(m_param, (row ==
> 0 ? 1 : 0)))
> + m_param->bEnableSAO = 0;
> + else
> + {
> + if (row != 0)
> +
> m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
> + }
> +
> + }
> + }
> +
> for(int row = 0; row < numRows; row++)
> {
> - m_pdeblock[row].m_rowAddr = row * numCols;
> - m_pdeblock[row].m_frameEncoder = m_frameEncoder;
> + m_parallelFilter[row].m_rowAddr = row * numCols;
> + m_parallelFilter[row].m_frameEncoder = m_frameEncoder;
> }
> }
>
> // Setting maximum columns
> - ParallelDeblock::numCols = numCols;
> + ParallelFilter::numCols = numCols;
> }
>
> void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
> {
> m_frame = frame;
>
> - if (m_param->bEnableSAO)
> - m_sao.startSlice(frame, initState, qp);
> -
> - // Reset Deblock Data Struct
> - if (m_pdeblock)
> + // Reset Filter Data Struct
> + if (m_parallelFilter)
> {
> for(int row = 0; row < m_numRows; row++)
> {
> - m_pdeblock[row].m_lastCol.set(0);
> - m_pdeblock[row].m_allowedCol.set(0);
> - m_pdeblock[row].m_encData = frame->m_encData;
> + if (m_param->bEnableSAO)
> + m_parallelFilter[row].m_sao.startSlice(frame, initState,
> qp);
> +
> + m_parallelFilter[row].m_lastCol.set(0);
> + m_parallelFilter[row].m_allowedCol.set(0);
> + m_parallelFilter[row].m_encData = frame->m_encData;
> }
> +
> + // Reset SAO global/common statistics
> + if (m_param->bEnableSAO)
> + m_parallelFilter[0].m_sao.resetStats();
> }
> }
>
> // NOTE: Single Threading only
> -void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
> +void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
> {
> const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> @@ -160,11 +178,11 @@
> SAOParam* saoParam = encData.m_saoParam;
> if (m_param->bEnableSAO)
> {
> - m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
> - m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
> - m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
> +
> m_parallelFilter[row].m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
> +
> m_parallelFilter[row].m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
> +
> m_parallelFilter[row].m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
>
> - m_sao.rdoSaoUnitRow(saoParam, row);
> + m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row);
>
> // NOTE: Delay a row because SAO decide need top row pixels at
> next row, is it HM's bug?
> if (row >= m_saoRowDelay)
> @@ -180,7 +198,7 @@
> {
> if (m_param->bEnableSAO)
> {
> - m_sao.rdoSaoUnitRowEnd(saoParam,
> encData.m_slice->m_sps->numCUsInFrame);
> + m_parallelFilter[row].m_sao.rdoSaoUnitRowEnd(saoParam,
> encData.m_slice->m_sps->numCUsInFrame);
>
> for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
> processSao(i);
> @@ -489,12 +507,23 @@
> SAOParam* saoParam = encData.m_saoParam;
>
> if (saoParam->bSaoFlag[0])
> - m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
> + {
> +
> m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row,
> 0);
> + if (row != m_numRows - 1)
> + {
> + memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[0],
> m_parallelFilter[row].m_sao.m_tmpU1[0], sizeof(pixel) *
> m_param->sourceWidth);
> + }
> + }
>
> if (saoParam->bSaoFlag[1])
> {
> - m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
> - m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
> +
> m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row,
> 1);
> +
> m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row,
> 2);
> + if (row != m_numRows - 1)
> + {
> + memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[1],
> m_parallelFilter[row].m_sao.m_tmpU1[1], sizeof(pixel) *
> m_param->sourceWidth);
> + memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[2],
> m_parallelFilter[row].m_sao.m_tmpU1[2], sizeof(pixel) *
> m_param->sourceWidth);
> + }
> }
>
> if (encData.m_slice->m_pps->bTransquantBypassEnabled)
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/framefilter.h Mon Dec 07 12:06:00 2015 -0600
> @@ -51,7 +51,6 @@
> int m_vChromaShift;
> int m_pad[2];
>
> - SAO m_sao;
> int m_numRows;
> int m_saoRowDelay;
> int m_lastHeight;
> @@ -59,41 +58,42 @@
> void* m_ssimBuf; /* Temp storage for ssim computation */
>
> #define MAX_PFILTER_CUS (4) /* maximum CUs for every thread */
> - class ParallelDeblock : public BondedTaskGroup, public Deblock
> + class ParallelFilter : public BondedTaskGroup, public Deblock
> {
> public:
> static uint32_t numCols;
> uint32_t m_rowAddr;
> FrameEncoder* m_frameEncoder;
> FrameData* m_encData;
> + SAO m_sao;
> ThreadSafeInteger m_lastCol; /* The column that next
> to process */
> ThreadSafeInteger m_allowedCol; /* The column that
> processed from Encode pipeline */
>
> - ParallelDeblock()
> + ParallelFilter()
> : m_rowAddr(0)
> , m_frameEncoder(NULL)
> , m_encData(NULL)
> {
> }
>
> - ~ParallelDeblock()
> + ~ParallelFilter()
> { }
>
> void processTasks(int workerThreadId);
>
> protected:
>
> - ParallelDeblock operator=(const ParallelDeblock&);
> + ParallelFilter operator=(const ParallelFilter&);
> };
>
> - ParallelDeblock* m_pdeblock;
> + ParallelFilter* m_parallelFilter;
>
> FrameFilter()
> : m_param(NULL)
> , m_frame(NULL)
> , m_frameEncoder(NULL)
> , m_ssimBuf(NULL)
> - , m_pdeblock(NULL)
> + , m_parallelFilter(NULL)
> {
> }
>
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/sao.cpp Mon Dec 07 12:06:00 2015 -0600
> @@ -103,7 +103,7 @@
> m_depthSaoRate[1][3] = 0;
> }
>
> -bool SAO::create(x265_param* param)
> +bool SAO::create(x265_param* param, int initCommon)
> {
> m_param = param;
> m_chromaFormat = param->internalCsp;
> @@ -131,12 +131,24 @@
> m_tmpU2[i] += 1;
> }
>
> - CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
> - CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
> - CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
> + if (initCommon)
> + {
> + CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
> + CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
> + CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
>
> - CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
> - CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
> + CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
> + CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
> + }
> + else
> + {
> + // must initialize these common pointer outside of function
> + m_count = NULL;
> + m_offset = NULL;
> + m_offsetOrg = NULL;
> + m_countPreDblk = NULL;
> + m_offsetOrgPreDblk = NULL;
> + }
>
> m_clipTable = &(m_clipTableBase[rangeExt]);
>
> @@ -155,24 +167,50 @@
> return false;
> }
>
> -void SAO::destroy()
> +void SAO::createFromRootNode(SAO* root)
> {
> - X265_FREE(m_clipTableBase);
> + X265_CHECK(m_count == NULL, "duplicate initialize on m_count");
> + X265_CHECK(m_offset == NULL, "duplicate initialize on m_offset");
> + X265_CHECK(m_offsetOrg == NULL, "duplicate initialize on
> m_offsetOrg");
> + X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on
> m_countPreDblk");
> + X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on
> m_offsetOrgPreDblk");
>
> - X265_FREE(m_tmpL1);
> - X265_FREE(m_tmpL2);
> + m_count = root->m_count;
> + m_offset = root->m_offset;
> + m_offsetOrg = root->m_offsetOrg;
> + m_countPreDblk = root->m_countPreDblk;
> + m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
> +}
> +
> +void SAO::destroy(int destoryCommon)
> +{
> + X265_FREE_ZERO(m_clipTableBase);
> +
> + X265_FREE_ZERO(m_tmpL1);
> + X265_FREE_ZERO(m_tmpL2);
>
> for (int i = 0; i < 3; i++)
> {
> - if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
> - if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
> + if (m_tmpU1[i])
> + {
> + X265_FREE(m_tmpU1[i] - 1);
> + m_tmpU1[i] = NULL;
> + }
> + if (m_tmpU2[i])
> + {
> + X265_FREE(m_tmpU2[i] - 1);
> + m_tmpU2[i] = NULL;
> + }
> }
>
> - X265_FREE(m_count);
> - X265_FREE(m_offset);
> - X265_FREE(m_offsetOrg);
> - X265_FREE(m_countPreDblk);
> - X265_FREE(m_offsetOrgPreDblk);
> + if (destoryCommon)
> + {
> + X265_FREE(m_count);
> + X265_FREE(m_offset);
> + X265_FREE(m_offsetOrg);
> + X265_FREE(m_countPreDblk);
> + X265_FREE(m_offsetOrgPreDblk);
> + }
> }
>
> /* allocate memory for SAO parameters */
> @@ -210,8 +248,6 @@
> break;
> }
>
> - resetStats();
> -
> m_entropyCoder.load(initState);
> m_rdContexts.next.load(initState);
> m_rdContexts.cur.load(initState);
> @@ -586,15 +622,14 @@
> ctuHeight >>= m_vChromaShift;
> }
>
> + int addr = idxY * m_numCuInWidth;
> + pixel* rec = reconPic->getPlaneAddr(plane, addr);
> +
> if (!idxY)
> {
> - pixel* rec = reconPic->m_picOrg[plane];
> memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
> }
>
> - int addr = idxY * m_numCuInWidth;
> - pixel* rec = plane ? reconPic->getChromaAddr(plane, addr) :
> reconPic->getLumaAddr(addr);
> -
> for (int i = 0; i < ctuHeight + 1; i++)
> {
> m_tmpL1[i] = rec[0];
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/sao.h
> --- a/source/encoder/sao.h Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/sao.h Mon Dec 07 12:06:00 2015 -0600
> @@ -120,8 +120,9 @@
>
> SAO();
>
> - bool create(x265_param* param);
> - void destroy();
> + bool create(x265_param* param, int initCommon);
> + void createFromRootNode(SAO *root);
> + void destroy(int destoryCommon);
>
> void allocSaoParam(SAOParam* saoParam) const;
>
> @@ -147,6 +148,8 @@
>
> void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
> void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
> +
> + friend class FrameFilter;
> };
>
> }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20151211/45996bdd/attachment-0001.html>
More information about the x265-devel mailing list