[x265] [PATCH 4 of 6] Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
Steve Borho
steve at borho.org
Wed Nov 18 18:56:34 CET 2015
On 11/18, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1447865931 21600
> # Node ID bf3171a0d20a175268ed987c6f93feb07229562e
> # Parent d964074180e715a49fe094319f4071931fde5fa3
> Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
> ---
> source/common/threading.h | 18 +++++++
> source/encoder/frameencoder.cpp | 53 +++++++++++++++++++-
> source/encoder/framefilter.cpp | 102 ++++++++++++++++++++++++++------------
> source/encoder/framefilter.h | 44 ++++++++++++++++-
> 4 files changed, 179 insertions(+), 38 deletions(-)
>
> diff -r d964074180e7 -r bf3171a0d20a source/common/threading.h
> --- a/source/common/threading.h Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/common/threading.h Wed Nov 18 10:58:51 2015 -0600
> @@ -205,6 +205,15 @@
> return ret;
> }
>
> + int getIncr(const int n = 1) // add n to m_val while holding m_cs; returns the value held before the add
we don't use const on integer arguments
> + {
> + EnterCriticalSection(&m_cs);
> + int ret = m_val; // snapshot the pre-increment value inside the critical section
> + m_val += n;
> + LeaveCriticalSection(&m_cs);
> + return ret;
> + }
> +
> void set(int newval)
> {
> EnterCriticalSection(&m_cs);
> @@ -394,6 +403,15 @@
> return ret;
> }
>
> + int getIncr(const int n = 1) // add n to m_val while holding m_mutex; returns the value held before the add
> + {
> + pthread_mutex_lock(&m_mutex);
> + int ret = m_val; // snapshot the pre-increment value while the mutex is held
> + m_val += n;
> + pthread_mutex_unlock(&m_mutex);
> + return ret;
> + }
> +
> void set(int newval)
> {
> pthread_mutex_lock(&m_mutex);
> diff -r d964074180e7 -r bf3171a0d20a source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/encoder/frameencoder.cpp Wed Nov 18 10:58:51 2015 -0600
> @@ -124,7 +124,7 @@
> m_pool = NULL;
> }
>
> - m_frameFilter.init(top, this, numRows);
> + m_frameFilter.init(top, this, numRows, numCols);
>
> // initialize HRD parameters of SPS
> if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
> @@ -857,7 +857,7 @@
> // Called by worker threads
> void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
> {
> - uint32_t row = (uint32_t)intRow;
> + const uint32_t row = (uint32_t)intRow;
> CTURow& curRow = m_rows[row];
>
> tld.analysis.m_param = m_param;
> @@ -899,7 +899,7 @@
> {
> ProfileScopeEvent(encodeCTU);
>
> - uint32_t col = curRow.completed;
> + const uint32_t col = curRow.completed;
> const uint32_t cuAddr = lineStartCUAddr + col;
> CUData* ctu = curEncData.getPicCTU(cuAddr);
> ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
> @@ -1089,10 +1089,29 @@
> }
> }
>
> + // TODO: move Deblock and SAO to before VBV check
> +
> /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
> if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
> m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
>
> + /* Deblock with idle threading */
> + if (m_param->bEnableLoopFilter)
> + {
> + // TODO: Multiple Threading
> + // Delay ONE row to avoid Intra Prediction Conflict
> + if (row > 0)
> + {
> + // Wait for the last deblock thread to finish
> + m_frameFilter.m_pdeblock[row - 1].waitForExit();
> +
> + // Processing new group
> + const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col);
> + m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol);
> + m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
> + }
> + }
> +
> if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
> (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
> {
> @@ -1153,6 +1172,23 @@
>
> if (m_param->bEnableWavefront)
> {
> + /* Process the remaining Deblock columns on the current thread */
> + if (m_param->bEnableLoopFilter & (row > 0))
> + {
> + /* TODO: Multiple Threading */
> + m_frameFilter.m_pdeblock[row - 1].waitForExit();
> +
> + /* Wait for the previous row in case it is being processed slower than the current row */
> + if (row >= 2)
> + {
> + int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get();
> + while(prevCol != (int)numCols)
> + prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol);
> + }
> + m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
> + m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
> + }
> +
> /* trigger row-wise loop filters */
> if (row >= m_filterRowDelay)
> {
> @@ -1163,8 +1199,19 @@
> enqueueRowFilter(0);
> tryWakeOne();
> }
> +
> if (row == m_numRows - 1)
> {
> + /* TODO: Early start last row */
> + if (m_param->bEnableLoopFilter)
> + {
> + X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
> +
> + /* NOTE: the last row has not been executed before, so there is no need to wait */
> + m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
> + m_frameFilter.m_pdeblock[row].processTasks(-1);
> + }
> +
> for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
> enableRowFilter(i);
> tryWakeOne();
> diff -r d964074180e7 -r bf3171a0d20a source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/encoder/framefilter.cpp Wed Nov 18 10:58:51 2015 -0600
> @@ -35,13 +35,7 @@
> static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
> static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
>
> -FrameFilter::FrameFilter()
> - : m_param(NULL)
> - , m_frame(NULL)
> - , m_frameEncoder(NULL)
> - , m_ssimBuf(NULL)
> -{
> -}
> +uint32_t FrameFilter::ParallelDeblock::numCols = 0;
>
> void FrameFilter::destroy()
> {
> @@ -49,9 +43,15 @@
> m_sao.destroy();
>
> X265_FREE(m_ssimBuf);
> +
> + if (m_pdeblock)
> + {
> + delete[] m_pdeblock;
> + m_pdeblock = NULL;
> + }
> }
>
> -void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
> +void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
> {
> m_param = top->m_param;
> m_frameEncoder = frame;
> @@ -69,6 +69,21 @@
>
> if (m_param->bEnableSsim)
> m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
> +
> + if (m_param->bEnableLoopFilter)
> + m_pdeblock = new ParallelDeblock[numRows];
> +
> + if (m_pdeblock)
> + {
> + for(int row = 0; row < numRows; row++)
> + {
> + m_pdeblock[row].m_rowAddr = row * numCols;
> + m_pdeblock[row].m_frameEncoder = m_frameEncoder;
> + }
> + }
> +
> + // Setting maximum columns
> + ParallelDeblock::numCols = numCols;
> }
>
> void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
> @@ -77,6 +92,52 @@
>
> if (m_param->bEnableSAO)
> m_sao.startSlice(frame, initState, qp);
> +
> + // Reset Deblock Data Struct
> + if (m_pdeblock)
> + {
> + for(int row = 0; row < m_numRows; row++)
> + {
> + m_pdeblock[row].m_lastCol.set(0);
> + m_pdeblock[row].m_allowedCol.set(0);
> + m_pdeblock[row].m_encData = frame->m_encData;
> + }
> + }
> +}
> +
> +// Deblock CTU columns [m_lastCol, m_allowedCol) of this row. NOTE: single-threaded only
> +void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
> +{
> + const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> + const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> + const int colStart = m_lastCol.get(); // first column that has not been processed yet
> + // TODO: wait for the previous row to finish, or simply clip against it?
> + const int colEnd = m_allowedCol.get(); // process up to, but not including, this column
> +
> + // Nothing new to process; this also avoids a threading conflict
> + if (colStart >= colEnd)
> + return;
> +
> + for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++)
> + {
> + const uint32_t cuAddr = m_rowAddr + col;
> + const CUData* ctu = m_encData->getPicCTU(cuAddr);
> + deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
> +
> + if (col > 0) // the EDGE_HOR pass runs on the previous CTU, one column behind EDGE_VER
> + {
> + const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
> + deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
> + }
> + m_lastCol.incr(); // record that this column is done
> + }
> +
> + if (colEnd == (int)numCols) // whole row allowed: run the EDGE_HOR pass on the last CTU of the row
> + {
> + const uint32_t cuAddr = m_rowAddr + numCols - 1;
> + const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
> + deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
> + }
> }
>
> void FrameFilter::processRow(int row)
> @@ -94,30 +155,6 @@
> return;
> }
> FrameData& encData = *m_frame->m_encData;
> - const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
> - const uint32_t lineStartCUAddr = row * numCols;
> -
> - if (m_param->bEnableLoopFilter)
> - {
> - const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> - const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> -
> - for (uint32_t col = 0; col < numCols; col++)
> - {
> - uint32_t cuAddr = lineStartCUAddr + col;
> - const CUData* ctu = encData.getPicCTU(cuAddr);
> - deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
> -
> - if (col > 0)
> - {
> - const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
> - deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
> - }
> - }
> -
> - const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
> - deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
> - }
>
> // SAO
> SAOParam* saoParam = encData.m_saoParam;
> @@ -476,3 +513,4 @@
> }
> }
> }
> +
> diff -r d964074180e7 -r bf3171a0d20a source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/encoder/framefilter.h Wed Nov 18 10:58:51 2015 -0600
> @@ -29,6 +29,7 @@
> #include "frame.h"
> #include "deblock.h"
> #include "sao.h"
> +#include "threadpool.h" // class BondedTaskGroup
>
> namespace X265_NS {
> // private x265 namespace
> @@ -39,7 +40,7 @@
> struct ThreadLocalData;
>
> // Manages the processing of a single frame loopfilter
> -class FrameFilter : public Deblock
> +class FrameFilter
> {
> public:
>
> @@ -57,9 +58,46 @@
>
> void* m_ssimBuf; /* Temp storage for ssim computation */
>
> - FrameFilter();
> +#define MAX_PFILTER_CUS (4) /* maximum CUs for every thread */
> + class ParallelDeblock : public BondedTaskGroup, public Deblock // per-row deblocking task state
> + {
> + public:
> + static uint32_t numCols; // shared by all instances; assigned in FrameFilter::init()
> + uint32_t m_rowAddr; // address of the first CTU of this row (row * numCols)
> + FrameEncoder* m_frameEncoder;
> + FrameData* m_encData; // refreshed for every frame in FrameFilter::start()
> + ThreadSafeInteger m_lastCol; /* The next column to process */
> + ThreadSafeInteger m_allowedCol; /* Columns below this index may be processed; set from the encode pipeline */
> +
> + ParallelDeblock()
> + : m_rowAddr(0)
> + , m_frameEncoder(NULL)
> + , m_encData(NULL)
> + {
> + }
> +
> + ~ParallelDeblock()
> + { }
> +
> + void processTasks(int workerThreadId); // deblocks columns [m_lastCol, m_allowedCol) of this row
> +
> + protected:
> +
> + ParallelDeblock operator=(const ParallelDeblock&); // NOTE(review): presumably declared only to forbid assignment; would conventionally return ParallelDeblock& - confirm
> + };
> +
> +
> + ParallelDeblock* m_pdeblock;
> +
> + FrameFilter() // every pointer member starts as NULL; m_pdeblock is allocated later in init()
> + : m_param(NULL)
> + , m_frame(NULL)
> + , m_frameEncoder(NULL)
> + , m_ssimBuf(NULL)
> + , m_pdeblock(NULL)
> + {
> + }
> +
> + void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
> void destroy();
>
> void start(Frame *pic, Entropy& initState, int qp);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list