[x265] [PATCH 2 of 2 demo only] Parallelism filter (Deblock) on idle threads
Min Chen
chenm003 at 163.com
Fri Nov 13 02:35:02 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1447278442 21600
# Node ID 92bc95c580b339c17cfc223d57643ab49dc46084
# Parent 3702727fbe1817c6d55f40a9b215d8711d8c9c44
Parallelism filter (Deblock) on idle threads
---
source/common/threading.h | 8 +++++
source/encoder/framefilter.cpp | 67 ++++++++++++++++++++++++++++------------
source/encoder/framefilter.h | 43 ++++++++++++++++++++++++-
3 files changed, 96 insertions(+), 22 deletions(-)
diff -r 3702727fbe18 -r 92bc95c580b3 source/common/threading.h
--- a/source/common/threading.h Wed Nov 11 11:35:08 2015 -0600
+++ b/source/common/threading.h Wed Nov 11 15:47:22 2015 -0600
@@ -205,6 +205,14 @@
return ret;
}
+ int getIncr() // Post-increment under lock: bumps m_val by one and returns the value it held beforehand
+ {
+ EnterCriticalSection(&m_cs); // hold m_cs so the read and the increment are not interleaved with other m_cs holders
+ int ret = m_val++; // ret receives the pre-increment value
+ LeaveCriticalSection(&m_cs);
+ return ret;
+ }
+
void set(int newval)
{
EnterCriticalSection(&m_cs);
diff -r 3702727fbe18 -r 92bc95c580b3 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp Wed Nov 11 11:35:08 2015 -0600
+++ b/source/encoder/framefilter.cpp Wed Nov 11 15:47:22 2015 -0600
@@ -35,14 +35,6 @@
static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
-FrameFilter::FrameFilter()
- : m_param(NULL)
- , m_frame(NULL)
- , m_frameEncoder(NULL)
- , m_ssimBuf(NULL)
-{
-}
-
void FrameFilter::destroy()
{
if (m_param->bEnableSAO)
@@ -79,6 +71,31 @@
m_sao.startSlice(frame, initState, qp);
}
+void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/) // Repeatedly claims a group of up to NUM_PARALLEL_DEBLOCK_CUS columns in this CTU row and deblocks it, until no group is left
+{
+ FrameData& encData = *(master.m_frame->m_encData);
+
+ while((uint32_t)lastGrpId.get() < m_totalGrps) // NOTE: this check and the claim below are two separate calls, so another caller may claim in between
+ {
+ const uint32_t colStart = lastGrpId.getIncr() * NUM_PARALLEL_DEBLOCK_CUS; // claim the next group index; a claimed index >= m_totalGrps gives colStart >= m_numCols and the loop below runs zero times
+ const uint32_t colLast = X265_MIN(colStart + NUM_PARALLEL_DEBLOCK_CUS, m_numCols); // one past the group's last column, clamped to the row width
+
+ for (uint32_t col = colStart; col < colLast; col++)
+ {
+ const uint32_t cuAddr = m_lineRowCUAddr + col;
+ const CUData* ctu = encData.getPicCTU(cuAddr);
+ deblockCTU(ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]], EDGE_VER); // every CTU of the group gets its EDGE_VER pass here
+
+ // EDGE_HOR is applied to the previous CTU only; the group's first and last CTU are skipped here (processRow() runs EDGE_HOR on them afterwards) to avoid conflicts with the neighboring groups
+ if (col > colStart + 1) // i.e. (col - 1) is not the first CTU of the group
+ {
+ const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
+ deblockCTU(ctuPrev, m_cuGeoms[m_ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+ }
+ }
+ }
+}
+
void FrameFilter::processRow(int row)
{
ProfileScopeEvent(filterCTURow);
@@ -96,27 +113,37 @@
FrameData& encData = *m_frame->m_encData;
const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
const uint32_t lineStartCUAddr = row * numCols;
+ const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+ const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
if (m_param->bEnableLoopFilter)
{
- const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
- const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+ ParallelFilter blkFilter(*this, lineStartCUAddr, numCols, cuGeoms, ctuGeomMap);
+ if (encData.m_jobProvider)
+ blkFilter.tryBondPeers(*encData.m_jobProvider, blkFilter.m_totalGrps);
- for (uint32_t col = 0; col < numCols; col++)
+ /* Process tasks on the current thread as well */
+ blkFilter.processTasks(-1);
+
+ /* Wait for all BlockGroups to finish */
+ blkFilter.waitForExit();
+
+ /* Process the first and last CTU of every BlockGroup with EDGE_HOR (the last BlockGroup may not be full, so its last CTU is clamped to the end of the row) */
+ CUData* ctu;
+ for (uint32_t col = 0; col < numCols; col += NUM_PARALLEL_DEBLOCK_CUS)
{
- uint32_t cuAddr = lineStartCUAddr + col;
- const CUData* ctu = encData.getPicCTU(cuAddr);
- deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
+ const uint32_t cuAddr0 = lineStartCUAddr + col;
+ ctu = encData.getPicCTU(cuAddr0);
+ blkFilter.deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr0]], Deblock::EDGE_HOR);
- if (col > 0)
+ /* Process the last CU of this BlockGroup, unless the group contains only one CU (already handled above) */
+ const uint32_t cuAddrN = lineStartCUAddr + X265_MIN(col + NUM_PARALLEL_DEBLOCK_CUS, numCols) - 1;
+ if (cuAddr0 != cuAddrN)
{
- const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
- deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+ ctu = encData.getPicCTU(cuAddrN);
+ blkFilter.deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddrN]], Deblock::EDGE_HOR);
}
}
-
- const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
- deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
}
// SAO
diff -r 3702727fbe18 -r 92bc95c580b3 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h Wed Nov 11 11:35:08 2015 -0600
+++ b/source/encoder/framefilter.h Wed Nov 11 15:47:22 2015 -0600
@@ -29,6 +29,7 @@
#include "frame.h"
#include "deblock.h"
#include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
namespace X265_NS {
// private x265 namespace
@@ -39,7 +40,7 @@
struct ThreadLocalData;
// Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
{
public:
@@ -57,7 +58,45 @@
void* m_ssimBuf; /* Temp storage for ssim computation */
- FrameFilter();
+#define NUM_PARALLEL_DEBLOCK_CUS (4) /* maximum CUs for every thread */
+ class ParallelFilter : public BondedTaskGroup, public Deblock // Deblock job for one CTU row: the row is split into column groups that processTasks() claims and filters
+ {
+ public:
+ ThreadSafeInteger lastGrpId; // index of the next column group to claim; presumably starts at 0 via ThreadSafeInteger's constructor - TODO confirm
+ FrameFilter& master; // owning FrameFilter, used to reach m_frame->m_encData
+ const uint32_t m_numCols; // number of CTU columns in the row
+ const uint32_t m_totalGrps; // number of column groups: m_numCols / NUM_PARALLEL_DEBLOCK_CUS, rounded up
+ const CUGeom* m_cuGeoms; // geometry table, indexed through m_ctuGeomMap[cuAddr]
+ const uint32_t* m_ctuGeomMap; // maps a CTU address to an index into m_cuGeoms
+ const uint32_t m_lineRowCUAddr; // CTU address of the first column in the row being filtered
+
+ ParallelFilter(FrameFilter& fe, uint32_t lineRowCUAddr, uint32_t numCols, const CUGeom* cuGeoms, const uint32_t *ctuGeomMap) // stores the row description; performs no filtering itself
+ : master(fe)
+ , m_numCols(numCols)
+ , m_totalGrps((numCols + NUM_PARALLEL_DEBLOCK_CUS - 1) / NUM_PARALLEL_DEBLOCK_CUS) // ceiling division, so a partial last group is counted
+ , m_cuGeoms(cuGeoms)
+ , m_ctuGeomMap(ctuGeomMap)
+ , m_lineRowCUAddr(lineRowCUAddr)
+ { }
+
+ ~ParallelFilter()
+ { }
+
+ void processTasks(int workerThreadId); // claims and deblocks column groups until none remain; the thread id argument is unused by the implementation
+
+ protected:
+
+ ParallelFilter operator=(const ParallelFilter&); // NOTE(review): declared only (no definition in this patch), presumably to forbid assignment; returns by value where ParallelFilter& is conventional
+ };
+
+ FrameFilter() // Default constructor, now defined inline in the header: sets m_param, m_frame, m_frameEncoder and m_ssimBuf to NULL
+ : m_param(NULL)
+ , m_frame(NULL)
+ , m_frameEncoder(NULL)
+ , m_ssimBuf(NULL)
+ {
+ }
+
void init(Encoder *top, FrameEncoder *frame, int numRows);
void destroy();
More information about the x265-devel
mailing list