[x265] [PATCH 02 of 24] Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
Min Chen
chenm003 at 163.com
Tue Dec 8 00:54:39 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449511552 21600
# Node ID 6726fba8beab483428949404d6ffbd4f345e9149
# Parent 4f6b549198244291d25d6d2a0208e212960237c1
Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
---
source/common/threading.h | 18 +++++++
source/encoder/frameencoder.cpp | 53 +++++++++++++++++++-
source/encoder/framefilter.cpp | 102 ++++++++++++++++++++++++++------------
source/encoder/framefilter.h | 44 ++++++++++++++++-
4 files changed, 179 insertions(+), 38 deletions(-)
diff -r 4f6b54919824 -r 6726fba8beab source/common/threading.h
--- a/source/common/threading.h Mon Dec 07 12:05:49 2015 -0600
+++ b/source/common/threading.h Mon Dec 07 12:05:52 2015 -0600
@@ -205,6 +205,15 @@
return ret;
}
+ int getIncr(int n = 1)
+ {
+ EnterCriticalSection(&m_cs);
+ int ret = m_val;
+ m_val += n;
+ LeaveCriticalSection(&m_cs);
+ return ret;
+ }
+
void set(int newval)
{
EnterCriticalSection(&m_cs);
@@ -394,6 +403,15 @@
return ret;
}
+ int getIncr(int n = 1)
+ {
+ pthread_mutex_lock(&m_mutex);
+ int ret = m_val;
+ m_val += n;
+ pthread_mutex_unlock(&m_mutex);
+ return ret;
+ }
+
void set(int newval)
{
pthread_mutex_lock(&m_mutex);
diff -r 4f6b54919824 -r 6726fba8beab source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Mon Dec 07 12:05:49 2015 -0600
+++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:05:52 2015 -0600
@@ -124,7 +124,7 @@
m_pool = NULL;
}
- m_frameFilter.init(top, this, numRows);
+ m_frameFilter.init(top, this, numRows, numCols);
// initialize HRD parameters of SPS
if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
@@ -857,7 +857,7 @@
// Called by worker threads
void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
{
- uint32_t row = (uint32_t)intRow;
+ const uint32_t row = (uint32_t)intRow;
CTURow& curRow = m_rows[row];
tld.analysis.m_param = m_param;
@@ -899,7 +899,7 @@
{
ProfileScopeEvent(encodeCTU);
- uint32_t col = curRow.completed;
+ const uint32_t col = curRow.completed;
const uint32_t cuAddr = lineStartCUAddr + col;
CUData* ctu = curEncData.getPicCTU(cuAddr);
ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
@@ -1089,10 +1089,29 @@
}
}
+ // TODO: move Deblock and SAO to before VBV check
+
/* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
+ /* Deblock with idle threading */
+ if (m_param->bEnableLoopFilter)
+ {
+ // TODO: Multiple Threading
+ // Delay ONE row to avoid Intra Prediction Conflict
+ if (row > 0)
+ {
+ // Wait for the last thread to finish
+ m_frameFilter.m_pdeblock[row - 1].waitForExit();
+
+ // Processing new group
+ const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col);
+ m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol);
+ m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
+ }
+ }
+
if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
(!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
{
@@ -1153,6 +1172,23 @@
if (m_param->bEnableWavefront)
{
+ /* Process the remaining Deblock blocks on the current thread */
+ if (m_param->bEnableLoopFilter & (row > 0))
+ {
+ /* TODO: Multiple Threading */
+ m_frameFilter.m_pdeblock[row - 1].waitForExit();
+
+ /* Wait until the previous row has finished, in case it is slower than the current row */
+ if (row >= 2)
+ {
+ int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get();
+ while(prevCol != (int)numCols)
+ prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol);
+ }
+ m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
+ m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
+ }
+
/* trigger row-wise loop filters */
if (row >= m_filterRowDelay)
{
@@ -1163,8 +1199,19 @@
enqueueRowFilter(0);
tryWakeOne();
}
+
if (row == m_numRows - 1)
{
+ /* TODO: Early start last row */
+ if (m_param->bEnableLoopFilter)
+ {
+ X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
+
+ /* NOTE: The last row has not been executed before, so there is no need to wait */
+ m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
+ m_frameFilter.m_pdeblock[row].processTasks(-1);
+ }
+
for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
enableRowFilter(i);
tryWakeOne();
diff -r 4f6b54919824 -r 6726fba8beab source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp Mon Dec 07 12:05:49 2015 -0600
+++ b/source/encoder/framefilter.cpp Mon Dec 07 12:05:52 2015 -0600
@@ -35,13 +35,7 @@
static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
-FrameFilter::FrameFilter()
- : m_param(NULL)
- , m_frame(NULL)
- , m_frameEncoder(NULL)
- , m_ssimBuf(NULL)
-{
-}
+uint32_t FrameFilter::ParallelDeblock::numCols = 0;
void FrameFilter::destroy()
{
@@ -49,9 +43,15 @@
m_sao.destroy();
X265_FREE(m_ssimBuf);
+
+ if (m_pdeblock)
+ {
+ delete[] m_pdeblock;
+ m_pdeblock = NULL;
+ }
}
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
{
m_param = top->m_param;
m_frameEncoder = frame;
@@ -69,6 +69,21 @@
if (m_param->bEnableSsim)
m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
+
+ if (m_param->bEnableLoopFilter)
+ m_pdeblock = new ParallelDeblock[numRows];
+
+ if (m_pdeblock)
+ {
+ for(int row = 0; row < numRows; row++)
+ {
+ m_pdeblock[row].m_rowAddr = row * numCols;
+ m_pdeblock[row].m_frameEncoder = m_frameEncoder;
+ }
+ }
+
+ // Setting maximum columns
+ ParallelDeblock::numCols = numCols;
}
void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
@@ -77,6 +92,52 @@
if (m_param->bEnableSAO)
m_sao.startSlice(frame, initState, qp);
+
+ // Reset Deblock Data Struct
+ if (m_pdeblock)
+ {
+ for(int row = 0; row < m_numRows; row++)
+ {
+ m_pdeblock[row].m_lastCol.set(0);
+ m_pdeblock[row].m_allowedCol.set(0);
+ m_pdeblock[row].m_encData = frame->m_encData;
+ }
+ }
+}
+
+// NOTE: Single Threading only
+void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
+{
+ const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+ const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+ const int colStart = m_lastCol.get();
+ // TODO: Wait for the previous row to finish, or simply clip to it?
+ const int colEnd = m_allowedCol.get();
+
+ // Avoid threading conflict
+ if (colStart >= colEnd)
+ return;
+
+ for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++)
+ {
+ const uint32_t cuAddr = m_rowAddr + col;
+ const CUData* ctu = m_encData->getPicCTU(cuAddr);
+ deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
+
+ if (col > 0)
+ {
+ const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
+ deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+ }
+ m_lastCol.incr();
+ }
+
+ if (colEnd == (int)numCols)
+ {
+ const uint32_t cuAddr = m_rowAddr + numCols - 1;
+ const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
+ deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
+ }
}
void FrameFilter::processRow(int row)
@@ -94,30 +155,6 @@
return;
}
FrameData& encData = *m_frame->m_encData;
- const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
- const uint32_t lineStartCUAddr = row * numCols;
-
- if (m_param->bEnableLoopFilter)
- {
- const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
- const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
-
- for (uint32_t col = 0; col < numCols; col++)
- {
- uint32_t cuAddr = lineStartCUAddr + col;
- const CUData* ctu = encData.getPicCTU(cuAddr);
- deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
-
- if (col > 0)
- {
- const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
- deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
- }
- }
-
- const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
- deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
- }
// SAO
SAOParam* saoParam = encData.m_saoParam;
@@ -476,3 +513,4 @@
}
}
}
+
diff -r 4f6b54919824 -r 6726fba8beab source/encoder/framefilter.h
--- a/source/encoder/framefilter.h Mon Dec 07 12:05:49 2015 -0600
+++ b/source/encoder/framefilter.h Mon Dec 07 12:05:52 2015 -0600
@@ -29,6 +29,7 @@
#include "frame.h"
#include "deblock.h"
#include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
namespace X265_NS {
// private x265 namespace
@@ -39,7 +40,7 @@
struct ThreadLocalData;
// Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
{
public:
@@ -57,9 +58,46 @@
void* m_ssimBuf; /* Temp storage for ssim computation */
- FrameFilter();
+#define MAX_PFILTER_CUS (4) /* maximum CUs for every thread */
+ class ParallelDeblock : public BondedTaskGroup, public Deblock
+ {
+ public:
+ static uint32_t numCols;
+ uint32_t m_rowAddr;
+ FrameEncoder* m_frameEncoder;
+ FrameData* m_encData;
+ ThreadSafeInteger m_lastCol; /* The next column to process */
+ ThreadSafeInteger m_allowedCol; /* Column limit (exclusive) granted by the Encode pipeline */
- void init(Encoder *top, FrameEncoder *frame, int numRows);
+ ParallelDeblock()
+ : m_rowAddr(0)
+ , m_frameEncoder(NULL)
+ , m_encData(NULL)
+ {
+ }
+
+ ~ParallelDeblock()
+ { }
+
+ void processTasks(int workerThreadId);
+
+ protected:
+
+ ParallelDeblock operator=(const ParallelDeblock&);
+ };
+
+ ParallelDeblock* m_pdeblock;
+
+ FrameFilter()
+ : m_param(NULL)
+ , m_frame(NULL)
+ , m_frameEncoder(NULL)
+ , m_ssimBuf(NULL)
+ , m_pdeblock(NULL)
+ {
+ }
+
+ void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
void destroy();
void start(Frame *pic, Entropy& initState, int qp);
More information about the x265-devel
mailing list