[x265] [PATCH 4 of 6] Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism

Min Chen chenm003 at 163.com
Wed Nov 18 18:14:07 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1447865931 21600
# Node ID bf3171a0d20a175268ed987c6f93feb07229562e
# Parent  d964074180e715a49fe094319f4071931fde5fa3
Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
---
 source/common/threading.h       |   18 +++++++
 source/encoder/frameencoder.cpp |   53 +++++++++++++++++++-
 source/encoder/framefilter.cpp  |  102 ++++++++++++++++++++++++++------------
 source/encoder/framefilter.h    |   44 ++++++++++++++++-
 4 files changed, 179 insertions(+), 38 deletions(-)

diff -r d964074180e7 -r bf3171a0d20a source/common/threading.h
--- a/source/common/threading.h	Wed Nov 18 10:58:48 2015 -0600
+++ b/source/common/threading.h	Wed Nov 18 10:58:51 2015 -0600
@@ -205,6 +205,15 @@
         return ret;
     }
 
+    int getIncr(const int n = 1) // post-increment: adds n to m_val and returns the value held before the add
+    {
+        EnterCriticalSection(&m_cs); // the read and the update below both happen while m_cs is held
+        int ret = m_val; // snapshot taken before the increment
+        m_val += n;
+        LeaveCriticalSection(&m_cs);
+        return ret; // the old value, not the incremented one
+    }
+
     void set(int newval)
     {
         EnterCriticalSection(&m_cs);
@@ -394,6 +403,15 @@
         return ret;
     }
 
+    int getIncr(const int n = 1) // post-increment: adds n to m_val and returns the value held before the add
+    {
+        pthread_mutex_lock(&m_mutex); // the read and the update below both happen while m_mutex is held
+        int ret = m_val; // snapshot taken before the increment
+        m_val += n;
+        pthread_mutex_unlock(&m_mutex);
+        return ret; // the old value, not the incremented one
+    }
+
     void set(int newval)
     {
         pthread_mutex_lock(&m_mutex);
diff -r d964074180e7 -r bf3171a0d20a source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Wed Nov 18 10:58:48 2015 -0600
+++ b/source/encoder/frameencoder.cpp	Wed Nov 18 10:58:51 2015 -0600
@@ -124,7 +124,7 @@
         m_pool = NULL;
     }
 
-    m_frameFilter.init(top, this, numRows);
+    m_frameFilter.init(top, this, numRows, numCols);
 
     // initialize HRD parameters of SPS
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
@@ -857,7 +857,7 @@
 // Called by worker threads
 void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
 {
-    uint32_t row = (uint32_t)intRow;
+    const uint32_t row = (uint32_t)intRow;
     CTURow& curRow = m_rows[row];
 
     tld.analysis.m_param = m_param;
@@ -899,7 +899,7 @@
     {
         ProfileScopeEvent(encodeCTU);
 
-        uint32_t col = curRow.completed;
+        const uint32_t col = curRow.completed;
         const uint32_t cuAddr = lineStartCUAddr + col;
         CUData* ctu = curEncData.getPicCTU(cuAddr);
         ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
@@ -1089,10 +1089,29 @@
             }
         }
 
+        // TODO: move Deblock and SAO to before VBV check
+
         /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
         if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
             m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
 
+        /* Deblock with idle threading */
+        if (m_param->bEnableLoopFilter)
+        {
+            // TODO: Multiple Threading
+            // Delay ONE row to avoid Intra Prediction Conflict
+            if (row > 0)
+            {
+                // Wait for the last thread to finish
+                m_frameFilter.m_pdeblock[row - 1].waitForExit();
+
+                // Processing new group
+                const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col);
+                m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol);
+                m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
+            }
+        }
+
         if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
             (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
         {
@@ -1153,6 +1172,23 @@
 
     if (m_param->bEnableWavefront)
     {
+        /* Process the remaining Deblock columns on the current thread */
+        if (m_param->bEnableLoopFilter & (row > 0))
+        {
+            /* TODO: Multiple Threading */
+            m_frameFilter.m_pdeblock[row - 1].waitForExit();
+
+            /* Wait in case the previous row is being processed more slowly than the current row */
+            if (row >= 2)
+            {
+                int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get();
+                while(prevCol != (int)numCols)
+                    prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol);
+            }
+            m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
+            m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
+        }
+
         /* trigger row-wise loop filters */
         if (row >= m_filterRowDelay)
         {
@@ -1163,8 +1199,19 @@
                 enqueueRowFilter(0);
             tryWakeOne();
         }
+
         if (row == m_numRows - 1)
         {
+            /* TODO: Early start last row */
+            if (m_param->bEnableLoopFilter)
+            {
+                X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
+
+                /* NOTE: The last row has not been executed before, so there is no need to wait */
+                m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
+                m_frameFilter.m_pdeblock[row].processTasks(-1);
+            }
+
             for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
                 enableRowFilter(i);
             tryWakeOne();
diff -r d964074180e7 -r bf3171a0d20a source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Wed Nov 18 10:58:48 2015 -0600
+++ b/source/encoder/framefilter.cpp	Wed Nov 18 10:58:51 2015 -0600
@@ -35,13 +35,7 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
-FrameFilter::FrameFilter()
-    : m_param(NULL)
-    , m_frame(NULL)
-    , m_frameEncoder(NULL)
-    , m_ssimBuf(NULL)
-{
-}
+uint32_t FrameFilter::ParallelDeblock::numCols = 0;
 
 void FrameFilter::destroy()
 {
@@ -49,9 +43,15 @@
         m_sao.destroy();
 
     X265_FREE(m_ssimBuf);
+
+    if (m_pdeblock)
+    {
+        delete[] m_pdeblock;
+        m_pdeblock = NULL;
+    }
 }
 
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
 {
     m_param = top->m_param;
     m_frameEncoder = frame;
@@ -69,6 +69,21 @@
 
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
+
+    if (m_param->bEnableLoopFilter)
+        m_pdeblock = new ParallelDeblock[numRows];
+
+    if (m_pdeblock)
+    {
+        for(int row = 0; row < numRows; row++)
+        {
+            m_pdeblock[row].m_rowAddr = row * numCols;
+            m_pdeblock[row].m_frameEncoder = m_frameEncoder;
+        }
+    }
+
+    // Setting maximum columns
+    ParallelDeblock::numCols = numCols;
 }
 
 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
@@ -77,6 +92,52 @@
 
     if (m_param->bEnableSAO)
         m_sao.startSlice(frame, initState, qp);
+
+    // Reset Deblock Data Struct
+    if (m_pdeblock)
+    {
+        for(int row = 0; row < m_numRows; row++)
+        {
+            m_pdeblock[row].m_lastCol.set(0);
+            m_pdeblock[row].m_allowedCol.set(0);
+            m_pdeblock[row].m_encData = frame->m_encData;
+        }
+    }
+}
+
+// Deblocks the columns [m_lastCol, m_allowedCol) of this row. NOTE: Single Threading only
+void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
+{
+    const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+    const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+    const int colStart = m_lastCol.get(); // next column to process
+    // TODO: Wait for the previous row to finish, or simply clip against it?
+    const int colEnd = m_allowedCol.get(); // one past the last column this call may touch
+
+    // Avoid threading conflict: nothing to do unless m_allowedCol is ahead of m_lastCol
+    if (colStart >= colEnd)
+        return;
+
+    for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++)
+    {
+        const uint32_t cuAddr = m_rowAddr + col; // m_rowAddr is row * numCols, set in FrameFilter::init
+        const CUData* ctu = m_encData->getPicCTU(cuAddr);
+        deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER); // EDGE_VER pass on the current CTU
+
+        if (col > 0) // column 0 has no CTU to its left
+        {
+            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
+            deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); // EDGE_HOR pass trails one CTU behind
+        }
+        m_lastCol.incr(); // advance the counter that the encoder rows read via get()/waitForChange()
+    }
+
+    if (colEnd == (int)numCols) // whole row allowed: the last CTU still lacks its EDGE_HOR pass
+    {
+        const uint32_t cuAddr = m_rowAddr + numCols - 1;
+        const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
+        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
+    }
+}
 
 void FrameFilter::processRow(int row)
@@ -94,30 +155,6 @@
         return;
     }
     FrameData& encData = *m_frame->m_encData;
-    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
-    const uint32_t lineStartCUAddr = row * numCols;
-
-    if (m_param->bEnableLoopFilter)
-    {
-        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
-        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
-
-        for (uint32_t col = 0; col < numCols; col++)
-        {
-            uint32_t cuAddr = lineStartCUAddr + col;
-            const CUData* ctu = encData.getPicCTU(cuAddr);
-            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
-
-            if (col > 0)
-            {
-                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
-                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
-            }
-        }
-
-        const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
-        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
-    }
 
     // SAO
     SAOParam* saoParam = encData.m_saoParam;
@@ -476,3 +513,4 @@
         }
     }
 }
+
diff -r d964074180e7 -r bf3171a0d20a source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Wed Nov 18 10:58:48 2015 -0600
+++ b/source/encoder/framefilter.h	Wed Nov 18 10:58:51 2015 -0600
@@ -29,6 +29,7 @@
 #include "frame.h"
 #include "deblock.h"
 #include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
 
 namespace X265_NS {
 // private x265 namespace
@@ -39,7 +40,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
 {
 public:
 
@@ -57,9 +58,46 @@
     
     void*         m_ssimBuf; /* Temp storage for ssim computation */
 
-    FrameFilter();
+#define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
+    class ParallelDeblock : public BondedTaskGroup, public Deblock
+    {
+    public:
+        static uint32_t     numCols;
+        uint32_t            m_rowAddr;
+        FrameEncoder*       m_frameEncoder;
+        FrameData*          m_encData;
+        ThreadSafeInteger   m_lastCol;          /* The next column to process */
+        ThreadSafeInteger   m_allowedCol;       /* Columns before this one have been released by the Encode pipeline */
 
-    void init(Encoder *top, FrameEncoder *frame, int numRows);
+        ParallelDeblock()
+            : m_rowAddr(0)
+            , m_frameEncoder(NULL)
+            , m_encData(NULL)
+        {
+        }
+
+        ~ParallelDeblock()
+        { }
+
+        void processTasks(int workerThreadId);
+
+    protected:
+
+        ParallelDeblock operator=(const ParallelDeblock&);
+    };
+
+    ParallelDeblock*    m_pdeblock;
+
+    FrameFilter() // every pointer member starts out NULL
+        : m_param(NULL)
+        , m_frame(NULL)
+        , m_frameEncoder(NULL)
+        , m_ssimBuf(NULL)
+        , m_pdeblock(NULL) // init() allocates it only when bEnableLoopFilter is set; destroy() tests it before delete[]
+    {
+    }
+
+    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
     void destroy();
 
     void start(Frame *pic, Entropy& initState, int qp);



More information about the x265-devel mailing list