[x265] [PATCH 2 of 2 demo only] Parallelize the deblocking filter on idle threads

Min Chen chenm003 at 163.com
Fri Nov 13 02:35:02 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1447278442 21600
# Node ID 92bc95c580b339c17cfc223d57643ab49dc46084
# Parent  3702727fbe1817c6d55f40a9b215d8711d8c9c44
Parallelize the deblocking filter on idle threads
---
 source/common/threading.h      |    8 +++++
 source/encoder/framefilter.cpp |   67 ++++++++++++++++++++++++++++------------
 source/encoder/framefilter.h   |   43 ++++++++++++++++++++++++-
 3 files changed, 96 insertions(+), 22 deletions(-)

diff -r 3702727fbe18 -r 92bc95c580b3 source/common/threading.h
--- a/source/common/threading.h	Wed Nov 11 11:35:08 2015 -0600
+++ b/source/common/threading.h	Wed Nov 11 15:47:22 2015 -0600
@@ -205,6 +205,14 @@
         return ret;
     }
 
+    int getIncr()                        /* post-increment: adds 1 to m_val and returns the value it held before */
+    {
+        EnterCriticalSection(&m_cs);     /* the read-modify-write below runs inside the m_cs critical section */
+        int ret = m_val++;               /* capture the old value before the section is left */
+        LeaveCriticalSection(&m_cs);
+        return ret;
+    }
+
     void set(int newval)
     {
         EnterCriticalSection(&m_cs);
diff -r 3702727fbe18 -r 92bc95c580b3 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Wed Nov 11 11:35:08 2015 -0600
+++ b/source/encoder/framefilter.cpp	Wed Nov 11 15:47:22 2015 -0600
@@ -35,14 +35,6 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
-FrameFilter::FrameFilter()
-    : m_param(NULL)
-    , m_frame(NULL)
-    , m_frameEncoder(NULL)
-    , m_ssimBuf(NULL)
-{
-}
-
 void FrameFilter::destroy()
 {
     if (m_param->bEnableSAO)
@@ -79,6 +71,31 @@
         m_sao.startSlice(frame, initState, qp);
 }
 
+void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
+{
+    /* Deblock BlockGroups of one CTU row until every group has been claimed; the worker thread id is not used */
+    FrameData& encData = *(master.m_frame->m_encData);
+    uint32_t grpId;
+    /* Each getIncr() hands out a distinct group id, so a group is claimed exactly once; an id >= m_totalGrps means all are taken */
+    while ((grpId = (uint32_t)lastGrpId.getIncr()) < m_totalGrps)
+    {
+        const uint32_t colStart = grpId * NUM_PARALLEL_DEBLOCK_CUS;
+        const uint32_t colLast  = X265_MIN(colStart + NUM_PARALLEL_DEBLOCK_CUS, m_numCols);
+        for (uint32_t col = colStart; col < colLast; col++)
+        {
+            const uint32_t cuAddr = m_lineRowCUAddr + col;
+            const CUData* ctu = encData.getPicCTU(cuAddr);
+            deblockCTU(ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
+            // EDGE_HOR only for interior CUs: the first and last CU of a group border a neighbor group and are left to processRow()
+            if (col > colStart + 1)
+            {
+                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
+                deblockCTU(ctuPrev, m_cuGeoms[m_ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+            }
+        }
+    }
+}
+
 void FrameFilter::processRow(int row)
 {
     ProfileScopeEvent(filterCTURow);
@@ -96,27 +113,37 @@
     FrameData& encData = *m_frame->m_encData;
     const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
     const uint32_t lineStartCUAddr = row * numCols;
+    const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+    const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
 
     if (m_param->bEnableLoopFilter)
     {
-        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
-        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+        ParallelFilter blkFilter(*this, lineStartCUAddr, numCols, cuGeoms, ctuGeomMap);
+        if (encData.m_jobProvider)
+            blkFilter.tryBondPeers(*encData.m_jobProvider, blkFilter.m_totalGrps);
 
-        for (uint32_t col = 0; col < numCols; col++)
+        /* Process BlockGroups on the current thread as well */
+        blkFilter.processTasks(-1);
+
+        /* Wait until all BlockGroups have finished */
+        blkFilter.waitForExit();
+
+        /* Apply EDGE_HOR to the first and last CU of every BlockGroup, which processTasks() skipped (the last BlockGroup may be partial) */
+        CUData* ctu;
+        for (uint32_t col = 0; col < numCols; col += NUM_PARALLEL_DEBLOCK_CUS)
         {
-            uint32_t cuAddr = lineStartCUAddr + col;
-            const CUData* ctu = encData.getPicCTU(cuAddr);
-            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
+            const uint32_t cuAddr0 = lineStartCUAddr + col;
+            ctu = encData.getPicCTU(cuAddr0);
+            blkFilter.deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr0]], Deblock::EDGE_HOR);
 
-            if (col > 0)
+            /* Process the last CU of this BlockGroup, unless the group holds a single CU */
+            const uint32_t cuAddrN = lineStartCUAddr + X265_MIN(col + NUM_PARALLEL_DEBLOCK_CUS, numCols) - 1;
+            if (cuAddr0 != cuAddrN)
             {
-                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
-                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+                ctu = encData.getPicCTU(cuAddrN);
+                blkFilter.deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddrN]], Deblock::EDGE_HOR);
             }
         }
-
-        const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
-        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
     }
 
     // SAO
diff -r 3702727fbe18 -r 92bc95c580b3 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Wed Nov 11 11:35:08 2015 -0600
+++ b/source/encoder/framefilter.h	Wed Nov 11 15:47:22 2015 -0600
@@ -29,6 +29,7 @@
 #include "frame.h"
 #include "deblock.h"
 #include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
 
 namespace X265_NS {
 // private x265 namespace
@@ -39,7 +40,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
 {
 public:
 
@@ -57,7 +58,45 @@
     
     void*         m_ssimBuf; /* Temp storage for ssim computation */
 
-    FrameFilter();
+#define NUM_PARALLEL_DEBLOCK_CUS    (4) /* maximum CUs for every thread */
+    /* Deblocks one CTU row in BlockGroups of NUM_PARALLEL_DEBLOCK_CUS CTUs; groups are claimed through lastGrpId in processTasks() */
+    class ParallelFilter : public BondedTaskGroup, public Deblock
+    {
+    public:
+        ThreadSafeInteger lastGrpId;       /* index of the next BlockGroup to hand out in processTasks() */
+        FrameFilter& master;               /* FrameFilter whose m_frame is being deblocked */
+        const uint32_t m_numCols;          /* number of CTU columns in the row */
+        const uint32_t m_totalGrps;        /* BlockGroups per row: m_numCols / NUM_PARALLEL_DEBLOCK_CUS, rounded up */
+        const CUGeom* m_cuGeoms;           /* CU geometry table, indexed through m_ctuGeomMap */
+        const uint32_t* m_ctuGeomMap;      /* maps a CTU address to its entry in m_cuGeoms */
+        const uint32_t m_lineRowCUAddr;    /* address of the first CTU of the row */
+
+        ParallelFilter(FrameFilter& fe, uint32_t lineRowCUAddr, uint32_t numCols, const CUGeom* cuGeoms, const uint32_t *ctuGeomMap)
+            : master(fe)
+            , m_numCols(numCols)
+            , m_totalGrps((numCols + NUM_PARALLEL_DEBLOCK_CUS - 1) / NUM_PARALLEL_DEBLOCK_CUS)
+            , m_cuGeoms(cuGeoms)
+            , m_ctuGeomMap(ctuGeomMap)
+            , m_lineRowCUAddr(lineRowCUAddr)
+        { }
+
+        ~ParallelFilter() { }
+
+        void processTasks(int workerThreadId);
+
+    protected:
+        /* Declaration only: assignment is not meant to be used (the class has reference and const members) */
+        ParallelFilter& operator=(const ParallelFilter&);
+    };
+
+    /* Default constructor: clears m_param, m_frame, m_frameEncoder and m_ssimBuf to NULL */
+    FrameFilter()
+        : m_param(NULL)
+        , m_frame(NULL)
+        , m_frameEncoder(NULL)
+        , m_ssimBuf(NULL)
+    { }
+
 
     void init(Encoder *top, FrameEncoder *frame, int numRows);
     void destroy();



More information about the x265-devel mailing list