[x265] [PATCH 4 of 6] Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism

Steve Borho steve at borho.org
Wed Nov 18 18:56:34 CET 2015


On 11/18, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1447865931 21600
> # Node ID bf3171a0d20a175268ed987c6f93feb07229562e
> # Parent  d964074180e715a49fe094319f4071931fde5fa3
> Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
> ---
>  source/common/threading.h       |   18 +++++++
>  source/encoder/frameencoder.cpp |   53 +++++++++++++++++++-
>  source/encoder/framefilter.cpp  |  102 ++++++++++++++++++++++++++------------
>  source/encoder/framefilter.h    |   44 ++++++++++++++++-
>  4 files changed, 179 insertions(+), 38 deletions(-)
> 
> diff -r d964074180e7 -r bf3171a0d20a source/common/threading.h
> --- a/source/common/threading.h	Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/common/threading.h	Wed Nov 18 10:58:51 2015 -0600
> @@ -205,6 +205,15 @@
>          return ret;
>      }
>  
> +    int getIncr(const int n = 1)

We don't use const on integer arguments.

> +    {
> +        EnterCriticalSection(&m_cs);
> +        int ret = m_val;
> +        m_val += n;
> +        LeaveCriticalSection(&m_cs);
> +        return ret;
> +    }
> +
>      void set(int newval)
>      {
>          EnterCriticalSection(&m_cs);
> @@ -394,6 +403,15 @@
>          return ret;
>      }
>  
> +    int getIncr(const int n = 1)
> +    {
> +        pthread_mutex_lock(&m_mutex);
> +        int ret = m_val;
> +        m_val += n;
> +        pthread_mutex_unlock(&m_mutex);
> +        return ret;
> +    }
> +
>      void set(int newval)
>      {
>          pthread_mutex_lock(&m_mutex);
> diff -r d964074180e7 -r bf3171a0d20a source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp	Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/encoder/frameencoder.cpp	Wed Nov 18 10:58:51 2015 -0600
> @@ -124,7 +124,7 @@
>          m_pool = NULL;
>      }
>  
> -    m_frameFilter.init(top, this, numRows);
> +    m_frameFilter.init(top, this, numRows, numCols);
>  
>      // initialize HRD parameters of SPS
>      if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
> @@ -857,7 +857,7 @@
>  // Called by worker threads
>  void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
>  {
> -    uint32_t row = (uint32_t)intRow;
> +    const uint32_t row = (uint32_t)intRow;
>      CTURow& curRow = m_rows[row];
>  
>      tld.analysis.m_param = m_param;
> @@ -899,7 +899,7 @@
>      {
>          ProfileScopeEvent(encodeCTU);
>  
> -        uint32_t col = curRow.completed;
> +        const uint32_t col = curRow.completed;
>          const uint32_t cuAddr = lineStartCUAddr + col;
>          CUData* ctu = curEncData.getPicCTU(cuAddr);
>          ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
> @@ -1089,10 +1089,29 @@
>              }
>          }
>  
> +        // TODO: move Deblock and SAO to before VBV check
> +
>          /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
>          if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
>              m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
>  
> +        /* Deblock with idle threading */
> +        if (m_param->bEnableLoopFilter)
> +        {
> +            // TODO: Multiple Threading
> +            // Delay ONE row to avoid Intra Prediction Conflict
> +            if (row > 0)
> +            {
> +                // Waitting last threading finish
> +                m_frameFilter.m_pdeblock[row - 1].waitForExit();
> +
> +                // Processing new group
> +                const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col);
> +                m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol);
> +                m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
> +            }
> +        }
> +
>          if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
>              (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
>          {
> @@ -1153,6 +1172,23 @@
>  
>      if (m_param->bEnableWavefront)
>      {
> +        /* Processing left Deblock block with current threading */
> +        if (m_param->bEnableLoopFilter & (row > 0))
> +        {
> +            /* TODO: Multiple Threading */
> +            m_frameFilter.m_pdeblock[row - 1].waitForExit();
> +
> +            /* Check to avoid previous row process slower than current row */
> +            if (row >= 2)
> +            {
> +                int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get();
> +                while(prevCol != (int)numCols)
> +                    prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol);
> +            }
> +            m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
> +            m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
> +        }
> +
>          /* trigger row-wise loop filters */
>          if (row >= m_filterRowDelay)
>          {
> @@ -1163,8 +1199,19 @@
>                  enqueueRowFilter(0);
>              tryWakeOne();
>          }
> +
>          if (row == m_numRows - 1)
>          {
> +            /* TODO: Early start last row */
> +            if (m_param->bEnableLoopFilter)
> +            {
> +                X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
> +
> +                /* NOTE: Last Row not execute before, so didn't need wait */
> +                m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
> +                m_frameFilter.m_pdeblock[row].processTasks(-1);
> +            }
> +
>              for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
>                  enableRowFilter(i);
>              tryWakeOne();
> diff -r d964074180e7 -r bf3171a0d20a source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp	Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/encoder/framefilter.cpp	Wed Nov 18 10:58:51 2015 -0600
> @@ -35,13 +35,7 @@
>  static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
>  static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
>  
> -FrameFilter::FrameFilter()
> -    : m_param(NULL)
> -    , m_frame(NULL)
> -    , m_frameEncoder(NULL)
> -    , m_ssimBuf(NULL)
> -{
> -}
> +uint32_t FrameFilter::ParallelDeblock::numCols = 0;
>  
>  void FrameFilter::destroy()
>  {
> @@ -49,9 +43,15 @@
>          m_sao.destroy();
>  
>      X265_FREE(m_ssimBuf);
> +
> +    if (m_pdeblock)
> +    {
> +        delete[] m_pdeblock;
> +        m_pdeblock = NULL;
> +    }
>  }
>  
> -void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
> +void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
>  {
>      m_param = top->m_param;
>      m_frameEncoder = frame;
> @@ -69,6 +69,21 @@
>  
>      if (m_param->bEnableSsim)
>          m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
> +
> +    if (m_param->bEnableLoopFilter)
> +        m_pdeblock = new ParallelDeblock[numRows];
> +
> +    if (m_pdeblock)
> +    {
> +        for(int row = 0; row < numRows; row++)
> +        {
> +            m_pdeblock[row].m_rowAddr = row * numCols;
> +            m_pdeblock[row].m_frameEncoder = m_frameEncoder;
> +        }
> +    }
> +
> +    // Setting maximum columns
> +    ParallelDeblock::numCols = numCols;
>  }
>  
>  void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
> @@ -77,6 +92,52 @@
>  
>      if (m_param->bEnableSAO)
>          m_sao.startSlice(frame, initState, qp);
> +
> +    // Reset Deblock Data Struct
> +    if (m_pdeblock)
> +    {
> +        for(int row = 0; row < m_numRows; row++)
> +        {
> +            m_pdeblock[row].m_lastCol.set(0);
> +            m_pdeblock[row].m_allowedCol.set(0);
> +            m_pdeblock[row].m_encData = frame->m_encData;
> +        }
> +    }
> +}
> +
> +// NOTE: Single Threading only
> +void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
> +{
> +    const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> +    const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> +    const int colStart = m_lastCol.get();
> +    // TODO: Waiting previous row finish or simple clip on it?
> +    const int colEnd = m_allowedCol.get();
> +
> +    // Avoid threading conflict
> +    if (colStart >= colEnd)
> +        return;
> +
> +    for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++)
> +    {
> +        const uint32_t cuAddr = m_rowAddr + col;
> +        const CUData* ctu = m_encData->getPicCTU(cuAddr);
> +        deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
> +
> +        if (col > 0)
> +        {
> +            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
> +            deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
> +        }
> +        m_lastCol.incr();
> +    }
> +
> +    if (colEnd == (int)numCols)
> +    {
> +        const uint32_t cuAddr = m_rowAddr + numCols - 1;
> +        const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
> +        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
> +    }
>  }
>  
>  void FrameFilter::processRow(int row)
> @@ -94,30 +155,6 @@
>          return;
>      }
>      FrameData& encData = *m_frame->m_encData;
> -    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
> -    const uint32_t lineStartCUAddr = row * numCols;
> -
> -    if (m_param->bEnableLoopFilter)
> -    {
> -        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> -        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> -
> -        for (uint32_t col = 0; col < numCols; col++)
> -        {
> -            uint32_t cuAddr = lineStartCUAddr + col;
> -            const CUData* ctu = encData.getPicCTU(cuAddr);
> -            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
> -
> -            if (col > 0)
> -            {
> -                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
> -                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
> -            }
> -        }
> -
> -        const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
> -        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
> -    }
>  
>      // SAO
>      SAOParam* saoParam = encData.m_saoParam;
> @@ -476,3 +513,4 @@
>          }
>      }
>  }
> +
> diff -r d964074180e7 -r bf3171a0d20a source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h	Wed Nov 18 10:58:48 2015 -0600
> +++ b/source/encoder/framefilter.h	Wed Nov 18 10:58:51 2015 -0600
> @@ -29,6 +29,7 @@
>  #include "frame.h"
>  #include "deblock.h"
>  #include "sao.h"
> +#include "threadpool.h" // class BondedTaskGroup
>  
>  namespace X265_NS {
>  // private x265 namespace
> @@ -39,7 +40,7 @@
>  struct ThreadLocalData;
>  
>  // Manages the processing of a single frame loopfilter
> -class FrameFilter : public Deblock
> +class FrameFilter
>  {
>  public:
>  
> @@ -57,9 +58,46 @@
>      
>      void*         m_ssimBuf; /* Temp storage for ssim computation */
>  
> -    FrameFilter();
> +#define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
> +    class ParallelDeblock : public BondedTaskGroup, public Deblock
> +    {
> +    public:
> +        static uint32_t     numCols;
> +        uint32_t            m_rowAddr;
> +        FrameEncoder*       m_frameEncoder;
> +        FrameData*          m_encData;
> +        ThreadSafeInteger   m_lastCol;          /* The column that next to process */
> +        ThreadSafeInteger   m_allowedCol;       /* The column that processed from Encode pipeline */
>  
> -    void init(Encoder *top, FrameEncoder *frame, int numRows);
> +        ParallelDeblock()
> +            : m_rowAddr(0)
> +            , m_frameEncoder(NULL)
> +            , m_encData(NULL)
> +        {
> +        }
> +
> +        ~ParallelDeblock()
> +        { }
> +
> +        void processTasks(int workerThreadId);
> +
> +    protected:
> +
> +        ParallelDeblock operator=(const ParallelDeblock&);
> +    };
> +
> +    ParallelDeblock*    m_pdeblock;
> +
> +    FrameFilter()
> +        : m_param(NULL)
> +        , m_frame(NULL)
> +        , m_frameEncoder(NULL)
> +        , m_ssimBuf(NULL)
> +        , m_pdeblock(NULL)
> +    {
> +    }
> +
> +    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
>      void destroy();
>  
>      void start(Frame *pic, Entropy& initState, int qp);
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list