[x265] [PATCH 05 of 24] move SAO into class ParallelFilter and modify it to row based

Deepthi Nandakumar deepthi at multicorewareinc.com
Fri Dec 11 11:25:31 CET 2015


Thanks - very nicely done!

On Tue, Dec 8, 2015 at 5:24 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1449511560 21600
> # Node ID c68eec7fb242748363ec985937b20ed1aff73f02
> # Parent  3542d3abd018491d6ad67a79b0e6d05b604d3818
> move SAO into class ParallelFilter and modify it to row based
> ---
>  source/common/common.h          |    1 +
>  source/encoder/frameencoder.cpp |   36 +++++++-------
>  source/encoder/framefilter.cpp  |   95 +++++++++++++++++++++++++-------------
>  source/encoder/framefilter.h    |   14 +++---
>  source/encoder/sao.cpp          |   81 ++++++++++++++++++++++++---------
>  source/encoder/sao.h            |    7 ++-
>  6 files changed, 151 insertions(+), 83 deletions(-)
>
> diff -r 3542d3abd018 -r c68eec7fb242 source/common/common.h
> --- a/source/common/common.h    Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/common/common.h    Mon Dec 07 12:06:00 2015 -0600
> @@ -215,6 +215,7 @@
>
>  #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) *
> (count))
>  #define X265_FREE(ptr)              x265_free(ptr)
> +#define X265_FREE_ZERO(ptr)         x265_free(ptr); (ptr) = NULL
>  #define CHECKED_MALLOC(var, type, count) \
>      { \
>          var = (type*)x265_malloc(sizeof(type) * (count)); \
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp   Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/frameencoder.cpp   Mon Dec 07 12:06:00 2015 -0600
> @@ -1093,7 +1093,7 @@
>
>          /* SAO parameter estimation using non-deblocked pixels for CTU
> bottom and right boundary areas */
>          if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
> -            m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col,
> row);
> +
> m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame,
> col, row);
>
>          /* Deblock with idle threading */
>          if (m_param->bEnableLoopFilter)
> @@ -1103,24 +1103,24 @@
>              if (row > 0)
>              {
>                  // Waitting last threading finish
> -                m_frameFilter.m_pdeblock[row - 1].waitForExit();
> +                m_frameFilter.m_parallelFilter[row - 1].waitForExit();
>
>                  // Processing new group
> -                const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) :
> col);
> -                m_frameFilter.m_pdeblock[row -
> 1].m_allowedCol.set(allowCol);
> -                m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1);
> +                const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col)
> : col);
> +                m_frameFilter.m_parallelFilter[row -
> 1].m_allowedCol.set(allowCol);
> +                m_frameFilter.m_parallelFilter[row -
> 1].tryBondPeers(*this, 1);
>              }
>
>              // Last Row may start early
>              if (row == m_numRows - 1)
>              {
>                  // Waitting last threading finish
> -                m_frameFilter.m_pdeblock[row].waitForExit();
> +                m_frameFilter.m_parallelFilter[row].waitForExit();
>
>                  // Processing last row
> -                const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) :
> col);
> -                m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol);
> -                m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1);
> +                const int allowCol = ((row >= 2) ?
> X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col)
> : col);
> +
> m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
> +                m_frameFilter.m_parallelFilter[row].tryBondPeers(*this,
> 1);
>              }
>          }
>
> @@ -1188,17 +1188,17 @@
>          if (m_param->bEnableLoopFilter & (row > 0))
>          {
>              /* TODO: Multiple Threading */
> -            m_frameFilter.m_pdeblock[row - 1].waitForExit();
> +            m_frameFilter.m_parallelFilter[row - 1].waitForExit();
>
>              /* Check to avoid previous row process slower than current
> row */
>              if (row >= 2)
>              {
> -                int prevCol = m_frameFilter.m_pdeblock[row -
> 2].m_lastCol.get();
> +                int prevCol = m_frameFilter.m_parallelFilter[row -
> 2].m_lastCol.get();
>                  while(prevCol != (int)numCols)
> -                    prevCol = m_frameFilter.m_pdeblock[row -
> 2].m_lastCol.waitForChange(prevCol);
> +                    prevCol = m_frameFilter.m_parallelFilter[row -
> 2].m_lastCol.waitForChange(prevCol);
>              }
> -            m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols);
> -            m_frameFilter.m_pdeblock[row - 1].processTasks(-1);
> +            m_frameFilter.m_parallelFilter[row -
> 1].m_allowedCol.set(numCols);
> +            m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
>          }
>
>          /* trigger row-wise loop filters */
> @@ -1217,12 +1217,12 @@
>              /* TODO: Early start last row */
>              if (m_param->bEnableLoopFilter)
>              {
> -                X265_CHECK(m_frameFilter.m_pdeblock[row -
> 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
> +                X265_CHECK(m_frameFilter.m_parallelFilter[row -
> 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed");
>
>                  /* NOTE: Last Row not execute before, so didn't need wait
> */
> -                m_frameFilter.m_pdeblock[row].waitForExit();
> -                m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols);
> -                m_frameFilter.m_pdeblock[row].processTasks(-1);
> +                m_frameFilter.m_parallelFilter[row].waitForExit();
> +
> m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols);
> +                m_frameFilter.m_parallelFilter[row].processTasks(-1);
>              }
>
>              for (uint32_t i = m_numRows - m_filterRowDelay; i <
> m_numRows; i++)
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp    Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/framefilter.cpp    Mon Dec 07 12:06:00 2015 -0600
> @@ -35,19 +35,22 @@
>  static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride,
> uint32_t width, uint32_t height);
>  static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2,
> intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t&
> cnt);
>
> -uint32_t FrameFilter::ParallelDeblock::numCols = 0;
> +uint32_t FrameFilter::ParallelFilter::numCols = 0;
>
>  void FrameFilter::destroy()
>  {
> -    if (m_param->bEnableSAO)
> -        m_sao.destroy();
> -
>      X265_FREE(m_ssimBuf);
>
> -    if (m_pdeblock)
> +    if (m_parallelFilter)
>      {
> -        delete[] m_pdeblock;
> -        m_pdeblock = NULL;
> +        if (m_param->bEnableSAO)
> +        {
> +            for(int row = 0; row < m_numRows; row++)
> +                m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
> +        }
> +
> +        delete[] m_parallelFilter;
> +        m_parallelFilter = NULL;
>      }
>  }
>
> @@ -63,50 +66,65 @@
>      m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
>      m_lastHeight = m_param->sourceHeight % g_maxCUSize ?
> m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
>
> -    if (m_param->bEnableSAO)
> -        if (!m_sao.create(m_param))
> -            m_param->bEnableSAO = 0;
> -
>      if (m_param->bEnableSsim)
>          m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
>
>      if (m_param->bEnableLoopFilter)
> -        m_pdeblock = new ParallelDeblock[numRows];
> +        m_parallelFilter = new ParallelFilter[numRows];
>
> -    if (m_pdeblock)
> +    if (m_parallelFilter)
>      {
> +        if (m_param->bEnableSAO)
> +        {
> +            for(int row = 0; row < numRows; row++)
> +            {
> +                if (!m_parallelFilter[row].m_sao.create(m_param, (row ==
> 0 ? 1 : 0)))
> +                    m_param->bEnableSAO = 0;
> +                else
> +                {
> +                    if (row != 0)
> +
> m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
> +                }
> +
> +            }
> +        }
> +
>          for(int row = 0; row < numRows; row++)
>          {
> -            m_pdeblock[row].m_rowAddr = row * numCols;
> -            m_pdeblock[row].m_frameEncoder = m_frameEncoder;
> +            m_parallelFilter[row].m_rowAddr = row * numCols;
> +            m_parallelFilter[row].m_frameEncoder = m_frameEncoder;
>          }
>      }
>
>      // Setting maximum columns
> -    ParallelDeblock::numCols = numCols;
> +    ParallelFilter::numCols = numCols;
>  }
>
>  void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
>  {
>      m_frame = frame;
>
> -    if (m_param->bEnableSAO)
> -        m_sao.startSlice(frame, initState, qp);
> -
> -    // Reset Deblock Data Struct
> -    if (m_pdeblock)
> +    // Reset Filter Data Struct
> +    if (m_parallelFilter)
>      {
>          for(int row = 0; row < m_numRows; row++)
>          {
> -            m_pdeblock[row].m_lastCol.set(0);
> -            m_pdeblock[row].m_allowedCol.set(0);
> -            m_pdeblock[row].m_encData = frame->m_encData;
> +            if (m_param->bEnableSAO)
> +                m_parallelFilter[row].m_sao.startSlice(frame, initState,
> qp);
> +
> +            m_parallelFilter[row].m_lastCol.set(0);
> +            m_parallelFilter[row].m_allowedCol.set(0);
> +            m_parallelFilter[row].m_encData = frame->m_encData;
>          }
> +
> +        // Reset SAO global/common statistics
> +        if (m_param->bEnableSAO)
> +            m_parallelFilter[0].m_sao.resetStats();
>      }
>  }
>
>  // NOTE: Single Threading only
> -void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/)
> +void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
>  {
>      const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
>      const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> @@ -160,11 +178,11 @@
>      SAOParam* saoParam = encData.m_saoParam;
>      if (m_param->bEnableSAO)
>      {
> -        m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
> -        m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
> -        m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
> +
> m_parallelFilter[row].m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
> +
> m_parallelFilter[row].m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
> +
> m_parallelFilter[row].m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
>
> -        m_sao.rdoSaoUnitRow(saoParam, row);
> +        m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row);
>
>          // NOTE: Delay a row because SAO decide need top row pixels at
> next row, is it HM's bug?
>          if (row >= m_saoRowDelay)
> @@ -180,7 +198,7 @@
>      {
>          if (m_param->bEnableSAO)
>          {
> -            m_sao.rdoSaoUnitRowEnd(saoParam,
> encData.m_slice->m_sps->numCUsInFrame);
> +            m_parallelFilter[row].m_sao.rdoSaoUnitRowEnd(saoParam,
> encData.m_slice->m_sps->numCUsInFrame);
>
>              for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
>                  processSao(i);
> @@ -489,12 +507,23 @@
>      SAOParam* saoParam = encData.m_saoParam;
>
>      if (saoParam->bSaoFlag[0])
> -        m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
> +    {
> +
> m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row,
> 0);
> +        if (row != m_numRows - 1)
> +        {
> +            memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[0],
> m_parallelFilter[row].m_sao.m_tmpU1[0], sizeof(pixel) *
> m_param->sourceWidth);
> +        }
> +    }
>
>      if (saoParam->bSaoFlag[1])
>      {
> -        m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
> -        m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
> +
> m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row,
> 1);
> +
> m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row,
> 2);
> +        if (row != m_numRows - 1)
> +        {
> +            memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[1],
> m_parallelFilter[row].m_sao.m_tmpU1[1], sizeof(pixel) *
> m_param->sourceWidth);
> +            memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[2],
> m_parallelFilter[row].m_sao.m_tmpU1[2], sizeof(pixel) *
> m_param->sourceWidth);
> +        }
>      }
>
>      if (encData.m_slice->m_pps->bTransquantBypassEnabled)
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h      Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/framefilter.h      Mon Dec 07 12:06:00 2015 -0600
> @@ -51,7 +51,6 @@
>      int           m_vChromaShift;
>      int           m_pad[2];
>
> -    SAO           m_sao;
>      int           m_numRows;
>      int           m_saoRowDelay;
>      int           m_lastHeight;
> @@ -59,41 +58,42 @@
>      void*         m_ssimBuf; /* Temp storage for ssim computation */
>
>  #define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
> -    class ParallelDeblock : public BondedTaskGroup, public Deblock
> +    class ParallelFilter : public BondedTaskGroup, public Deblock
>      {
>      public:
>          static uint32_t     numCols;
>          uint32_t            m_rowAddr;
>          FrameEncoder*       m_frameEncoder;
>          FrameData*          m_encData;
> +        SAO                 m_sao;
>          ThreadSafeInteger   m_lastCol;          /* The column that next
> to process */
>          ThreadSafeInteger   m_allowedCol;       /* The column that
> processed from Encode pipeline */
>
> -        ParallelDeblock()
> +        ParallelFilter()
>              : m_rowAddr(0)
>              , m_frameEncoder(NULL)
>              , m_encData(NULL)
>          {
>          }
>
> -        ~ParallelDeblock()
> +        ~ParallelFilter()
>          { }
>
>          void processTasks(int workerThreadId);
>
>      protected:
>
> -        ParallelDeblock operator=(const ParallelDeblock&);
> +        ParallelFilter operator=(const ParallelFilter&);
>      };
>
> -    ParallelDeblock*    m_pdeblock;
> +    ParallelFilter*     m_parallelFilter;
>
>      FrameFilter()
>          : m_param(NULL)
>          , m_frame(NULL)
>          , m_frameEncoder(NULL)
>          , m_ssimBuf(NULL)
> -        , m_pdeblock(NULL)
> +        , m_parallelFilter(NULL)
>      {
>      }
>
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp    Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/sao.cpp    Mon Dec 07 12:06:00 2015 -0600
> @@ -103,7 +103,7 @@
>      m_depthSaoRate[1][3] = 0;
>  }
>
> -bool SAO::create(x265_param* param)
> +bool SAO::create(x265_param* param, int initCommon)
>  {
>      m_param = param;
>      m_chromaFormat = param->internalCsp;
> @@ -131,12 +131,24 @@
>          m_tmpU2[i] += 1;
>      }
>
> -    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
> -    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
> -    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
> +    if (initCommon)
> +    {
> +        CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
> +        CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
> +        CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
>
> -    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
> -    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
> +        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
> +        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
> +    }
> +    else
> +    {
> +        // must initialize these common pointer outside of function
> +        m_count = NULL;
> +        m_offset = NULL;
> +        m_offsetOrg = NULL;
> +        m_countPreDblk = NULL;
> +        m_offsetOrgPreDblk = NULL;
> +    }
>
>      m_clipTable = &(m_clipTableBase[rangeExt]);
>
> @@ -155,24 +167,50 @@
>      return false;
>  }
>
> -void SAO::destroy()
> +void SAO::createFromRootNode(SAO* root)
>  {
> -    X265_FREE(m_clipTableBase);
> +    X265_CHECK(m_count == NULL, "duplicate initialize on m_count");
> +    X265_CHECK(m_offset == NULL, "duplicate initialize on m_offset");
> +    X265_CHECK(m_offsetOrg == NULL, "duplicate initialize on
> m_offsetOrg");
> +    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on
> m_countPreDblk");
> +    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on
> m_offsetOrgPreDblk");
>
> -    X265_FREE(m_tmpL1);
> -    X265_FREE(m_tmpL2);
> +    m_count = root->m_count;
> +    m_offset = root->m_offset;
> +    m_offsetOrg = root->m_offsetOrg;
> +    m_countPreDblk = root->m_countPreDblk;
> +    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
> +}
> +
> +void SAO::destroy(int destoryCommon)
> +{
> +    X265_FREE_ZERO(m_clipTableBase);
> +
> +    X265_FREE_ZERO(m_tmpL1);
> +    X265_FREE_ZERO(m_tmpL2);
>
>      for (int i = 0; i < 3; i++)
>      {
> -        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
> -        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
> +        if (m_tmpU1[i])
> +        {
> +            X265_FREE(m_tmpU1[i] - 1);
> +            m_tmpU1[i] = NULL;
> +        }
> +        if (m_tmpU2[i])
> +        {
> +            X265_FREE(m_tmpU2[i] - 1);
> +            m_tmpU2[i] = NULL;
> +        }
>      }
>
> -    X265_FREE(m_count);
> -    X265_FREE(m_offset);
> -    X265_FREE(m_offsetOrg);
> -    X265_FREE(m_countPreDblk);
> -    X265_FREE(m_offsetOrgPreDblk);
> +    if (destoryCommon)
> +    {
> +        X265_FREE(m_count);
> +        X265_FREE(m_offset);
> +        X265_FREE(m_offsetOrg);
> +        X265_FREE(m_countPreDblk);
> +        X265_FREE(m_offsetOrgPreDblk);
> +    }
>  }
>
>  /* allocate memory for SAO parameters */
> @@ -210,8 +248,6 @@
>          break;
>      }
>
> -    resetStats();
> -
>      m_entropyCoder.load(initState);
>      m_rdContexts.next.load(initState);
>      m_rdContexts.cur.load(initState);
> @@ -586,15 +622,14 @@
>          ctuHeight >>= m_vChromaShift;
>      }
>
> +    int addr = idxY * m_numCuInWidth;
> +    pixel* rec = reconPic->getPlaneAddr(plane, addr);
> +
>      if (!idxY)
>      {
> -        pixel* rec = reconPic->m_picOrg[plane];
>          memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
>      }
>
> -    int addr = idxY * m_numCuInWidth;
> -    pixel* rec = plane ? reconPic->getChromaAddr(plane, addr) :
> reconPic->getLumaAddr(addr);
> -
>      for (int i = 0; i < ctuHeight + 1; i++)
>      {
>          m_tmpL1[i] = rec[0];
> diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/sao.h
> --- a/source/encoder/sao.h      Mon Dec 07 12:05:57 2015 -0600
> +++ b/source/encoder/sao.h      Mon Dec 07 12:06:00 2015 -0600
> @@ -120,8 +120,9 @@
>
>      SAO();
>
> -    bool create(x265_param* param);
> -    void destroy();
> +    bool create(x265_param* param, int initCommon);
> +    void createFromRootNode(SAO *root);
> +    void destroy(int destoryCommon);
>
>      void allocSaoParam(SAOParam* saoParam) const;
>
> @@ -147,6 +148,8 @@
>
>      void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
>      void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
> +
> +    friend class FrameFilter;
>  };
>
>  }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20151211/45996bdd/attachment-0001.html>


More information about the x265-devel mailing list