[x265] [PATCH] noiseReduction: make noiseReduction deterministic

Tue Oct 14 08:54:02 CEST 2014

Yup, TComPicSym or even Frame should be fine - since every Frame will be
associated with one FrameEncoder at the start of encoding.

On Tue, Oct 14, 2014 at 11:45 AM, Steve Borho <steve at borho.org> wrote:

> On 10/14, deepthi at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Deepthi Nandakumar <deepthi at multicorewareinc.com>
> > # Date 1413196915 -19800
> > #      Mon Oct 13 16:11:55 2014 +0530
> > # Node ID d12a9a16ed2b975c6b57ca67a9ea5af1a96692ae
> > # Parent  f26e81eb555aa586380b34314c302ea9b148f357
> > noiseReduction: make noiseReduction deterministic
> >
> > diff -r f26e81eb555a -r d12a9a16ed2b source/Lib/TLibCommon/TComRom.h
> > --- a/source/Lib/TLibCommon/TComRom.h Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/Lib/TLibCommon/TComRom.h Mon Oct 13 16:11:55 2014 +0530
> > @@ -43,29 +43,6 @@
> >  namespace x265 {
> >  // private namespace
>
> most of this deserves to be in its own patch
>
> > -#define NUM_CU_DEPTH            4                           // maximun
> number of CU depths
> > -#define NUM_FULL_DEPTH          5                           // maximun
> number of full depths
> > -#define MIN_LOG2_CU_SIZE        3                           //
> log2(minCUSize)
> > -#define MAX_LOG2_CU_SIZE        6                           //
> log2(maxCUSize)
> > -#define MIN_CU_SIZE             (1 << MIN_LOG2_CU_SIZE)     // minimum
> allowable size of CU
> > -#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum
> allowable size of CU
> > -
> > -#define LOG2_UNIT_SIZE          2                           //
> log2(unitSize)
> > -#define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit
> size of CU partition
> > -#define TMVP_UNIT_MASK          0xF0                        // mask for
> mapping index to CompressMV field
> > -
> > -#define MAX_NUM_PARTITIONS      256
> > -
> > -#define MIN_PU_SIZE             4
> > -#define MIN_TU_SIZE             4
> > -#define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum
> number of SPU in horizontal line
> > -#define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  //
> alignment to 16 bytes
> > -
> > -#define MAX_LOG2_TR_SIZE 5
> > -#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
> > -#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
> > -#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
> > -
> >  #define SLFASE_CONSTANT 0x5f4e4a53
> >
> >  void initROM();
> > diff -r f26e81eb555a -r d12a9a16ed2b source/common/common.h
> > --- a/source/common/common.h  Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/common/common.h  Mon Oct 13 16:11:55 2014 +0530
> > @@ -214,15 +214,40 @@
> >  namespace x265 {
> >
> >  enum { SAO_NUM_OFFSET = 4 };
> > +#define NUM_CU_DEPTH            4                           // maximun
> number of CU depths
> > +#define NUM_FULL_DEPTH          5                           // maximun
> number of full depths
> > +#define MIN_LOG2_CU_SIZE        3                           //
> log2(minCUSize)
> > +#define MAX_LOG2_CU_SIZE        6                           //
> log2(maxCUSize)
> > +#define MIN_CU_SIZE             (1 << MIN_LOG2_CU_SIZE)     // minimum
> allowable size of CU
> > +#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum
> allowable size of CU
> > +
> > +#define LOG2_UNIT_SIZE          2                           //
> log2(unitSize)
> > +#define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit
> size of CU partition
> > +#define TMVP_UNIT_MASK          0xF0                        // mask for
> mapping index to CompressMV field
> > +
> > +#define MAX_NUM_PARTITIONS      256
> > +
> > +#define MIN_PU_SIZE             4
> > +#define MIN_TU_SIZE             4
> > +#define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum
> number of SPU in horizontal line
> > +#define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  //
> alignment to 16 bytes
> > +
> > +#define MAX_LOG2_TR_SIZE 5
> > +#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
> > +#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
> > +#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
> > +
> > +#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number
> of transform coefficients, for a 32x32 transform */
> > +#define MAX_NUM_TR_CATEGORIES    8   /* 32, 16, 8, 4 transform
> categories each for luma and chroma */
> >
> >  // NOTE: MUST be alignment to 16 or 32 bytes for asm code
> >  struct NoiseReduction
> >  {
> >      /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
> >       * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma
> 32x32 */
> > -    uint16_t offsetDenoise[8][1024];
> > -    uint32_t residualSum[8][1024];
> > -    uint32_t count[8];
> > +    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > +    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > +    uint32_t count[MAX_NUM_TR_CATEGORIES];
> >  };
> >
> >  struct SaoCtuParam
> > diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.cpp
> > --- a/source/encoder/analysis.cpp     Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/encoder/analysis.cpp     Mon Oct 13 16:11:55 2014 +0530
> > @@ -292,7 +292,8 @@
> >          if (!jobId || m_param->rdLevel > 4)
> >          {
> >              slave->m_quant.setQPforQuant(cu);
> > -            slave->m_quant.m_nr = m_quant.m_nr;
> > +            int frameEncoderID = m_tld[threadId].frameEncoderID;
> > +            slave->m_quant.m_nr = m_tld[threadId].m_nr[frameEncoderID];
>
> almost. this is taking the frameEncoderID from the slave, which has not
> been initialized
>
> frameEncoderID needs to come from the master TLD, which you probably
> can't easily access from here. So frameEncoderID should probably be a
> member of Analysis or (perhaps even better) the TComPicSym.
>
> >
> slave->m_rdContexts[depth].cur.load(m_rdContexts[depth].cur);
> >          }
> >      }
> > diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.h
> > --- a/source/encoder/analysis.h       Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/encoder/analysis.h       Mon Oct 13 16:11:55 2014 +0530
> > @@ -172,8 +172,18 @@
> >  struct ThreadLocalData
> >  {
> >      Analysis analysis;
> > +    NoiseReduction** m_nr;
>
> why a double pointer? why not just allocate an array of them?
>
> > +    int frameEncoderID;
> >
> > -    ~ThreadLocalData() { analysis.destroy(); }
> > +    ThreadLocalData()
> > +    {
> > +        m_nr = NULL;
> > +    }
> > +
> > +    ~ThreadLocalData()
> > +    {
> > +        analysis.destroy();
> > +    }
> >  };
> >
> >  }
> > diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.cpp
> > --- a/source/encoder/encoder.cpp      Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/encoder/encoder.cpp      Mon Oct 13 16:11:55 2014 +0530
> > @@ -162,15 +162,21 @@
> >
> >      /* Allocate thread local data, one for each thread pool worker and
> >       * if --no-wpp, one for each frame encoder */
> > -    int numLocalData = poolThreadCount;
> > +    m_numThreadLocalData = poolThreadCount;
> >      if (!m_param->bEnableWavefront)
> > -        numLocalData += m_param->frameNumThreads;
> > -    m_threadLocalData = new ThreadLocalData[numLocalData];
> > -    for (int i = 0; i < numLocalData; i++)
> > +        m_numThreadLocalData += m_param->frameNumThreads;
> > +    m_threadLocalData = new ThreadLocalData[m_numThreadLocalData];
> > +    for (int i = 0; i < m_numThreadLocalData; i++)
> >      {
> >          m_threadLocalData[i].analysis.setThreadPool(m_threadPool);
> >          m_threadLocalData[i].analysis.initSearch(m_param,
> m_scalingList);
> >          m_threadLocalData[i].analysis.create(g_maxCUDepth + 1,
> g_maxCUSize, m_threadLocalData);
> > +        m_threadLocalData[i].m_nr = X265_MALLOC(NoiseReduction*,
> m_param->frameNumThreads);
> > +        for (int j = 0; j < m_param->frameNumThreads; j++)
> > +        {
> > +            m_threadLocalData[i].m_nr[j] = X265_MALLOC(NoiseReduction,
> 1);
> > +            memset(m_threadLocalData[i].m_nr[j], 0,
> sizeof(NoiseReduction));
> > +        }
> >      }
>
> ok, except for the double allocations
>
> >      if (!m_param->bEnableWavefront)
> > @@ -240,6 +246,13 @@
> >          delete [] m_frameEncoder;
> >      }
> >
> > +    for (int i = 0; i < m_numThreadLocalData; i++)
> > +    {
> > +        for (int j = 0; j < m_param->frameNumThreads; j++)
> > +           X265_FREE(m_threadLocalData[i].m_nr[j]);
> > +        X265_FREE(m_threadLocalData[i].m_nr);
> > +    }
> > +
> >      delete [] m_threadLocalData;
> >
> >      if (m_lookahead)
> > @@ -400,6 +413,7 @@
> >          m_lookahead->flush();
> >
> >      FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
> > +    curEncoder->m_frameEncoderID = m_curEncoder;
> >      m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
> >      int ret = 0;
> >
> > diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.h
> > --- a/source/encoder/encoder.h        Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/encoder/encoder.h        Mon Oct 13 16:11:55 2014 +0530
> > @@ -93,7 +93,6 @@
> >
> >      int                m_curEncoder;
> >
> > -
> >      /* Collect statistics globally */
> >      EncStats           m_analyzeAll;
> >      EncStats           m_analyzeI;
> > @@ -116,6 +115,7 @@
> >      PPS                m_pps;
> >      NALList            m_nalList;
> >      ScalingList        m_scalingList;      // quantization matrix
> information
> > +    int                m_numThreadLocalData;
> >
> >      int                m_lastBPSEI;
> >      uint32_t           m_numDelayedPic;
> > diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.cpp
> > --- a/source/encoder/frameencoder.cpp Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/encoder/frameencoder.cpp Mon Oct 13 16:11:55 2014 +0530
> > @@ -49,8 +49,9 @@
> >      m_vbvResetTriggerRow = -1;
> >      m_outStreams = NULL;
> >      m_substreamSizes = NULL;
> > -    m_nr = NULL;
> > +    m_frameNR = NULL;
> >      m_tld = NULL;
> > +    m_frameEncoderID = 0;
> >      memset(&m_frameStats, 0, sizeof(m_frameStats));
> >      memset(&m_rce, 0, sizeof(RateControlEntry));
> >  }
> > @@ -75,7 +76,7 @@
> >      X265_FREE(m_substreamSizes);
> >      m_frameFilter.destroy();
> >
> > -    X265_FREE(m_nr);
> > +    X265_FREE(m_frameNR);
> >
> >      // wait for worker thread to exit
> >      stop();
> > @@ -119,9 +120,9 @@
> >      }
> >
> >      if (m_param->noiseReduction)
> > -        m_nr = X265_MALLOC(NoiseReduction, 1);
> > -    if (m_nr)
> > -        memset(m_nr, 0, sizeof(NoiseReduction));
> > +        m_frameNR = X265_MALLOC(NoiseReduction, 1);
> > +    if (m_frameNR)
> > +        memset(m_frameNR, 0, sizeof(NoiseReduction));
> >      else
> >          m_param->noiseReduction = 0;
> >
> > @@ -393,8 +394,36 @@
> >      if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits,
> &m_rce, &m_frameStats) < 0)
> >          m_top->m_aborted = true;
> >
> > +    /* Accumulate NR statistics from all worker threads */
> > +    if (m_frameNR)
> > +    {
> > +        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
> > +        {
> > +            NoiseReduction* nr =
> m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
> > +            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> > +            {
> > +                for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
> > +                    m_frameNR->residualSum[cat][coeff] +=
> nr->residualSum[cat][coeff];
> > +
> > +                m_frameNR->count[cat] += nr->count[cat];
> > +            }
> > +        }
> > +    }
> > +
> >      noiseReductionUpdate();
> >
> > +    /* Copy updated NR coefficients back to all worker threads */
> > +    if (m_frameNR)
> > +    {
> > +        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
> > +        {
> > +            NoiseReduction* nr =
> m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
> > +            memcpy(nr->offsetDenoise, m_frameNR->offsetDenoise,
> sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> > +            memset(nr->count, 0, sizeof(uint32_t) *
> MAX_NUM_TR_CATEGORIES);
> > +            memset(nr->residualSum, 0, sizeof(uint32_t) *
> MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> > +        }
> > +    }
>
> \o/
>
> > +
> >      // Decrement referenced frame reference counts, allow them to be
> recycled
> >      for (int l = 0; l < numPredDir; l++)
> >      {
> > @@ -616,7 +645,8 @@
> >      // setup thread-local data
> >      Slice *slice = m_frame->m_picSym->m_slice;
> >      TComPicYuv* fenc = m_frame->getPicYuvOrg();
> > -    tld.analysis.m_quant.m_nr = m_nr;
> > +    tld.analysis.m_quant.m_nr = tld.m_nr[m_frameEncoderID];
> > +    tld.frameEncoderID = m_frameEncoderID;
> >      tld.analysis.m_me.setSourcePlane(fenc->getLumaAddr(),
> fenc->getStride());
> >      tld.analysis.m_log =
> &tld.analysis.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
> >      tld.analysis.setQP(slice, slice->m_sliceQp);
> > @@ -866,34 +896,34 @@
> >  /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
> >  void FrameEncoder::noiseReductionUpdate()
> >  {
> > -    if (!m_nr)
> > +    if (!m_frameNR)
> >          return;
> >
> >      static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1
> << 14, 1 << 12};
> >
> > -    for (int cat = 0; cat < 8; cat++)
> > +    for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> >      {
> >          int trSize = cat & 3;
> >          int coefCount = 1 << ((trSize + 2) * 2);
> >
> > -        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
> > +        if (m_frameNR->count[cat] > maxBlocksPerTrSize[trSize])
> >          {
> >              for (int i = 0; i < coefCount; i++)
> > -                m_nr->residualSum[cat][i] >>= 1;
> > -            m_nr->count[cat] >>= 1;
> > +                m_frameNR->residualSum[cat][i] >>= 1;
> > +            m_frameNR->count[cat] >>= 1;
> >          }
> >
> > -        uint64_t scaledCount = (uint64_t)m_param->noiseReduction *
> m_nr->count[cat];
> > +        uint64_t scaledCount = (uint64_t)m_param->noiseReduction *
> m_frameNR->count[cat];
> >
> >          for (int i = 0; i < coefCount; i++)
> >          {
> > -            uint64_t value = scaledCount + m_nr->residualSum[cat][i] /
> 2;
> > -            uint64_t denom = m_nr->residualSum[cat][i] + 1;
> > -            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> > +            uint64_t value = scaledCount +
> m_frameNR->residualSum[cat][i] / 2;
> > +            uint64_t denom = m_frameNR->residualSum[cat][i] + 1;
> > +            m_frameNR->offsetDenoise[cat][i] = (uint16_t)(value /
> denom);
> >          }
> >
> >          // Don't denoise DC coefficients
> > -        m_nr->offsetDenoise[cat][0] = 0;
> > +        m_frameNR->offsetDenoise[cat][0] = 0;
> >      }
> >  }
> >
> > diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.h
> > --- a/source/encoder/frameencoder.h   Mon Oct 13 14:36:40 2014 +0530
> > +++ b/source/encoder/frameencoder.h   Mon Oct 13 16:11:55 2014 +0530
> > @@ -140,7 +140,7 @@
> >      Bitstream                m_bs;
> >      Bitstream*               m_outStreams;
> >      uint32_t*                m_substreamSizes;
> > -    NoiseReduction*          m_nr;
> > +    NoiseReduction*          m_frameNR;
> >      NALList                  m_nalList;
> >      ThreadLocalData*         m_tld; /* for --no-wpp */
> >
> > @@ -148,6 +148,7 @@
> >      int                      m_filterRowDelayCus;
> >      Event                    m_completionEvent;
> >      int64_t                  m_totalTime;
> > +    int                      m_frameEncoderID;
> >
> >  protected:
> >
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141014/740f3ae3/attachment-0001.html>