[x265] [PATCH] noiseReduction: make noiseReduction deterministic

Tue Oct 14 08:15:10 CEST 2014

On 10/14, deepthi at multicorewareinc.com wrote:
> # HG changeset patch
> # User Deepthi Nandakumar <deepthi at multicorewareinc.com>
> # Date 1413196915 -19800
> #      Mon Oct 13 16:11:55 2014 +0530
> # Node ID d12a9a16ed2b975c6b57ca67a9ea5af1a96692ae
> # Parent  f26e81eb555aa586380b34314c302ea9b148f357
> noiseReduction: make noiseReduction deterministic
> 
> diff -r f26e81eb555a -r d12a9a16ed2b source/Lib/TLibCommon/TComRom.h
> --- a/source/Lib/TLibCommon/TComRom.h	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/Lib/TLibCommon/TComRom.h	Mon Oct 13 16:11:55 2014 +0530
> @@ -43,29 +43,6 @@
>  namespace x265 {
>  // private namespace

most of this deserves to be in its own patch

> -#define NUM_CU_DEPTH            4                           // maximun number of CU depths
> -#define NUM_FULL_DEPTH          5                           // maximun number of full depths
> -#define MIN_LOG2_CU_SIZE        3                           // log2(minCUSize)
> -#define MAX_LOG2_CU_SIZE        6                           // log2(maxCUSize)
> -#define MIN_CU_SIZE             (1 << MIN_LOG2_CU_SIZE)     // minimum allowable size of CU
> -#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum allowable size of CU
> -
> -#define LOG2_UNIT_SIZE          2                           // log2(unitSize)
> -#define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
> -#define TMVP_UNIT_MASK          0xF0                        // mask for mapping index to CompressMV field
> -
> -#define MAX_NUM_PARTITIONS      256
> -
> -#define MIN_PU_SIZE             4
> -#define MIN_TU_SIZE             4
> -#define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
> -#define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  // alignment to 16 bytes
> -
> -#define MAX_LOG2_TR_SIZE 5
> -#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
> -#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
> -#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
> -
>  #define SLFASE_CONSTANT 0x5f4e4a53
>  
>  void initROM();
> diff -r f26e81eb555a -r d12a9a16ed2b source/common/common.h
> --- a/source/common/common.h	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/common/common.h	Mon Oct 13 16:11:55 2014 +0530
> @@ -214,15 +214,40 @@
>  namespace x265 {
>  
>  enum { SAO_NUM_OFFSET = 4 };
> +#define NUM_CU_DEPTH            4                           // maximun number of CU depths
> +#define NUM_FULL_DEPTH          5                           // maximun number of full depths
> +#define MIN_LOG2_CU_SIZE        3                           // log2(minCUSize)
> +#define MAX_LOG2_CU_SIZE        6                           // log2(maxCUSize)
> +#define MIN_CU_SIZE             (1 << MIN_LOG2_CU_SIZE)     // minimum allowable size of CU
> +#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum allowable size of CU
> +
> +#define LOG2_UNIT_SIZE          2                           // log2(unitSize)
> +#define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
> +#define TMVP_UNIT_MASK          0xF0                        // mask for mapping index to CompressMV field
> +
> +#define MAX_NUM_PARTITIONS      256
> +
> +#define MIN_PU_SIZE             4
> +#define MIN_TU_SIZE             4
> +#define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
> +#define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  // alignment to 16 bytes
> +
> +#define MAX_LOG2_TR_SIZE 5
> +#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
> +#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
> +#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
> +
> +#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
> +#define MAX_NUM_TR_CATEGORIES    8   /* 32, 16, 8, 4 transform categories each for luma and chroma */
>  
>  // NOTE: MUST be alignment to 16 or 32 bytes for asm code
>  struct NoiseReduction
>  {
>      /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
>       * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
> -    uint16_t offsetDenoise[8][1024];
> -    uint32_t residualSum[8][1024];
> -    uint32_t count[8];
> +    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> +    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> +    uint32_t count[MAX_NUM_TR_CATEGORIES];
>  };
>  
>  struct SaoCtuParam
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/analysis.cpp	Mon Oct 13 16:11:55 2014 +0530
> @@ -292,7 +292,8 @@
>          if (!jobId || m_param->rdLevel > 4)
>          {
>              slave->m_quant.setQPforQuant(cu);
> -            slave->m_quant.m_nr = m_quant.m_nr;
> +            int frameEncoderID = m_tld[threadId].frameEncoderID;
> +            slave->m_quant.m_nr = m_tld[threadId].m_nr[frameEncoderID];

almost. this is taking the frameEncoderID from the slave, which has not
been initialized.

frameEncoderID needs to come from the master TLD, which you probably
can't easily access from here. So frameEncoderID should probably be a
member of Analysis or (perhaps even better) the TComPicSym.

>              slave->m_rdContexts[depth].cur.load(m_rdContexts[depth].cur);
>          }
>      }
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.h
> --- a/source/encoder/analysis.h	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/analysis.h	Mon Oct 13 16:11:55 2014 +0530
> @@ -172,8 +172,18 @@
>  struct ThreadLocalData
>  {
>      Analysis analysis;
> +    NoiseReduction** m_nr;

why a double pointer? why not just allocate an array of them?

> +    int frameEncoderID;
>  
> -    ~ThreadLocalData() { analysis.destroy(); }
> +    ThreadLocalData()
> +    {
> +        m_nr = NULL;
> +    }
> +
> +    ~ThreadLocalData()
> +    {
> +        analysis.destroy();
> +    }
>  };
>  
>  }
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/encoder.cpp	Mon Oct 13 16:11:55 2014 +0530
> @@ -162,15 +162,21 @@
>  
>      /* Allocate thread local data, one for each thread pool worker and
>       * if --no-wpp, one for each frame encoder */
> -    int numLocalData = poolThreadCount;
> +    m_numThreadLocalData = poolThreadCount;
>      if (!m_param->bEnableWavefront)
> -        numLocalData += m_param->frameNumThreads;
> -    m_threadLocalData = new ThreadLocalData[numLocalData];
> -    for (int i = 0; i < numLocalData; i++)
> +        m_numThreadLocalData += m_param->frameNumThreads;
> +    m_threadLocalData = new ThreadLocalData[m_numThreadLocalData];
> +    for (int i = 0; i < m_numThreadLocalData; i++)
>      {
>          m_threadLocalData[i].analysis.setThreadPool(m_threadPool);
>          m_threadLocalData[i].analysis.initSearch(m_param, m_scalingList);
>          m_threadLocalData[i].analysis.create(g_maxCUDepth + 1, g_maxCUSize, m_threadLocalData);
> +        m_threadLocalData[i].m_nr = X265_MALLOC(NoiseReduction*, m_param->frameNumThreads);
> +        for (int j = 0; j < m_param->frameNumThreads; j++)
> +        {
> +            m_threadLocalData[i].m_nr[j] = X265_MALLOC(NoiseReduction, 1);
> +            memset(m_threadLocalData[i].m_nr[j], 0, sizeof(NoiseReduction));
> +        }
>      }

ok, except for the double allocations

>      if (!m_param->bEnableWavefront)
> @@ -240,6 +246,13 @@
>          delete [] m_frameEncoder;
>      }
>  
> +    for (int i = 0; i < m_numThreadLocalData; i++)
> +    {
> +        for (int j = 0; j < m_param->frameNumThreads; j++)
> +           X265_FREE(m_threadLocalData[i].m_nr[j]);
> +        X265_FREE(m_threadLocalData[i].m_nr);
> +    }
> +
>      delete [] m_threadLocalData;
>  
>      if (m_lookahead)
> @@ -400,6 +413,7 @@
>          m_lookahead->flush();
>  
>      FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
> +    curEncoder->m_frameEncoderID = m_curEncoder;
>      m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
>      int ret = 0;
>  
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.h
> --- a/source/encoder/encoder.h	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/encoder.h	Mon Oct 13 16:11:55 2014 +0530
> @@ -93,7 +93,6 @@
>  
>      int                m_curEncoder;
>  
> -
>      /* Collect statistics globally */
>      EncStats           m_analyzeAll;
>      EncStats           m_analyzeI;
> @@ -116,6 +115,7 @@
>      PPS                m_pps;
>      NALList            m_nalList;
>      ScalingList        m_scalingList;      // quantization matrix information
> +    int                m_numThreadLocalData;
>  
>      int                m_lastBPSEI;
>      uint32_t           m_numDelayedPic;
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/frameencoder.cpp	Mon Oct 13 16:11:55 2014 +0530
> @@ -49,8 +49,9 @@
>      m_vbvResetTriggerRow = -1;
>      m_outStreams = NULL;
>      m_substreamSizes = NULL;
> -    m_nr = NULL;
> +    m_frameNR = NULL;
>      m_tld = NULL;
> +    m_frameEncoderID = 0;
>      memset(&m_frameStats, 0, sizeof(m_frameStats));
>      memset(&m_rce, 0, sizeof(RateControlEntry));
>  }
> @@ -75,7 +76,7 @@
>      X265_FREE(m_substreamSizes);
>      m_frameFilter.destroy();
>  
> -    X265_FREE(m_nr);
> +    X265_FREE(m_frameNR);
>  
>      // wait for worker thread to exit
>      stop();
> @@ -119,9 +120,9 @@
>      }
>  
>      if (m_param->noiseReduction)
> -        m_nr = X265_MALLOC(NoiseReduction, 1);
> -    if (m_nr)
> -        memset(m_nr, 0, sizeof(NoiseReduction));
> +        m_frameNR = X265_MALLOC(NoiseReduction, 1);
> +    if (m_frameNR)
> +        memset(m_frameNR, 0, sizeof(NoiseReduction));
>      else
>          m_param->noiseReduction = 0;
>  
> @@ -393,8 +394,36 @@
>      if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
>          m_top->m_aborted = true;
>  
> +    /* Accumulate NR statistics from all worker threads */
> +    if (m_frameNR)
> +    {
> +        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
> +        {
> +            NoiseReduction* nr = m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
> +            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> +            {
> +                for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
> +                    m_frameNR->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
> +            
> +                m_frameNR->count[cat] += nr->count[cat];
> +            }
> +        }
> +    }
> +
>      noiseReductionUpdate();
>  
> +    /* Copy updated NR coefficients back to all worker threads */
> +    if (m_frameNR)
> +    {
> +        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
> +        {
> +            NoiseReduction* nr = m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
> +            memcpy(nr->offsetDenoise, m_frameNR->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> +            memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
> +            memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> +        }
> +    }

\o/

> +
>      // Decrement referenced frame reference counts, allow them to be recycled
>      for (int l = 0; l < numPredDir; l++)
>      {
> @@ -616,7 +645,8 @@
>      // setup thread-local data
>      Slice *slice = m_frame->m_picSym->m_slice;
>      TComPicYuv* fenc = m_frame->getPicYuvOrg();
> -    tld.analysis.m_quant.m_nr = m_nr;
> +    tld.analysis.m_quant.m_nr = tld.m_nr[m_frameEncoderID];
> +    tld.frameEncoderID = m_frameEncoderID;
>      tld.analysis.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
>      tld.analysis.m_log = &tld.analysis.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
>      tld.analysis.setQP(slice, slice->m_sliceQp);
> @@ -866,34 +896,34 @@
>  /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
>  void FrameEncoder::noiseReductionUpdate()
>  {
> -    if (!m_nr)
> +    if (!m_frameNR)
>          return;
>  
>      static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
>  
> -    for (int cat = 0; cat < 8; cat++)
> +    for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
>      {
>          int trSize = cat & 3;
>          int coefCount = 1 << ((trSize + 2) * 2);
>  
> -        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
> +        if (m_frameNR->count[cat] > maxBlocksPerTrSize[trSize])
>          {
>              for (int i = 0; i < coefCount; i++)
> -                m_nr->residualSum[cat][i] >>= 1;
> -            m_nr->count[cat] >>= 1;
> +                m_frameNR->residualSum[cat][i] >>= 1;
> +            m_frameNR->count[cat] >>= 1;
>          }
>  
> -        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
> +        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_frameNR->count[cat];
>  
>          for (int i = 0; i < coefCount; i++)
>          {
> -            uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
> -            uint64_t denom = m_nr->residualSum[cat][i] + 1;
> -            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> +            uint64_t value = scaledCount + m_frameNR->residualSum[cat][i] / 2;
> +            uint64_t denom = m_frameNR->residualSum[cat][i] + 1;
> +            m_frameNR->offsetDenoise[cat][i] = (uint16_t)(value / denom);
>          }
>  
>          // Don't denoise DC coefficients
> -        m_nr->offsetDenoise[cat][0] = 0;
> +        m_frameNR->offsetDenoise[cat][0] = 0;
>      }
>  }
>  
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h	Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/frameencoder.h	Mon Oct 13 16:11:55 2014 +0530
> @@ -140,7 +140,7 @@
>      Bitstream                m_bs;
>      Bitstream*               m_outStreams;
>      uint32_t*                m_substreamSizes;
> -    NoiseReduction*          m_nr;
> +    NoiseReduction*          m_frameNR;
>      NALList                  m_nalList;
>      ThreadLocalData*         m_tld; /* for --no-wpp */
>  
> @@ -148,6 +148,7 @@
>      int                      m_filterRowDelayCus;
>      Event                    m_completionEvent;
>      int64_t                  m_totalTime;
> +    int                      m_frameEncoderID;
>  
>  protected:
>  
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho