[x265] [PATCH] noiseReduction: make noiseReduction deterministic

deepthi at multicorewareinc.com deepthi at multicorewareinc.com
Tue Oct 14 06:37:48 CEST 2014


# HG changeset patch
# User Deepthi Nandakumar <deepthi at multicorewareinc.com>
# Date 1413196915 -19800
#      Mon Oct 13 16:11:55 2014 +0530
# Node ID d12a9a16ed2b975c6b57ca67a9ea5af1a96692ae
# Parent  f26e81eb555aa586380b34314c302ea9b148f357
noiseReduction: make noiseReduction deterministic

diff -r f26e81eb555a -r d12a9a16ed2b source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h	Mon Oct 13 16:11:55 2014 +0530
@@ -43,29 +43,6 @@
 namespace x265 {
 // private namespace
 
-#define NUM_CU_DEPTH            4                           // maximun number of CU depths
-#define NUM_FULL_DEPTH          5                           // maximun number of full depths
-#define MIN_LOG2_CU_SIZE        3                           // log2(minCUSize)
-#define MAX_LOG2_CU_SIZE        6                           // log2(maxCUSize)
-#define MIN_CU_SIZE             (1 << MIN_LOG2_CU_SIZE)     // minimum allowable size of CU
-#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum allowable size of CU
-
-#define LOG2_UNIT_SIZE          2                           // log2(unitSize)
-#define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
-#define TMVP_UNIT_MASK          0xF0                        // mask for mapping index to CompressMV field
-
-#define MAX_NUM_PARTITIONS      256
-
-#define MIN_PU_SIZE             4
-#define MIN_TU_SIZE             4
-#define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
-#define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  // alignment to 16 bytes
-
-#define MAX_LOG2_TR_SIZE 5
-#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
-#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
-#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
-
 #define SLFASE_CONSTANT 0x5f4e4a53
 
 void initROM();
diff -r f26e81eb555a -r d12a9a16ed2b source/common/common.h
--- a/source/common/common.h	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/common/common.h	Mon Oct 13 16:11:55 2014 +0530
@@ -214,15 +214,40 @@
 namespace x265 {
 
 enum { SAO_NUM_OFFSET = 4 };
+#define NUM_CU_DEPTH            4                           // maximum number of CU depths
+#define NUM_FULL_DEPTH          5                           // maximum number of full depths
+#define MIN_LOG2_CU_SIZE        3                           // log2(minCUSize)
+#define MAX_LOG2_CU_SIZE        6                           // log2(maxCUSize)
+#define MIN_CU_SIZE             (1 << MIN_LOG2_CU_SIZE)     // minimum allowable size of CU
+#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum allowable size of CU
+
+#define LOG2_UNIT_SIZE          2                           // log2(unitSize)
+#define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
+#define TMVP_UNIT_MASK          0xF0                        // mask for mapping index to CompressMV field
+
+#define MAX_NUM_PARTITIONS      256
+
+#define MIN_PU_SIZE             4
+#define MIN_TU_SIZE             4
+#define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
+#define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  // alignment to 16 bytes
+
+#define MAX_LOG2_TR_SIZE 5
+#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
+#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
+#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
+
+#define MAX_NUM_TR_COEFFS        (MAX_TR_SIZE * MAX_TR_SIZE) /* Maximum number of transform coefficients, for a 32x32 transform; parenthesized so it is safe inside larger expressions */
+#define MAX_NUM_TR_CATEGORIES    8   /* 32, 16, 8, 4 transform categories each for luma and chroma */
 
 // NOTE: MUST be alignment to 16 or 32 bytes for asm code
 struct NoiseReduction
 {
     /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
-    uint16_t offsetDenoise[8][1024];
-    uint32_t residualSum[8][1024];
-    uint32_t count[8];
+    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; /* per-coefficient offsets, recomputed by FrameEncoder::noiseReductionUpdate() */
+    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];   /* per-coefficient running sums; the frame encoder adds up every worker thread's copy */
+    uint32_t count[MAX_NUM_TR_CATEGORIES];                            /* per-category counter, checked against maxBlocksPerTrSize in noiseReductionUpdate() */
 };
 
 struct SaoCtuParam
diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/encoder/analysis.cpp	Mon Oct 13 16:11:55 2014 +0530
@@ -292,7 +292,8 @@
         if (!jobId || m_param->rdLevel > 4)
         {
             slave->m_quant.setQPforQuant(cu);
-            slave->m_quant.m_nr = m_quant.m_nr;
+            int frameEncoderID = m_tld[threadId].frameEncoderID;
+            slave->m_quant.m_nr = m_tld[threadId].m_nr[frameEncoderID];
             slave->m_rdContexts[depth].cur.load(m_rdContexts[depth].cur);
         }
     }
diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.h
--- a/source/encoder/analysis.h	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/encoder/analysis.h	Mon Oct 13 16:11:55 2014 +0530
@@ -172,8 +172,18 @@
 struct ThreadLocalData
 {
     Analysis analysis;
+    NoiseReduction** m_nr;           /* one NoiseReduction per frame encoder, indexed by frame encoder ID; allocated and freed in encoder.cpp, not by this struct */
+    int              frameEncoderID; /* set in frameencoder.cpp when the thread-local data is set up; read in analysis.cpp to index m_nr[] */
 
-    ~ThreadLocalData() { analysis.destroy(); }
+    ThreadLocalData() /* every member starts defined: frameEncoderID is used as an array index */
+        : m_nr(NULL)
+        , frameEncoderID(0)
+    {}
+
+    ~ThreadLocalData()
+    {
+        analysis.destroy();
+    }
 };
 
 }
diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/encoder/encoder.cpp	Mon Oct 13 16:11:55 2014 +0530
@@ -162,15 +162,21 @@
 
     /* Allocate thread local data, one for each thread pool worker and
      * if --no-wpp, one for each frame encoder */
-    int numLocalData = poolThreadCount;
+    m_numThreadLocalData = poolThreadCount;
     if (!m_param->bEnableWavefront)
-        numLocalData += m_param->frameNumThreads;
-    m_threadLocalData = new ThreadLocalData[numLocalData];
-    for (int i = 0; i < numLocalData; i++)
+        m_numThreadLocalData += m_param->frameNumThreads;
+    m_threadLocalData = new ThreadLocalData[m_numThreadLocalData];
+    for (int i = 0; i < m_numThreadLocalData; i++)
     {
         m_threadLocalData[i].analysis.setThreadPool(m_threadPool);
         m_threadLocalData[i].analysis.initSearch(m_param, m_scalingList);
         m_threadLocalData[i].analysis.create(g_maxCUDepth + 1, g_maxCUSize, m_threadLocalData);
+        m_threadLocalData[i].m_nr = X265_MALLOC(NoiseReduction*, m_param->frameNumThreads);
+        for (int j = 0; j < m_param->frameNumThreads; j++)
+        {
+            m_threadLocalData[i].m_nr[j] = X265_MALLOC(NoiseReduction, 1);
+            memset(m_threadLocalData[i].m_nr[j], 0, sizeof(NoiseReduction));
+        }
     }
 
     if (!m_param->bEnableWavefront)
@@ -240,6 +246,13 @@
         delete [] m_frameEncoder;
     }
 
+    for (int i = 0; i < m_numThreadLocalData; i++)
+    {
+        for (int j = 0; j < m_param->frameNumThreads; j++)
+           X265_FREE(m_threadLocalData[i].m_nr[j]);
+        X265_FREE(m_threadLocalData[i].m_nr);
+    }
+
     delete [] m_threadLocalData;
 
     if (m_lookahead)
@@ -400,6 +413,7 @@
         m_lookahead->flush();
 
     FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
+    curEncoder->m_frameEncoderID = m_curEncoder;
     m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
     int ret = 0;
 
diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.h
--- a/source/encoder/encoder.h	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/encoder/encoder.h	Mon Oct 13 16:11:55 2014 +0530
@@ -93,7 +93,6 @@
 
     int                m_curEncoder;
 
-
     /* Collect statistics globally */
     EncStats           m_analyzeAll;
     EncStats           m_analyzeI;
@@ -116,6 +115,7 @@
     PPS                m_pps;
     NALList            m_nalList;
     ScalingList        m_scalingList;      // quantization matrix information
+    int                m_numThreadLocalData;
 
     int                m_lastBPSEI;
     uint32_t           m_numDelayedPic;
diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/encoder/frameencoder.cpp	Mon Oct 13 16:11:55 2014 +0530
@@ -49,8 +49,9 @@
     m_vbvResetTriggerRow = -1;
     m_outStreams = NULL;
     m_substreamSizes = NULL;
-    m_nr = NULL;
+    m_frameNR = NULL;
     m_tld = NULL;
+    m_frameEncoderID = 0;
     memset(&m_frameStats, 0, sizeof(m_frameStats));
     memset(&m_rce, 0, sizeof(RateControlEntry));
 }
@@ -75,7 +76,7 @@
     X265_FREE(m_substreamSizes);
     m_frameFilter.destroy();
 
-    X265_FREE(m_nr);
+    X265_FREE(m_frameNR);
 
     // wait for worker thread to exit
     stop();
@@ -119,9 +120,9 @@
     }
 
     if (m_param->noiseReduction)
-        m_nr = X265_MALLOC(NoiseReduction, 1);
-    if (m_nr)
-        memset(m_nr, 0, sizeof(NoiseReduction));
+        m_frameNR = X265_MALLOC(NoiseReduction, 1);
+    if (m_frameNR)
+        memset(m_frameNR, 0, sizeof(NoiseReduction));
     else
         m_param->noiseReduction = 0;
 
@@ -393,8 +394,36 @@
     if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
         m_top->m_aborted = true;
 
+    /* Accumulate NR statistics from all worker threads */
+    if (m_frameNR)
+    {
+        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
+        {
+            NoiseReduction* nr = m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
+            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            {
+                for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
+                    m_frameNR->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
+            
+                m_frameNR->count[cat] += nr->count[cat];
+            }
+        }
+    }
+
     noiseReductionUpdate();
 
+    /* Copy the updated denoise offsets back to every worker thread's copy for this
+     * frame encoder and clear its accumulated statistics. Sizes are taken from the
+     * arrays themselves: offsetDenoise holds uint16_t, the statistics hold uint32_t */
+    if (m_frameNR)
+        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
+        {
+            NoiseReduction* nr = m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
+            memcpy(nr->offsetDenoise, m_frameNR->offsetDenoise, sizeof(nr->offsetDenoise));
+            memset(nr->count, 0, sizeof(nr->count));
+            memset(nr->residualSum, 0, sizeof(nr->residualSum));
+        }
+
     // Decrement referenced frame reference counts, allow them to be recycled
     for (int l = 0; l < numPredDir; l++)
     {
@@ -616,7 +645,8 @@
     // setup thread-local data
     Slice *slice = m_frame->m_picSym->m_slice;
     TComPicYuv* fenc = m_frame->getPicYuvOrg();
-    tld.analysis.m_quant.m_nr = m_nr;
+    tld.analysis.m_quant.m_nr = tld.m_nr[m_frameEncoderID];
+    tld.frameEncoderID = m_frameEncoderID;
     tld.analysis.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
     tld.analysis.m_log = &tld.analysis.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
     tld.analysis.setQP(slice, slice->m_sliceQp);
@@ -866,34 +896,34 @@
 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
 void FrameEncoder::noiseReductionUpdate()
 {
-    if (!m_nr)
+    if (!m_frameNR)
         return;
 
     static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
 
-    for (int cat = 0; cat < 8; cat++)
+    for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
     {
         int trSize = cat & 3;
         int coefCount = 1 << ((trSize + 2) * 2);
 
-        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
+        if (m_frameNR->count[cat] > maxBlocksPerTrSize[trSize])
         {
             for (int i = 0; i < coefCount; i++)
-                m_nr->residualSum[cat][i] >>= 1;
-            m_nr->count[cat] >>= 1;
+                m_frameNR->residualSum[cat][i] >>= 1;
+            m_frameNR->count[cat] >>= 1;
         }
 
-        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
+        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_frameNR->count[cat];
 
         for (int i = 0; i < coefCount; i++)
         {
-            uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
-            uint64_t denom = m_nr->residualSum[cat][i] + 1;
-            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
+            uint64_t value = scaledCount + m_frameNR->residualSum[cat][i] / 2; /* adding residualSum / 2 rounds the division below */
+            uint64_t denom = (uint64_t)m_frameNR->residualSum[cat][i] + 1; /* widen before the +1 so UINT32_MAX cannot wrap to a zero divisor */
+            m_frameNR->offsetDenoise[cat][i] = (uint16_t)(value / denom);
         }
 
         // Don't denoise DC coefficients
-        m_nr->offsetDenoise[cat][0] = 0;
+        m_frameNR->offsetDenoise[cat][0] = 0;
     }
 }
 
diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Mon Oct 13 14:36:40 2014 +0530
+++ b/source/encoder/frameencoder.h	Mon Oct 13 16:11:55 2014 +0530
@@ -140,7 +140,7 @@
     Bitstream                m_bs;
     Bitstream*               m_outStreams;
     uint32_t*                m_substreamSizes;
-    NoiseReduction*          m_nr;
+    NoiseReduction*          m_frameNR;
     NALList                  m_nalList;
     ThreadLocalData*         m_tld; /* for --no-wpp */
 
@@ -148,6 +148,7 @@
     int                      m_filterRowDelayCus;
     Event                    m_completionEvent;
     int64_t                  m_totalTime;
+    int                      m_frameEncoderID;
 
 protected:
 


More information about the x265-devel mailing list