[x265] [PATCH] noiseReduction: make noiseReduction deterministic
Steve Borho
steve at borho.org
Tue Oct 14 08:15:10 CEST 2014
On 10/14, deepthi at multicorewareinc.com wrote:
> # HG changeset patch
> # User Deepthi Nandakumar <deepthi at multicorewareinc.com>
> # Date 1413196915 -19800
> # Mon Oct 13 16:11:55 2014 +0530
> # Node ID d12a9a16ed2b975c6b57ca67a9ea5af1a96692ae
> # Parent f26e81eb555aa586380b34314c302ea9b148f357
> noiseReduction: make noiseReduction deterministic
>
> diff -r f26e81eb555a -r d12a9a16ed2b source/Lib/TLibCommon/TComRom.h
> --- a/source/Lib/TLibCommon/TComRom.h Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/Lib/TLibCommon/TComRom.h Mon Oct 13 16:11:55 2014 +0530
> @@ -43,29 +43,6 @@
> namespace x265 {
> // private namespace
Most of this (moving these defines from TComRom.h to common.h) deserves to be in its own patch.
> -#define NUM_CU_DEPTH 4 // maximun number of CU depths
> -#define NUM_FULL_DEPTH 5 // maximun number of full depths
> -#define MIN_LOG2_CU_SIZE 3 // log2(minCUSize)
> -#define MAX_LOG2_CU_SIZE 6 // log2(maxCUSize)
> -#define MIN_CU_SIZE (1 << MIN_LOG2_CU_SIZE) // minimum allowable size of CU
> -#define MAX_CU_SIZE (1 << MAX_LOG2_CU_SIZE) // maximum allowable size of CU
> -
> -#define LOG2_UNIT_SIZE 2 // log2(unitSize)
> -#define UNIT_SIZE (1 << LOG2_UNIT_SIZE) // unit size of CU partition
> -#define TMVP_UNIT_MASK 0xF0 // mask for mapping index to CompressMV field
> -
> -#define MAX_NUM_PARTITIONS 256
> -
> -#define MIN_PU_SIZE 4
> -#define MIN_TU_SIZE 4
> -#define MAX_NUM_SPU_W (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
> -#define ADI_BUF_STRIDE (2 * MAX_CU_SIZE + 1 + 15) // alignment to 16 bytes
> -
> -#define MAX_LOG2_TR_SIZE 5
> -#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
> -#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
> -#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
> -
> #define SLFASE_CONSTANT 0x5f4e4a53
>
> void initROM();
> diff -r f26e81eb555a -r d12a9a16ed2b source/common/common.h
> --- a/source/common/common.h Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/common/common.h Mon Oct 13 16:11:55 2014 +0530
> @@ -214,15 +214,40 @@
> namespace x265 {
>
> enum { SAO_NUM_OFFSET = 4 };
> +#define NUM_CU_DEPTH 4 // maximun number of CU depths
> +#define NUM_FULL_DEPTH 5 // maximun number of full depths
> +#define MIN_LOG2_CU_SIZE 3 // log2(minCUSize)
> +#define MAX_LOG2_CU_SIZE 6 // log2(maxCUSize)
> +#define MIN_CU_SIZE (1 << MIN_LOG2_CU_SIZE) // minimum allowable size of CU
> +#define MAX_CU_SIZE (1 << MAX_LOG2_CU_SIZE) // maximum allowable size of CU
> +
> +#define LOG2_UNIT_SIZE 2 // log2(unitSize)
> +#define UNIT_SIZE (1 << LOG2_UNIT_SIZE) // unit size of CU partition
> +#define TMVP_UNIT_MASK 0xF0 // mask for mapping index to CompressMV field
> +
> +#define MAX_NUM_PARTITIONS 256
> +
> +#define MIN_PU_SIZE 4
> +#define MIN_TU_SIZE 4
> +#define MAX_NUM_SPU_W (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
> +#define ADI_BUF_STRIDE (2 * MAX_CU_SIZE + 1 + 15) // alignment to 16 bytes
> +
> +#define MAX_LOG2_TR_SIZE 5
> +#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
> +#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
> +#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
> +
> +#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
> +#define MAX_NUM_TR_CATEGORIES 8 /* 32, 16, 8, 4 transform categories each for luma and chroma */
>
> // NOTE: MUST be alignment to 16 or 32 bytes for asm code
> struct NoiseReduction
> {
> /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
> * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
> - uint16_t offsetDenoise[8][1024];
> - uint32_t residualSum[8][1024];
> - uint32_t count[8];
> + uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> + uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> + uint32_t count[MAX_NUM_TR_CATEGORIES];
> };
>
> struct SaoCtuParam
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/analysis.cpp Mon Oct 13 16:11:55 2014 +0530
> @@ -292,7 +292,8 @@
> if (!jobId || m_param->rdLevel > 4)
> {
> slave->m_quant.setQPforQuant(cu);
> - slave->m_quant.m_nr = m_quant.m_nr;
> + int frameEncoderID = m_tld[threadId].frameEncoderID;
> + slave->m_quant.m_nr = m_tld[threadId].m_nr[frameEncoderID];
Almost. This is taking the frameEncoderID from the slave, which has not
been initialized.
frameEncoderID needs to come from the master TLD, which you probably
can't easily access from here. So frameEncoderID should probably be a
member of Analysis or (perhaps even better) the TComPicSym.
> slave->m_rdContexts[depth].cur.load(m_rdContexts[depth].cur);
> }
> }
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/analysis.h
> --- a/source/encoder/analysis.h Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/analysis.h Mon Oct 13 16:11:55 2014 +0530
> @@ -172,8 +172,18 @@
> struct ThreadLocalData
> {
> Analysis analysis;
> + NoiseReduction** m_nr;
Why a double pointer? Why not just allocate an array of them?
> + int frameEncoderID;
>
> - ~ThreadLocalData() { analysis.destroy(); }
> + ThreadLocalData()
> + {
> + m_nr = NULL;
> + }
> +
> + ~ThreadLocalData()
> + {
> + analysis.destroy();
> + }
> };
>
> }
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/encoder.cpp Mon Oct 13 16:11:55 2014 +0530
> @@ -162,15 +162,21 @@
>
> /* Allocate thread local data, one for each thread pool worker and
> * if --no-wpp, one for each frame encoder */
> - int numLocalData = poolThreadCount;
> + m_numThreadLocalData = poolThreadCount;
> if (!m_param->bEnableWavefront)
> - numLocalData += m_param->frameNumThreads;
> - m_threadLocalData = new ThreadLocalData[numLocalData];
> - for (int i = 0; i < numLocalData; i++)
> + m_numThreadLocalData += m_param->frameNumThreads;
> + m_threadLocalData = new ThreadLocalData[m_numThreadLocalData];
> + for (int i = 0; i < m_numThreadLocalData; i++)
> {
> m_threadLocalData[i].analysis.setThreadPool(m_threadPool);
> m_threadLocalData[i].analysis.initSearch(m_param, m_scalingList);
> m_threadLocalData[i].analysis.create(g_maxCUDepth + 1, g_maxCUSize, m_threadLocalData);
> + m_threadLocalData[i].m_nr = X265_MALLOC(NoiseReduction*, m_param->frameNumThreads);
> + for (int j = 0; j < m_param->frameNumThreads; j++)
> + {
> + m_threadLocalData[i].m_nr[j] = X265_MALLOC(NoiseReduction, 1);
> + memset(m_threadLocalData[i].m_nr[j], 0, sizeof(NoiseReduction));
> + }
> }
OK, except for the double allocations.
> if (!m_param->bEnableWavefront)
> @@ -240,6 +246,13 @@
> delete [] m_frameEncoder;
> }
>
> + for (int i = 0; i < m_numThreadLocalData; i++)
> + {
> + for (int j = 0; j < m_param->frameNumThreads; j++)
> + X265_FREE(m_threadLocalData[i].m_nr[j]);
> + X265_FREE(m_threadLocalData[i].m_nr);
> + }
> +
> delete [] m_threadLocalData;
>
> if (m_lookahead)
> @@ -400,6 +413,7 @@
> m_lookahead->flush();
>
> FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder];
> + curEncoder->m_frameEncoderID = m_curEncoder;
> m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
> int ret = 0;
>
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/encoder.h
> --- a/source/encoder/encoder.h Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/encoder.h Mon Oct 13 16:11:55 2014 +0530
> @@ -93,7 +93,6 @@
>
> int m_curEncoder;
>
> -
> /* Collect statistics globally */
> EncStats m_analyzeAll;
> EncStats m_analyzeI;
> @@ -116,6 +115,7 @@
> PPS m_pps;
> NALList m_nalList;
> ScalingList m_scalingList; // quantization matrix information
> + int m_numThreadLocalData;
>
> int m_lastBPSEI;
> uint32_t m_numDelayedPic;
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/frameencoder.cpp Mon Oct 13 16:11:55 2014 +0530
> @@ -49,8 +49,9 @@
> m_vbvResetTriggerRow = -1;
> m_outStreams = NULL;
> m_substreamSizes = NULL;
> - m_nr = NULL;
> + m_frameNR = NULL;
> m_tld = NULL;
> + m_frameEncoderID = 0;
> memset(&m_frameStats, 0, sizeof(m_frameStats));
> memset(&m_rce, 0, sizeof(RateControlEntry));
> }
> @@ -75,7 +76,7 @@
> X265_FREE(m_substreamSizes);
> m_frameFilter.destroy();
>
> - X265_FREE(m_nr);
> + X265_FREE(m_frameNR);
>
> // wait for worker thread to exit
> stop();
> @@ -119,9 +120,9 @@
> }
>
> if (m_param->noiseReduction)
> - m_nr = X265_MALLOC(NoiseReduction, 1);
> - if (m_nr)
> - memset(m_nr, 0, sizeof(NoiseReduction));
> + m_frameNR = X265_MALLOC(NoiseReduction, 1);
> + if (m_frameNR)
> + memset(m_frameNR, 0, sizeof(NoiseReduction));
> else
> m_param->noiseReduction = 0;
>
> @@ -393,8 +394,36 @@
> if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
> m_top->m_aborted = true;
>
> + /* Accumulate NR statistics from all worker threads */
> + if (m_frameNR)
> + {
> + for (int i = 0; i < m_top->m_numThreadLocalData; i++)
> + {
> + NoiseReduction* nr = m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
> + for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> + {
> + for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
> + m_frameNR->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
> +
> + m_frameNR->count[cat] += nr->count[cat];
> + }
> + }
> + }
> +
> noiseReductionUpdate();
>
> + /* Copy updated NR coefficients back to all worker threads */
> + if (m_frameNR)
> + {
> + for (int i = 0; i < m_top->m_numThreadLocalData; i++)
> + {
> + NoiseReduction* nr = m_top->m_threadLocalData[i].m_nr[m_frameEncoderID];
> + memcpy(nr->offsetDenoise, m_frameNR->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> + memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
> + memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> + }
> + }
\o/
> +
> // Decrement referenced frame reference counts, allow them to be recycled
> for (int l = 0; l < numPredDir; l++)
> {
> @@ -616,7 +645,8 @@
> // setup thread-local data
> Slice *slice = m_frame->m_picSym->m_slice;
> TComPicYuv* fenc = m_frame->getPicYuvOrg();
> - tld.analysis.m_quant.m_nr = m_nr;
> + tld.analysis.m_quant.m_nr = tld.m_nr[m_frameEncoderID];
> + tld.frameEncoderID = m_frameEncoderID;
> tld.analysis.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
> tld.analysis.m_log = &tld.analysis.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
> tld.analysis.setQP(slice, slice->m_sliceQp);
> @@ -866,34 +896,34 @@
> /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
> void FrameEncoder::noiseReductionUpdate()
> {
> - if (!m_nr)
> + if (!m_frameNR)
> return;
>
> static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
>
> - for (int cat = 0; cat < 8; cat++)
> + for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> {
> int trSize = cat & 3;
> int coefCount = 1 << ((trSize + 2) * 2);
>
> - if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
> + if (m_frameNR->count[cat] > maxBlocksPerTrSize[trSize])
> {
> for (int i = 0; i < coefCount; i++)
> - m_nr->residualSum[cat][i] >>= 1;
> - m_nr->count[cat] >>= 1;
> + m_frameNR->residualSum[cat][i] >>= 1;
> + m_frameNR->count[cat] >>= 1;
> }
>
> - uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
> + uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_frameNR->count[cat];
>
> for (int i = 0; i < coefCount; i++)
> {
> - uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
> - uint64_t denom = m_nr->residualSum[cat][i] + 1;
> - m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> + uint64_t value = scaledCount + m_frameNR->residualSum[cat][i] / 2;
> + uint64_t denom = m_frameNR->residualSum[cat][i] + 1;
> + m_frameNR->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> }
>
> // Don't denoise DC coefficients
> - m_nr->offsetDenoise[cat][0] = 0;
> + m_frameNR->offsetDenoise[cat][0] = 0;
> }
> }
>
> diff -r f26e81eb555a -r d12a9a16ed2b source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h Mon Oct 13 14:36:40 2014 +0530
> +++ b/source/encoder/frameencoder.h Mon Oct 13 16:11:55 2014 +0530
> @@ -140,7 +140,7 @@
> Bitstream m_bs;
> Bitstream* m_outStreams;
> uint32_t* m_substreamSizes;
> - NoiseReduction* m_nr;
> + NoiseReduction* m_frameNR;
> NALList m_nalList;
> ThreadLocalData* m_tld; /* for --no-wpp */
>
> @@ -148,6 +148,7 @@
> int m_filterRowDelayCus;
> Event m_completionEvent;
> int64_t m_totalTime;
> + int m_frameEncoderID;
>
> protected:
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list