[x265] [PATCH] Add emergency denoising when frame qp > QP_MAX_SPEC
Sagar Kotecha
sagar at multicorewareinc.com
Wed Aug 5 09:28:00 CEST 2015
On Tue, Aug 4, 2015 at 11:02 PM, Steve Borho <steve at borho.org> wrote:
> On 08/04, sagar at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Sagar Kotecha <sagar at multicorewareinc.com>
> > # Date 1438676290 -19800
> > # Tue Aug 04 13:48:10 2015 +0530
> > # Node ID bf5c5aca1a24eb4699d99a3ce4de386096219a5a
> > # Parent d5278c76d341b3bac405938dbfb64cb7e2d9bce5
> > Add emergency denoising when frame qp > QP_MAX_SPEC
> >
> > This feature is ported from x264, and is turned on for VBV encodes
>
> this looks a lot better than the previous patch
>
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/common/common.h
> > --- a/source/common/common.h Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/common/common.h Tue Aug 04 13:48:10 2015 +0530
> > @@ -311,6 +311,9 @@
> > #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
> > #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
> >
> > +#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE //
> Maximum number of transform coefficients, for a 32x32 transform
> > +#define MAX_NUM_TR_CATEGORIES 16 // 32,
> 16, 8, 4 transform categories each for luma and chroma
> > +
> > namespace X265_NS {
> >
> > enum { SAO_NUM_OFFSET = 4 };
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/common/quant.cpp
> > --- a/source/common/quant.cpp Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/common/quant.cpp Tue Aug 04 13:48:10 2015 +0530
> > @@ -447,12 +447,12 @@
> > primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff,
> trSize);
> > }
> >
> > - if (m_nr)
> > + if (m_nr && m_nr->offset)
> > {
> > /* denoise is not applied to intra residual, so DST can be
> ignored */
> > int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
> > int numCoeff = 1 << (log2TrSize * 2);
> > - primitives.denoiseDct(m_resiDctCoeff,
> m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
> > + primitives.denoiseDct(m_resiDctCoeff,
> m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
> > m_nr->count[cat]++;
> > }
> > }
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/common/quant.h
> > --- a/source/common/quant.h Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/common/quant.h Tue Aug 04 13:48:10 2015 +0530
> > @@ -59,18 +59,19 @@
> > }
> > };
> >
> > -#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum
> number of transform coefficients, for a 32x32 transform */
> > -#define MAX_NUM_TR_CATEGORIES 16 /* 32, 16,
> 8, 4 transform categories each for luma and chroma */
> > -
> > // NOTE: MUST be 16-byte aligned for asm code
> > struct NoiseReduction
> > {
> > /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
> > * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma
> 32x32
> > * Intra 0..7 - Inter 8..15 */
> > - uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > - uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > - uint32_t count[MAX_NUM_TR_CATEGORIES];
> > + uint16_t nrOffsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > + uint32_t nrResidualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > + uint32_t nrCount[MAX_NUM_TR_CATEGORIES];
> > +
> > + ALIGN_VAR_16(uint16_t, (*offset)[MAX_NUM_TR_COEFFS]);
> > + uint32_t (*residualSum)[MAX_NUM_TR_COEFFS];
> > + uint32_t *count;
> > };
>
> I can see that the way we've split these structures to preserve
> determinism makes it rather difficult to swap out the user-specified
> denoise coeff with the emergency coeff.
>
> >
> > class Quant
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/encoder.cpp
> > --- a/source/encoder/encoder.cpp Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/encoder/encoder.cpp Tue Aug 04 13:48:10 2015 +0530
> > @@ -72,6 +72,7 @@
> > m_buOffsetC = NULL;
> > m_threadPool = NULL;
> > m_analysisFile = NULL;
> > + m_offsetEmergency = NULL;
> > for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
> > m_frameEncoder[i] = NULL;
> >
> > @@ -191,6 +192,7 @@
> > {
> > x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling
> list arrays\n");
> > m_aborted = true;
> > + return;
> > }
> > else if (!m_param->scalingLists || !strcmp(m_param->scalingLists,
> "off"))
> > m_scalingList.m_bEnabled = false;
> > @@ -198,7 +200,6 @@
> > m_scalingList.setDefaultScalingList();
> > else if (m_scalingList.parseScalingList(m_param->scalingLists))
> > m_aborted = true;
> > - m_scalingList.setupQuantMatrices();
> >
> > m_lookahead = new Lookahead(m_param, m_threadPool);
> > if (m_numPools)
> > @@ -213,6 +214,83 @@
> > initVPS(&m_vps);
> > initSPS(&m_sps);
> > initPPS(&m_pps);
> > +
> > + if (m_param->rc.vbvBufferSize)
> > + {
> > + m_offsetEmergency =
> (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])malloc(sizeof(*m_offsetEmergency)
> * (QP_MAX_MAX - QP_MAX_SPEC));
>
> did you check that sizeof(*m_offsetEmergency) is what you expect?
>
> why not use X265_MALLOC/X265_FREE? we generally need our mallocs aligned
>
OK
>
> > + if (!m_offsetEmergency)
> > + {
> > + x265_log(m_param, X265_LOG_ERROR, "Unable to allocate
> memory\n");
> > + m_aborted = true;
> > + return;
> > + }
> > +
> > + bool scalingEnabled = m_scalingList.m_bEnabled;
> > + if (!scalingEnabled)
> > + {
> > + m_scalingList.setDefaultScalingList();
> > + m_scalingList.setupQuantMatrices();
> > + }
> > + else
> > + m_scalingList.setupQuantMatrices();
> > +
> > + for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
> > + {
> > + for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> > + {
> > + uint16_t *nrOffset = m_offsetEmergency[q][cat];
> > +
> > + int trSize = cat & 3;
> > +
> > + int coefCount = 1 << ((trSize + 2) * 2);
> > +
> > + /* Denoise chroma first then luma, then DC. */
> > + int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
> > + int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
> > + int chromaThreshold = 0;
> > +
> > + int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ?
> lumaThreshold : chromaThreshold;
> > +
> > + double quantF = (double)(1ULL << (q / 6 + 16 + 8));
> > +
> > + for (int i = 0; i < coefCount; i++)
> > + {
> > + uint16_t max = (1 << (7 + X265_DEPTH)) - 1;
> > + /* True "emergency mode": remove all DCT
> coefficients */
> > + if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
> > + {
> > + nrOffset[i] = max;
> > + continue;
> > + }
> > +
> > + int iThresh = i == 0 ? dcThreshold : thresh;
> > + if (q < iThresh)
> > + {
> > + nrOffset[i] = 0;
> > + continue;
> > + }
> > +
> > + int numList = (cat >= 8) * 3 + ((int)!iThresh);
> > +
> > + double pos = (double)(q - iThresh + 1) /
> (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
> > + double start = quantF /
> (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
> > +
> > + // Formula chosen as an exponential scale to
> vaguely mimic the effects of a higher quantizer.
> > + double bias = (pow(2, pos * (QP_MAX_MAX -
> QP_MAX_SPEC)) * 0.003 - 0.003) * start;
> > + nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, max);
> > + }
> > + }
> > + }
> > +
> > + if (!scalingEnabled)
> > + {
> > + m_scalingList.m_bEnabled = false;
> > + m_scalingList.m_bDataPresent = false;
> > + m_scalingList.setupQuantMatrices();
> > + }
> > + }
> > + else
> > + m_scalingList.setupQuantMatrices();
> >
> > int numRows = (m_param->sourceHeight + g_maxCUSize - 1) /
> g_maxCUSize;
> > int numCols = (m_param->sourceWidth + g_maxCUSize - 1) /
> g_maxCUSize;
> > @@ -323,6 +401,8 @@
> > X265_FREE(m_buOffsetY);
> > X265_FREE(m_buOffsetC);
> >
> > + free(m_offsetEmergency);
> > +
> > if (m_analysisFile)
> > fclose(m_analysisFile);
> >
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/encoder.h
> > --- a/source/encoder/encoder.h Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/encoder/encoder.h Tue Aug 04 13:48:10 2015 +0530
> > @@ -133,6 +133,10 @@
> > bool m_aborted; // fatal error detected
> > bool m_reconfigured; // reconfigure of encoder
> detected
> >
> > + uint16_t
> (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > + uint32_t
> m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> > + uint32_t m_countEmergency[MAX_NUM_TR_CATEGORIES];
> > +
> > Encoder();
> > ~Encoder() {}
> >
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/frameencoder.cpp
> > --- a/source/encoder/frameencoder.cpp Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/encoder/frameencoder.cpp Tue Aug 04 13:48:10 2015 +0530
> > @@ -135,7 +135,7 @@
> > ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
> > }
> >
> > - if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
> > + if (m_param->noiseReductionIntra || m_param->noiseReductionInter ||
> m_param->rc.vbvBufferSize)
> > m_nr = X265_MALLOC(NoiseReduction, 1);
> > if (m_nr)
> > memset(m_nr, 0, sizeof(NoiseReduction));
> > @@ -361,11 +361,45 @@
> > }
> > }
> >
> > + int numTLD;
> > + if (m_pool)
> > + numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers :
> m_pool->m_numWorkers + m_pool->m_numProviders;
> > + else
> > + numTLD = 1;
> > +
> > /* Get the QP for this frame from rate control. This call may block
> until
> > * frames ahead of it in encode order have called rateControlEnd()
> */
> > int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce,
> m_top);
> > m_rce.newQp = qp;
> >
> > + if (m_nr)
> > + {
> > +
>
> if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
> > + {
> > + for (int i = 0; i < numTLD; i++)
> > + {
> > + m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset =
> m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
> > + m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum
> = m_top->m_residualSumEmergency;
> > + m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count =
> m_top->m_countEmergency;
> > + }
> > + }
> > + else
> > + {
> > + if (m_param->noiseReductionIntra ||
> m_param->noiseReductionInter)
> > + {
> > + for (int i = 0; i < numTLD; i++)
> > + {
> > + m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset
> = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
> > +
> m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum =
> m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
> > + m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count =
> m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
> > + }
> > + }
> > + else
> > + for (int i = 0; i < numTLD; i++)
> > + m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset =
> NULL;
>
> w/s (whitespace issue in the lines above)
>
> > + }
> > + }
> > +
> > /* Clip slice QP to 0-51 spec range before encoding */
> > slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
> >
> > @@ -702,37 +736,36 @@
> > }
> > }
> >
> > - int numTLD;
> > - if (m_pool)
> > - numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers :
> m_pool->m_numWorkers + m_pool->m_numProviders;
> > - else
> > - numTLD = 1;
> > -
> > if (m_nr)
> > {
> > - /* Accumulate NR statistics from all worker threads */
> > - for (int i = 0; i < numTLD; i++)
> > - {
> > - NoiseReduction* nr =
> &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> > - for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> > + bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC ||
> !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra ||
> m_param->noiseReductionInter);
> > +
> > + if (nrEnabled)
> > + {
> > + /* Accumulate NR statistics from all worker threads */
> > + for (int i = 0; i < numTLD; i++)
> > + {
> > + NoiseReduction* nr =
> &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> > + for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> > + {
> > + for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS;
> coeff++)
> > + m_nr->nrResidualSum[cat][coeff] +=
> nr->nrResidualSum[cat][coeff];
> > +
> > + m_nr->nrCount[cat] += nr->nrCount[cat];
> > + }
> > + }
> > +
> > + noiseReductionUpdate();
> > +
> > + /* Copy updated NR coefficients back to all worker threads
> */
> > + for (int i = 0; i < numTLD; i++)
> > {
> > - for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
> > - m_nr->residualSum[cat][coeff] +=
> nr->residualSum[cat][coeff];
> > -
> > - m_nr->count[cat] += nr->count[cat];
> > + NoiseReduction* nr =
> &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> > + memcpy(nr->nrOffsetDenoise, m_nr->nrOffsetDenoise,
> sizeof(uint16_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> > + memset(nr->nrCount, 0, sizeof(uint32_t)*
> MAX_NUM_TR_CATEGORIES);
> > + memset(nr->nrResidualSum, 0, sizeof(uint32_t)*
> MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> > }
> > }
> > -
> > - noiseReductionUpdate();
> > -
> > - /* Copy updated NR coefficients back to all worker threads */
> > - for (int i = 0; i < numTLD; i++)
> > - {
> > - NoiseReduction* nr =
> &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> > - memcpy(nr->offsetDenoise, m_nr->offsetDenoise,
> sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> > - memset(nr->count, 0, sizeof(uint32_t) *
> MAX_NUM_TR_CATEGORIES);
> > - memset(nr->residualSum, 0, sizeof(uint32_t) *
> MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> > - }
> > }
> >
> > #if DETAILED_CU_STATS
> > @@ -1265,25 +1298,25 @@
> > int trSize = cat & 3;
> > int coefCount = 1 << ((trSize + 2) * 2);
> >
> > - if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
> > + if (m_nr->nrCount[cat] > maxBlocksPerTrSize[trSize])
> > {
> > for (int i = 0; i < coefCount; i++)
> > - m_nr->residualSum[cat][i] >>= 1;
> > - m_nr->count[cat] >>= 1;
> > + m_nr->nrResidualSum[cat][i] >>= 1;
> > + m_nr->nrCount[cat] >>= 1;
> > }
> >
> > int nrStrength = cat < 8 ? m_param->noiseReductionIntra :
> m_param->noiseReductionInter;
> > - uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
> > + uint64_t scaledCount = (uint64_t)nrStrength *
> m_nr->nrCount[cat];
> >
> > for (int i = 0; i < coefCount; i++)
> > {
> > - uint64_t value = scaledCount + m_nr->residualSum[cat][i] /
> 2;
> > - uint64_t denom = m_nr->residualSum[cat][i] + 1;
> > - m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> > + uint64_t value = scaledCount + m_nr->nrResidualSum[cat][i]
> / 2;
> > + uint64_t denom = m_nr->nrResidualSum[cat][i] + 1;
> > + m_nr->nrOffsetDenoise[cat][i] = (uint16_t)(value / denom);
> > }
> >
> > // Don't denoise DC coefficients
> > - m_nr->offsetDenoise[cat][0] = 0;
> > + m_nr->nrOffsetDenoise[cat][0] = 0;
> > }
> > }
>
> This seems to enable the emergency denoise only when the slice QP is
> over QP_MAX_SPEC, but I believe the main intent of the feature is to
> enable emergency denoise when mid-frame VBV updates cause the QP to
> jump above QP_MAX_SPEC (and to disable it again if the QP drops back
> below QP_MAX_SPEC). That leads to a design where the quant function
> itself must select between the FE's denoise (if denoise was
> user-configured) and the emergency denoise (if the QP is above spec and
> emergency denoise is enabled), which I think in the end would be a
> cleaner design anyway.
>
>
Won't it be better if we apply the denoise to the entire frame and avoid
strong denoising on one particular CU (qp > QP_MAX_SPEC)?
This way we will save bits from the start of the frame.
For some frames, CU_QP > QP_MAX_SPEC but FRAME_QP < QP_MAX_SPEC; in
such cases we can generalize by using a lower threshold, e.g.
if frame_qp > (QP_MAX_SPEC - 1/2/3)
> > diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/search.cpp
> > --- a/source/encoder/search.cpp Mon Aug 03 10:18:46 2015 -0500
> > +++ b/source/encoder/search.cpp Tue Aug 04 13:48:10 2015 +0530
> > @@ -80,7 +80,7 @@
> > m_me.init(param.searchMethod, param.subpelRefine,
> param.internalCsp);
> >
> > bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList,
> m_entropyCoder);
> > - if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
> > + if (m_param->noiseReductionIntra || m_param->noiseReductionInter ||
> m_param->rc.vbvBufferSize)
> > ok &= m_quant.allocNoiseReduction(param);
> >
> > ok &= Predict::allocBuffers(param.internalCsp); /* sets
> m_hChromaShift & m_vChromaShift */
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150805/ea97157b/attachment-0001.html>
More information about the x265-devel
mailing list