[x265] [PATCH] Add emergency denoising when frame qp > QP_MAX_SPEC

Tue Aug 4 19:32:06 CEST 2015

On 08/04, sagar at multicorewareinc.com wrote:
> # HG changeset patch
> # User Sagar Kotecha <sagar at multicorewareinc.com>
> # Date 1438676290 -19800
> #      Tue Aug 04 13:48:10 2015 +0530
> # Node ID bf5c5aca1a24eb4699d99a3ce4de386096219a5a
> # Parent  d5278c76d341b3bac405938dbfb64cb7e2d9bce5
> Add emergency denoising when frame qp > QP_MAX_SPEC
> 
> This feature is ported from x264, and is turned on for VBV encodes

This looks a lot better than the previous patch.

> diff -r d5278c76d341 -r bf5c5aca1a24 source/common/common.h
> --- a/source/common/common.h	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/common/common.h	Tue Aug 04 13:48:10 2015 +0530
> @@ -311,6 +311,9 @@
>  #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
>  #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
>  
> +#define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
> +#define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
> +
>  namespace X265_NS {
>  
>  enum { SAO_NUM_OFFSET = 4 };
> diff -r d5278c76d341 -r bf5c5aca1a24 source/common/quant.cpp
> --- a/source/common/quant.cpp	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/common/quant.cpp	Tue Aug 04 13:48:10 2015 +0530
> @@ -447,12 +447,12 @@
>              primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
>          }
>  
> -        if (m_nr)
> +        if (m_nr && m_nr->offset)
>          {
>              /* denoise is not applied to intra residual, so DST can be ignored */
>              int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
>              int numCoeff = 1 << (log2TrSize * 2);
> -            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
> +            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
>              m_nr->count[cat]++;
>          }
>      }
> diff -r d5278c76d341 -r bf5c5aca1a24 source/common/quant.h
> --- a/source/common/quant.h	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/common/quant.h	Tue Aug 04 13:48:10 2015 +0530
> @@ -59,18 +59,19 @@
>      }
>  };
>  
> -#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
> -#define MAX_NUM_TR_CATEGORIES    16                        /* 32, 16, 8, 4 transform categories each for luma and chroma */
> -
>  // NOTE: MUST be 16-byte aligned for asm code
>  struct NoiseReduction
>  {
>      /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
>       * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
>       * Intra 0..7 - Inter 8..15 */
> -    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> -    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> -    uint32_t count[MAX_NUM_TR_CATEGORIES];
> +    uint16_t nrOffsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> +    uint32_t nrResidualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> +    uint32_t nrCount[MAX_NUM_TR_CATEGORIES];
> +
> +    ALIGN_VAR_16(uint16_t, (*offset)[MAX_NUM_TR_COEFFS]);
> +    uint32_t (*residualSum)[MAX_NUM_TR_COEFFS];
> +    uint32_t *count;
>  };

I can see that the way we've split these structures to preserve
determinism makes it rather difficult to swap out the user-specified
denoise coefficients for the emergency coefficients.

>  
>  class Quant
> diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/encoder/encoder.cpp	Tue Aug 04 13:48:10 2015 +0530
> @@ -72,6 +72,7 @@
>      m_buOffsetC = NULL;
>      m_threadPool = NULL;
>      m_analysisFile = NULL;
> +    m_offsetEmergency = NULL;
>      for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
>          m_frameEncoder[i] = NULL;
>  
> @@ -191,6 +192,7 @@
>      {
>          x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n");
>          m_aborted = true;
> +        return;
>      }
>      else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off"))
>          m_scalingList.m_bEnabled = false;
> @@ -198,7 +200,6 @@
>          m_scalingList.setDefaultScalingList();
>      else if (m_scalingList.parseScalingList(m_param->scalingLists))
>          m_aborted = true;
> -    m_scalingList.setupQuantMatrices();
>  
>      m_lookahead = new Lookahead(m_param, m_threadPool);
>      if (m_numPools)
> @@ -213,6 +214,83 @@
>      initVPS(&m_vps);
>      initSPS(&m_sps);
>      initPPS(&m_pps);
> +   
> +    if (m_param->rc.vbvBufferSize)
> +    {
> +        m_offsetEmergency = (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])malloc(sizeof(*m_offsetEmergency) * (QP_MAX_MAX - QP_MAX_SPEC));

Did you check that sizeof(*m_offsetEmergency) is what you expect?

Why not use X265_MALLOC/X265_FREE? We generally need our mallocs aligned.

> +        if (!m_offsetEmergency)
> +        {
> +            x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory\n");
> +            m_aborted = true;
> +            return;
> +        }
> +
> +        bool scalingEnabled = m_scalingList.m_bEnabled;
> +        if (!scalingEnabled)
> +        {
> +            m_scalingList.setDefaultScalingList();
> +            m_scalingList.setupQuantMatrices();
> +        }
> +        else
> +            m_scalingList.setupQuantMatrices();
> +
> +        for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
> +        {
> +            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> +            {
> +                uint16_t *nrOffset = m_offsetEmergency[q][cat];
> +
> +                int trSize = cat & 3;
> +
> +                int coefCount = 1 << ((trSize + 2) * 2);
> +
> +                /* Denoise chroma first then luma, then DC. */
> +                int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
> +                int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
> +                int chromaThreshold = 0;
> +
> +                int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ? lumaThreshold : chromaThreshold;
> +
> +                double quantF = (double)(1ULL << (q / 6 + 16 + 8));
> +
> +                for (int i = 0; i < coefCount; i++)
> +                {
> +                    uint16_t max = (1 << (7 + X265_DEPTH)) - 1;
> +                    /* True "emergency mode": remove all DCT coefficients */
> +                    if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
> +                    {
> +                        nrOffset[i] = max;
> +                        continue;
> +                    }
> +
> +                    int iThresh = i == 0 ? dcThreshold : thresh;
> +                    if (q < iThresh)
> +                    {
> +                        nrOffset[i] = 0;
> +                        continue;
> +                    }
> +
> +                    int numList = (cat >= 8) * 3 + ((int)!iThresh);
> +
> +                    double pos = (double)(q - iThresh + 1) / (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
> +                    double start = quantF / (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
> +
> +                    // Formula chosen as an exponential scale to vaguely mimic the effects of a higher quantizer.
> +                    double bias = (pow(2, pos * (QP_MAX_MAX - QP_MAX_SPEC)) * 0.003 - 0.003) * start;
> +                    nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, max);
> +                }
> +            }
> +        }
> +
> +        if (!scalingEnabled)
> +        {
> +            m_scalingList.m_bEnabled = false;
> +            m_scalingList.m_bDataPresent = false;
> +            m_scalingList.setupQuantMatrices();
> +        }
> +    }
> +    else
> +        m_scalingList.setupQuantMatrices();
>  
>      int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
>      int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
> @@ -323,6 +401,8 @@
>      X265_FREE(m_buOffsetY);
>      X265_FREE(m_buOffsetC);
>  
> +    free(m_offsetEmergency);
> +
>      if (m_analysisFile)
>          fclose(m_analysisFile);
>  
> diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/encoder.h
> --- a/source/encoder/encoder.h	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/encoder/encoder.h	Tue Aug 04 13:48:10 2015 +0530
> @@ -133,6 +133,10 @@
>      bool               m_aborted;          // fatal error detected
>      bool               m_reconfigured;      // reconfigure of encoder detected
>  
> +    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> +    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
> +    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
> +
>      Encoder();
>      ~Encoder() {}
>  
> diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/encoder/frameencoder.cpp	Tue Aug 04 13:48:10 2015 +0530
> @@ -135,7 +135,7 @@
>          ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
>      }
>  
> -    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
> +    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
>          m_nr = X265_MALLOC(NoiseReduction, 1);
>      if (m_nr)
>          memset(m_nr, 0, sizeof(NoiseReduction));
> @@ -361,11 +361,45 @@
>          }
>      }
>  
> +    int numTLD;
> +    if (m_pool)
> +        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
> +    else
> +        numTLD = 1;
> +
>      /* Get the QP for this frame from rate control. This call may block until
>       * frames ahead of it in encode order have called rateControlEnd() */
>      int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
>      m_rce.newQp = qp;
>  
> +    if (m_nr)
> +    {
> +        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
> +        {
> +            for (int i = 0; i < numTLD; i++)
> +            {
> +                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
> +                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
> +                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
> +            }
> +        }
> +        else
> +        {
> +            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
> +            {
> +                for (int i = 0; i < numTLD; i++)
> +                {
> +                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
> +                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
> +                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
> +                }
> +            }
> +            else
> +            for (int i = 0; i < numTLD; i++)
> +                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;

Whitespace: the for loop under this else needs to be indented one more level.

> +        }
> +    }
> +
>      /* Clip slice QP to 0-51 spec range before encoding */
>      slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
>  
> @@ -702,37 +736,36 @@
>          }
>      }
>  
> -    int numTLD;
> -    if (m_pool)
> -        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
> -    else
> -        numTLD = 1;
> -
>      if (m_nr)
>      {
> -        /* Accumulate NR statistics from all worker threads */
> -        for (int i = 0; i < numTLD; i++)
> -        {
> -            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> -            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> +        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
> +
> +        if (nrEnabled)
> +        {
> +            /* Accumulate NR statistics from all worker threads */
> +            for (int i = 0; i < numTLD; i++)
> +            {
> +                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> +                for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
> +                {
> +                    for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
> +                        m_nr->nrResidualSum[cat][coeff] += nr->nrResidualSum[cat][coeff];
> +
> +                    m_nr->nrCount[cat] += nr->nrCount[cat];
> +                }
> +            }
> +
> +            noiseReductionUpdate();
> +
> +            /* Copy updated NR coefficients back to all worker threads */
> +            for (int i = 0; i < numTLD; i++)
>              {
> -                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
> -                    m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
> -            
> -                m_nr->count[cat] += nr->count[cat];
> +                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> +                memcpy(nr->nrOffsetDenoise, m_nr->nrOffsetDenoise, sizeof(uint16_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> +                memset(nr->nrCount, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES);
> +                memset(nr->nrResidualSum, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
>              }
>          }
> -
> -        noiseReductionUpdate();
> -
> -        /* Copy updated NR coefficients back to all worker threads */
> -        for (int i = 0; i < numTLD; i++)
> -        {
> -            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
> -            memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> -            memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
> -            memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
> -        }
>      }
>  
>  #if DETAILED_CU_STATS
> @@ -1265,25 +1298,25 @@
>          int trSize = cat & 3;
>          int coefCount = 1 << ((trSize + 2) * 2);
>  
> -        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
> +        if (m_nr->nrCount[cat] > maxBlocksPerTrSize[trSize])
>          {
>              for (int i = 0; i < coefCount; i++)
> -                m_nr->residualSum[cat][i] >>= 1;
> -            m_nr->count[cat] >>= 1;
> +                m_nr->nrResidualSum[cat][i] >>= 1;
> +            m_nr->nrCount[cat] >>= 1;
>          }
>  
>          int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
> -        uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
> +        uint64_t scaledCount = (uint64_t)nrStrength * m_nr->nrCount[cat];
>  
>          for (int i = 0; i < coefCount; i++)
>          {
> -            uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
> -            uint64_t denom = m_nr->residualSum[cat][i] + 1;
> -            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> +            uint64_t value = scaledCount + m_nr->nrResidualSum[cat][i] / 2;
> +            uint64_t denom = m_nr->nrResidualSum[cat][i] + 1;
> +            m_nr->nrOffsetDenoise[cat][i] = (uint16_t)(value / denom);
>          }
>  
>          // Don't denoise DC coefficients
> -        m_nr->offsetDenoise[cat][0] = 0;
> +        m_nr->nrOffsetDenoise[cat][0] = 0;
>      }
>  }

This seems to enable emergency denoise only when the slice QP is over
QP_MAX_SPEC. I believe the main intent of the feature is to enable
emergency denoise when mid-frame VBV updates cause the QP to jump above
QP_MAX_SPEC (and to disable it again if the QP drops back below
QP_MAX_SPEC). That leads to a design where the quant function itself
must select between the frame encoder's denoise (if denoise was
user-configured) and emergency denoise (if the QP is above spec and
emergency denoise is enabled). I think that would be a cleaner design
in the end anyway.

> diff -r d5278c76d341 -r bf5c5aca1a24 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Mon Aug 03 10:18:46 2015 -0500
> +++ b/source/encoder/search.cpp	Tue Aug 04 13:48:10 2015 +0530
> @@ -80,7 +80,7 @@
>      m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
>  
>      bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
> -    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
> +    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
>          ok &= m_quant.allocNoiseReduction(param);
>  
>      ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho