[x265] [PATCH 6 of 6 REV2] analysis: always configure quant QP directly after setting RD lambda

Sun Apr 26 19:47:44 CEST 2015

On Sun, Apr 26, 2015 at 12:21 PM, Steve Borho <steve at borho.org> wrote:
> # HG changeset patch
> # User Steve Borho <steve at borho.org>
> # Date 1429943995 18000
> #      Sat Apr 25 01:39:55 2015 -0500
> # Node ID 68a13226d586b335c02cade9311e093f0149c42a
> # Parent  6a0a37c01cff03cadd44691a0fe447d17ec0b14f
> analysis: always configure quant QP directly after setting RD lambda
>
> Basically, everywhere we adjust or assign QP we set quant QP immediately. This
> removes a great many ad-hoc calls to setQPforQuant() and hopefully makes it
> impossible to miss quant being configured properly.
>
> This patch fixes a layering violation where the frame encoder was setting the
> RDO lambdas directly, but only when delta-QP was not enabled.
>
> diff -r 6a0a37c01cff -r 68a13226d586 source/common/quant.cpp
> --- a/source/common/quant.cpp   Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/common/quant.cpp   Sat Apr 25 01:39:55 2015 -0500
> @@ -225,16 +225,15 @@
>      X265_FREE(m_fencShortBuf);
>  }
>
> -void Quant::setQPforQuant(const CUData& cu)
> +void Quant::setQPforQuant(const CUData& ctu, int qp)
>  {
> -    m_tqBypass = !!cu.m_tqBypass[0];
> +    m_tqBypass = !!ctu.m_tqBypass[0];
>      if (m_tqBypass)
>          return;
> -    m_nr = m_frameNr ? &m_frameNr[cu.m_encData->m_frameEncoderID] : NULL;
> -    int qpy = cu.m_qp[0];
> -    m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
> -    setChromaQP(qpy + cu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, cu.m_chromaFormat);
> -    setChromaQP(qpy + cu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, cu.m_chromaFormat);
> +    m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
> +    m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
> +    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
> +    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
>  }
>
>  void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
> diff -r 6a0a37c01cff -r 68a13226d586 source/common/quant.h
> --- a/source/common/quant.h     Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/common/quant.h     Sat Apr 25 01:39:55 2015 -0500
> @@ -103,7 +103,7 @@
>      bool allocNoiseReduction(const x265_param& param);
>
>      /* CU setup */
> -    void setQPforQuant(const CUData& cu);
> +    void setQPforQuant(const CUData& ctu, int qp);
>
>      uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
>                            uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
> diff -r 6a0a37c01cff -r 68a13226d586 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp       Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/encoder/analysis.cpp       Sat Apr 25 01:39:55 2015 -0500
> @@ -146,18 +146,16 @@
>
>      if (m_slice->m_pps->bUseDQP)
>      {
> -        m_aqQP[0] = calculateQpforCuSize(ctu, cuGeom);
> -        setLambdaFromQP(*m_slice, m_aqQP[0]);
> -        m_aqQP[0] = x265_clip3(QP_MIN, QP_MAX_SPEC, m_aqQP[0]);
> -        ctu.setQPSubParts((int8_t)m_aqQP[0], 0, 0);
> +        m_aqQP[0] = setLambdaFromQP(ctu, calculateQpforCuSize(ctu, cuGeom));
>
>          if (m_slice->m_pps->maxCuDQPDepth)
>              initAqQPs(1, ctu, &cuGeom + 1);
>      }
>      else
> -        m_aqQP[0] = m_slice->m_sliceQp;
> +        /* adaptive quant disabled, CTU QP is always slice QP, and within spec range */
> +        m_aqQP[0] = setLambdaFromQP(ctu, m_slice->m_sliceQp);
>
> -    m_quant.setQPforQuant(ctu);
> +    ctu.setQPSubParts((int8_t)m_aqQP[0], 0, 0);
>      m_rqt[0].cur.load(initialContext);
>      m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
>
> @@ -231,20 +229,24 @@
>          return;
>      else if (md.bestMode->cu.isIntra(0))
>      {
> +        m_quant.m_tqBypass = true;
>          md.pred[PRED_LOSSLESS].initCosts();
>          md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
>          PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
>          uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
>          checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
>          checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
> +        m_quant.m_tqBypass = false;
>      }
>      else
>      {
> +        m_quant.m_tqBypass = true;
>          md.pred[PRED_LOSSLESS].initCosts();
>          md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
>          md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
>          encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
>          checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
> +        m_quant.m_tqBypass = false;
>      }
>  }
>
> @@ -269,7 +271,6 @@
>              PartSize size = (PartSize)reusePartSizes[zOrder];
>              Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
>              mode.cu.initSubCU(parentCTU, cuGeom, qp);
> -            m_quant.setQPforQuant(mode.cu);
>              checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
>              checkBestMode(mode, depth);
>
> @@ -287,7 +288,6 @@
>      else if (mightNotSplit)
>      {
>          md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
> -        m_quant.setQPforQuant(md.pred[PRED_INTRA].cu);
>          checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
>          checkBestMode(md.pred[PRED_INTRA], depth);
>
> @@ -327,11 +327,7 @@
>                  m_rqt[nextDepth].cur.load(*nextContext);
>
>                  if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
> -                {
> -                    nextQP = m_aqQP[childGeom.index];
> -                    setLambdaFromQP(*m_slice, nextQP);
> -                    nextQP = x265_clip3(QP_MIN, QP_MAX_SPEC, nextQP);
> -                }
> +                    nextQP = setLambdaFromQP(parentCTU, m_aqQP[childGeom.index]);
>
>                  compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
>
> @@ -401,14 +397,9 @@
>      {
>          slave.m_slice = m_slice;
>          slave.m_frame = m_frame;
> -        slave.setLambdaFromQP(*m_slice, m_rdCost.m_qp);
> +        slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
>          slave.invalidateContexts(0);
> -
> -        if (m_param->rdLevel >= 5)
> -        {
> -            slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
> -            slave.m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
> -        }
> +        slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
>      }
>
>      /* perform Mode task, repeat until no more work is available */
> @@ -419,11 +410,6 @@
>              switch (pmode.modes[task])
>              {
>              case PRED_INTRA:
> -                if (&slave != this)
> -                {
> -                    slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
> -                    slave.m_quant.setQPforQuant(md.pred[PRED_INTRA].cu);
> -                }
>                  slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
>                  if (m_param->rdLevel > 2)
>                      slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
> @@ -739,11 +725,7 @@
>                  m_rqt[nextDepth].cur.load(*nextContext);
>
>                  if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
> -                {
> -                    nextQP = m_aqQP[childGeom.index];
> -                    setLambdaFromQP(*m_slice, nextQP);
> -                    nextQP = x265_clip3(QP_MIN, QP_MAX_SPEC, nextQP);
> -                }
> +                    nextQP = setLambdaFromQP(parentCTU, m_aqQP[childGeom.index]);
>
>                  compressInterCU_dist(parentCTU, childGeom, nextQP);
>
> @@ -944,7 +926,6 @@
>                      {
>                          /* generate recon pixels with no rate distortion considerations */
>                          CUData& cu = md.bestMode->cu;
> -                        m_quant.setQPforQuant(cu);
>
>                          uint32_t tuDepthRange[2];
>                          cu.getInterTUQtDepthRange(tuDepthRange, 0);
> @@ -969,7 +950,6 @@
>                      {
>                          /* generate recon pixels with no rate distortion considerations */
>                          CUData& cu = md.bestMode->cu;
> -                        m_quant.setQPforQuant(cu);
>
>                          uint32_t tuDepthRange[2];
>                          cu.getIntraTUQtDepthRange(tuDepthRange, 0);
> @@ -1020,11 +1000,7 @@
>                  m_rqt[nextDepth].cur.load(*nextContext);
>
>                  if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
> -                {
> -                    nextQP = m_aqQP[childGeom.index];
> -                    setLambdaFromQP(*m_slice, nextQP);
> -                    nextQP = x265_clip3(QP_MIN, QP_MAX_SPEC, nextQP);
> -                }
> +                    nextQP = setLambdaFromQP(parentCTU, m_aqQP[childGeom.index]);
>
>                  compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
>
> @@ -1228,11 +1204,7 @@
>                  m_rqt[nextDepth].cur.load(*nextContext);
>
>                  if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
> -                {
> -                    nextQP = m_aqQP[childGeom.index];
> -                    setLambdaFromQP(*m_slice, nextQP);
> -                    nextQP = x265_clip3(QP_MIN, QP_MAX_SPEC, nextQP);
> -                }
> +                    nextQP = setLambdaFromQP(parentCTU, m_aqQP[childGeom.index]);
>
>                  compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);
>
> @@ -1758,7 +1730,6 @@
>      CUData& cu = bestMode->cu;
>
>      cu.copyFromPic(ctu, cuGeom);
> -    m_quant.setQPforQuant(cu);
>
>      Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
>      if (cuGeom.depth)
> diff -r 6a0a37c01cff -r 68a13226d586 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp   Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/encoder/frameencoder.cpp   Sat Apr 25 01:39:55 2015 -0500
> @@ -851,12 +851,11 @@
>
>          if (m_param->rc.aqMode || bIsVbv)
>          {
> +            X265_CHECK(slice->m_pps->bUseDQP, "adaptive quant in use without DQP\n");
>              int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp);
>              qp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
>              curEncData.m_rowStat[row].sumQpAq += qp;
>          }
> -        else
> -            tld.analysis.setLambdaFromQP(*slice, slice->m_sliceQp);

Note that this change raises the question of whether calcQpForCu() is
necessary any more in the frame encoder. The returned QP is only used
to update the row sum QP. compressCTU() is free to use an entirely
different QP (or QPs per CU).

My guess is that rate control would work better if sumQpAq were
incremented *after* the CTU was compressed, based on the average of
the QPs actually used to code the block. And this would also be more
work efficient.

>          if (m_param->bEnableWavefront && !col && row)
>          {
> diff -r 6a0a37c01cff -r 68a13226d586 source/encoder/rdcost.h
> --- a/source/encoder/rdcost.h   Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/encoder/rdcost.h   Sat Apr 25 01:39:55 2015 -0500
> @@ -40,12 +40,13 @@
>      uint32_t  m_chromaDistWeight[2];
>      uint32_t  m_psyRdBase;
>      uint32_t  m_psyRd;
> -    int       m_qp;
> +    int       m_qp; /* QP used to configure lambda, may be higher than QP_MAX_SPEC but <= QP_MAX_MAX */
>
>      void setPsyRdScale(double scale)                { m_psyRdBase = (uint32_t)floor(65536.0 * scale * 0.33); }
>
>      void setQP(const Slice& slice, int qp)
>      {
> +        x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
>          m_qp = qp;
>
>          /* Scale PSY RD factor by a slice type factor */
> diff -r 6a0a37c01cff -r 68a13226d586 source/encoder/search.cpp
> --- a/source/encoder/search.cpp Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/encoder/search.cpp Sat Apr 25 01:39:55 2015 -0500
> @@ -163,11 +163,16 @@
>      X265_FREE(m_tsRecon);
>  }
>
> -void Search::setLambdaFromQP(const Slice& slice, int qp)
> +int Search::setLambdaFromQP(const CUData& ctu, int qp)
>  {
> -    x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
> +    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
> +
>      m_me.setQP(qp);
> -    m_rdCost.setQP(slice, qp);
> +    m_rdCost.setQP(*m_slice, qp);
> +
> +    int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
> +    m_quant.setQPforQuant(ctu, quantQP);
> +    return quantQP;
>  }
>
>  #if CHECKED_BUILD || _DEBUG
> @@ -1364,8 +1369,6 @@
>      X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
>      X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
>
> -    m_quant.setQPforQuant(cu);
> -
>      uint32_t tuDepthRange[2];
>      cu.getIntraTUQtDepthRange(tuDepthRange, 0);
>
> @@ -1888,10 +1891,9 @@
>      /* Setup slave Search instance for ME for master's CU */
>      if (&slave != this)
>      {
> -        slave.setLambdaFromQP(*m_slice, m_rdCost.m_qp);
>          slave.m_slice = m_slice;
>          slave.m_frame = m_frame;
> -
> +        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
>          slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
>      }
>
> @@ -2523,9 +2525,6 @@
>      uint32_t log2CUSize = cuGeom.log2CUSize;
>      int sizeIdx = log2CUSize - 2;
>
> -    uint32_t tqBypass = cu.m_tqBypass[0];
> -    m_quant.setQPforQuant(interMode.cu);
> -
>      resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
>
>      uint32_t tuDepthRange[2];
> @@ -2536,6 +2535,7 @@
>      Cost costs;
>      estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
>
> +    uint32_t tqBypass = cu.m_tqBypass[0];
>      if (!tqBypass)
>      {
>          uint32_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
> diff -r 6a0a37c01cff -r 68a13226d586 source/encoder/search.h
> --- a/source/encoder/search.h   Sat Apr 25 00:39:48 2015 -0500
> +++ b/source/encoder/search.h   Sat Apr 25 01:39:55 2015 -0500
> @@ -287,7 +287,7 @@
>      ~Search();
>
>      bool     initSearch(const x265_param& param, ScalingList& scalingList);
> -    void     setLambdaFromQP(const Slice& slice, int qp);
> +    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
>
>      // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
>      void     invalidateContexts(int fromDepth);

-- 
Steve Borho