[x265] [PATCH] analysis: add an additional round of sub-pel refinement for inter 2Nx2N in rd 5 and 6

Wed May 20 11:05:51 CEST 2015

On Tue, May 19, 2015 at 9:35 PM, Steve Borho <steve at borho.org> wrote:

> On 05/19, santhoshini at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Santhoshini Sekar<santhoshini at multicorewareinc.com>
> > # Date 1432028003 -19800
> > #      Tue May 19 15:03:23 2015 +0530
> > # Node ID 904ac8808858baaeaaa333b5a105af50c1107db0
> > # Parent  d7b100e51e828833eee006f1da93e499ac161d28
> > analysis: add an additional round of sub-pel refinement for inter 2Nx2N
> in rd 5 and 6
> >
> > diff -r d7b100e51e82 -r 904ac8808858 source/common/cudata.cpp
> > --- a/source/common/cudata.cpp        Mon May 18 18:24:08 2015 -0500
> > +++ b/source/common/cudata.cpp        Tue May 19 15:03:23 2015 +0530
> > @@ -456,6 +456,41 @@
> >      memcpy(ctu.m_trCoeff[2] + tmpC2, m_trCoeff[2], sizeof(coeff_t) *
> tmpC);
> >  }
> >
> > +void CUData::copyToCU(CUData& ctu) const
> > +{
> > +    m_partCopy((uint8_t*)ctu.m_qp, (uint8_t*)m_qp);
> > +    m_partCopy(ctu.m_log2CUSize, m_log2CUSize);
> > +    m_partCopy(ctu.m_lumaIntraDir, m_lumaIntraDir);
> > +    m_partCopy(ctu.m_tqBypass, m_tqBypass);
> > +    m_partCopy((uint8_t*)ctu.m_refIdx[0], (uint8_t*)m_refIdx[0]);
> > +    m_partCopy((uint8_t*)ctu.m_refIdx[1], (uint8_t*)m_refIdx[1]);
> > +    m_partCopy(ctu.m_cuDepth, m_cuDepth);
> > +    m_partCopy(ctu.m_predMode, m_predMode);
> > +    m_partCopy(ctu.m_partSize, m_partSize);
> > +    m_partCopy(ctu.m_mergeFlag, m_mergeFlag);
> > +    m_partCopy(ctu.m_interDir, m_interDir);
> > +    m_partCopy(ctu.m_mvpIdx[0], m_mvpIdx[0]);
> > +    m_partCopy(ctu.m_mvpIdx[1], m_mvpIdx[1]);
> > +    m_partCopy(ctu.m_tuDepth, m_tuDepth);
> > +    m_partCopy(ctu.m_transformSkip[0], m_transformSkip[0]);
> > +    m_partCopy(ctu.m_transformSkip[1], m_transformSkip[1]);
> > +    m_partCopy(ctu.m_transformSkip[2], m_transformSkip[2]);
> > +    m_partCopy(ctu.m_cbf[0], m_cbf[0]);
> > +    m_partCopy(ctu.m_cbf[1], m_cbf[1]);
> > +    m_partCopy(ctu.m_cbf[2], m_cbf[2]);
> > +    m_partCopy(ctu.m_chromaIntraDir, m_chromaIntraDir);
> > +
> > +    memcpy(ctu.m_mv[0],  m_mv[0],  m_numPartitions * sizeof(MV));
> > +    memcpy(ctu.m_mv[1],  m_mv[1],  m_numPartitions * sizeof(MV));
> > +    memcpy(ctu.m_mvd[0], m_mvd[0], m_numPartitions * sizeof(MV));
> > +    memcpy(ctu.m_mvd[1], m_mvd[1], m_numPartitions * sizeof(MV));
> > +
> > +    memcpy(ctu.m_trCoeff[0], m_trCoeff[0], sizeof(coeff_t));
> > +
> > +    memcpy(ctu.m_trCoeff[1], m_trCoeff[1], sizeof(coeff_t));
> > +    memcpy(ctu.m_trCoeff[2], m_trCoeff[2], sizeof(coeff_t));
> w/s nit, remove the blank line above
> > +}
> > +
> >  /* The reverse of copyToPic, called only by encodeResidue */
> >  void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
> >  {
> > diff -r d7b100e51e82 -r 904ac8808858 source/common/cudata.h
> > --- a/source/common/cudata.h  Mon May 18 18:24:08 2015 -0500
> > +++ b/source/common/cudata.h  Tue May 19 15:03:23 2015 +0530
> > @@ -188,6 +188,7 @@
> >      void     copyPartFrom(const CUData& cu, const CUGeom& childGeom,
> uint32_t subPartIdx);
> >      void     setEmptyPart(const CUGeom& childGeom, uint32_t subPartIdx);
> >      void     copyToPic(uint32_t depth) const;
> > +    void     copyToCU(CUData& ctu) const;
> >
> >      /* RD-0 methods called only from encodeResidue */
> >      void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
> > diff -r d7b100e51e82 -r 904ac8808858 source/encoder/analysis.cpp
> > --- a/source/encoder/analysis.cpp     Mon May 18 18:24:08 2015 -0500
> > +++ b/source/encoder/analysis.cpp     Tue May 19 15:03:23 2015 +0530
> > @@ -739,7 +739,31 @@
> >          cuStat.count[depth] += 1;
> >          cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) /
> cuStat.count[depth];
> >      }
> > +    /* If zero-residual, do not bother doing subpelRefine */
> > +    bool subpelRefine = !!(md.bestMode->cu.m_predMode[0] & MODE_INTER)
> && !(md.bestMode->cu.m_mergeFlag[0]) && (md.bestMode->cu.m_partSize[0] ==
> SIZE_2Nx2N) && (md.bestMode->cu.m_cuDepth[0] == depth);
> > +    if (subpelRefine && m_param->rdLevel > 4)
> > +    {
> > +        int hpelDirs =
> MotionEstimate::hpelDirCount(m_param->subpelRefine);
> >
> > +        Mode* rdRefine = &md.pred[PRED_RD_REFINE];
> > +        rdRefine->initCosts();
> > +        rdRefine->cu.initSubCU(parentCTU, cuGeom, qp);
> > +        memcpy(rdRefine->bestME[0], md.bestMode->bestME[0],
> sizeof(MotionData));
> > +        if (m_slice->m_sliceType == B_SLICE)
> > +            memcpy(&rdRefine->bestME[0][1], &md.bestMode->bestME[0][1],
> sizeof(MotionData));
>
> using copyToCU() here means cu.initSubCU() was probably a waste of time
>
> > +        md.bestMode->cu.copyToCU(rdRefine->cu);
> > +        rdRefine->reconYuv.copyFromYuv(md.bestMode->reconYuv);
> > +        rdRefine->predYuv.copyFromYuv(md.bestMode->predYuv);
> > +
> > +        for (int i = 1; i <= hpelDirs; i++)
> > +        {
> > +            qPelRefine(*rdRefine, cuGeom, true, i);
> > +            if (m_slice->m_pps->bUseDQP && depth <=
> m_slice->m_pps->maxCuDQPDepth)
> > +                setLambdaFromQP(parentCTU,
> calculateQpforCuSize(parentCTU, cuGeom));
>
> why is this necessary here? calculateQpforCuSize(parentCTU, cuGeom))
> better return the same 'qp' value in bestMode->cu else you are in a lot
> of trouble. Why do this inside the loop?
>

We need this step here because md.bestMode->cu.m_qp[0] is changed when there
is no residual: checkDQP() overwrites it with the RefQP. We want to pass the
original QP instead. You are right that this need not be inside the loop; it
can be done just once.

> > +            encodeResAndCalcRdInterCU(*rdRefine, cuGeom);
> > +            checkBestMode(*rdRefine, depth);
> > +        }
>
> I fail to see how this works. If hpelDirs is 8, and at i=1 the RD cost
> is better than bestMode, then checkBestMode will point to rdRefine. So
> far so good.. but then at i=8 it could have worse cost and you're stuck
> because checkBestMode() is not going to go back to the original mode,
> there is only one bestMode pointer.
>
Why do we want to go back to the original best mode? Can't we use the best
mode chosen after each subpel refinement?

> You can't avoid doing an extra encodeResAndCalcRdInterCU() at the end,
> so you might as well forget about the extra PRED_RD_REFINE.
>
By using PRED_RD_REFINE, all the temporary data is kept in it, so I do not
need the final extra encode: the md.bestMode pointer will always refer to the
best data.

>
>     int bcost = bestMode->rdCost;
>     int bdir = 0;
>
>     for (int i = 1; i <= hpelDirs; i++)
>     {
>        qPelRefine(*bestMode, cuGeom, true, i);
>        encodeResAndCalcRdInterCU(*bestMode, cuGeom);
>        COPY2_IF_LT(bcost, bestMode->rdCost, bdir, i);
>     }
>
>     qPelRefine(*bestMode, cuGeom, true, bdir);
>     encodeResAndCalcRdInterCU(*bestMode, cuGeom);
>
> >      /* Copy best data to encData CTU and recon */
> >      md.bestMode->cu.copyToPic(depth);
> >      if (md.bestMode != &md.pred[PRED_SPLIT])
> > @@ -1207,7 +1231,31 @@
> >          checkDQPForSplitPred(*splitPred, cuGeom);
> >          checkBestMode(*splitPred, depth);
> >      }
> > +    /* If zero-residual, do not bother doing subpelRefine */
> > +    bool subpelRefine = !!(md.bestMode->cu.m_predMode[0] & MODE_INTER)
> && !(md.bestMode->cu.m_mergeFlag[0]) && (md.bestMode->cu.m_partSize[0] ==
> SIZE_2Nx2N) && (md.bestMode->cu.m_cuDepth[0] == depth);
> > +    if (subpelRefine)
> > +    {
> > +        int hpelDirs =
> MotionEstimate::hpelDirCount(m_param->subpelRefine);
> >
> > +        Mode* rdRefine = &md.pred[PRED_RD_REFINE];
> > +        rdRefine->initCosts();
> > +        rdRefine->cu.initSubCU(parentCTU, cuGeom, qp);
> > +        memcpy(rdRefine->bestME[0], md.bestMode->bestME[0],
> sizeof(MotionData));
> > +        if (m_slice->m_sliceType == B_SLICE)
> > +            memcpy(&rdRefine->bestME[0][1], &md.bestMode->bestME[0][1],
> sizeof(MotionData));
> > +        md.bestMode->cu.copyToCU(rdRefine->cu);
> > +        rdRefine->reconYuv.copyFromYuv(md.bestMode->reconYuv);
> > +        rdRefine->predYuv.copyFromYuv(md.bestMode->predYuv);
> > +
> > +        for (int i = 1; i <= hpelDirs; i++)
> > +        {
> > +            qPelRefine(*rdRefine, cuGeom, true, i);
> > +            if (m_slice->m_pps->bUseDQP && depth <=
> m_slice->m_pps->maxCuDQPDepth)
> > +                setLambdaFromQP(parentCTU,
> calculateQpforCuSize(parentCTU, cuGeom));
> > +            encodeResAndCalcRdInterCU(*rdRefine, cuGeom);
> > +            checkBestMode(*rdRefine, depth);
> > +        }
> > +    }
> >      /* Copy best data to encData CTU and recon */
> >      md.bestMode->cu.copyToPic(depth);
> >      if (md.bestMode != &md.pred[PRED_SPLIT])
> > diff -r d7b100e51e82 -r 904ac8808858 source/encoder/analysis.h
> > --- a/source/encoder/analysis.h       Mon May 18 18:24:08 2015 -0500
> > +++ b/source/encoder/analysis.h       Tue May 19 15:03:23 2015 +0530
> > @@ -59,6 +59,7 @@
> >          PRED_nRx2N,
> >          PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */
> >          PRED_LOSSLESS,  /* lossless encode of best mode */
> > +        PRED_RD_REFINE,
> >          MAX_PRED_TYPES
> >      };
> >
> > diff -r d7b100e51e82 -r 904ac8808858 source/encoder/motion.cpp
> > --- a/source/encoder/motion.cpp       Mon May 18 18:24:08 2015 -0500
> > +++ b/source/encoder/motion.cpp       Tue May 19 15:03:23 2015 +0530
> > @@ -155,6 +155,11 @@
> >             workload[subme].qpel_iters / 2;
> >  }
> >
> > +int MotionEstimate::hpelDirCount(int subme)
> > +{
> > +    return workload[subme].hpel_dirs;
> > +}
> > +
> >  MotionEstimate::~MotionEstimate()
> >  {
> >      fencPUYuv.destroy();
> > @@ -1205,6 +1210,49 @@
> >      return bcost;
> >  }
> >
> > +int MotionEstimate::qPelCompare(ReferencePlanes *ref,
> > +                                   const MV &       mvmin,
> > +                                   const MV &       mvmax,
> > +                                   const MV&        mvp,
> > +                                   const MV&        mv,
> > +                                   MV &             outQMv,
> > +                                   int halfPelIdx)
> > +{
> > +    setMVP(mvp);
> > +
> > +    MV qmvmin = mvmin.toQPel();
> > +    MV qmvmax = mvmax.toQPel();
> > +
> > +    MV fmv = mv.roundToFPel();
> > +    fmv = fmv.clipped(qmvmin, qmvmax);
> > +    int bcost = INT_MAX;
> > +    const SubpelWorkload& wl = workload[this->subpelRefine];
> > +
> > +    MV hmv = fmv + square1[halfPelIdx] * 2;
> > +    bcost = subpelCompare(ref, hmv, satd) + mvcost(hmv);
> > +    MV bmv = hmv;
> > +
> > +    for (int iter = 0; iter < wl.qpel_iters; iter++)
> > +    {
> > +        int bdir = 0;
> > +        for (int i = 1; i <= wl.qpel_dirs; i++)
> > +        {
> > +            MV qmv = hmv + square1[i];
> > +            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
> > +            COPY2_IF_LT(bcost, cost, bdir, i);
> > +        }
> > +
> > +        if (bdir)
> > +            bmv += square1[bdir];
> > +        else
> > +            break;
> > +    }
> > +
> > +    x265_emms();
> > +    outQMv = bmv;
> > +    return bcost;
> > +}
> > +
> >  int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv,
> pixelcmp_t cmp)
> >  {
> >      intptr_t refStride = ref->lumaStride;
> > diff -r d7b100e51e82 -r 904ac8808858 source/encoder/motion.h
> > --- a/source/encoder/motion.h Mon May 18 18:24:08 2015 -0500
> > +++ b/source/encoder/motion.h Tue May 19 15:03:23 2015 +0530
> > @@ -69,6 +69,7 @@
> >
> >      static void initScales();
> >      static int hpelIterationCount(int subme);
> > +    static int hpelDirCount(int subme);
> >      void init(int method, int refine, int csp);
> >
> >      /* Methods called at slice setup */
> > @@ -90,6 +91,7 @@
> >      }
> >
> >      int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV
> & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange,
> MV & outQMv);
> > +    int qPelCompare(ReferencePlanes* ref, const MV & mvmin, const MV &
> mvmax, const MV & mvp, const MV & mv, MV & outQMv, int halfPelIdx);
> >
> >      int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
> >
> > diff -r d7b100e51e82 -r 904ac8808858 source/encoder/search.cpp
> > --- a/source/encoder/search.cpp       Mon May 18 18:24:08 2015 -0500
> > +++ b/source/encoder/search.cpp       Tue May 19 15:03:23 2015 +0530
> > @@ -2299,6 +2299,54 @@
> >      interMode.sa8dBits += totalmebits;
> >  }
> >
> > +void Search::qPelRefine(Mode& interMode, const CUGeom& cuGeom, bool
> bChromaSA8D, int halfpelIdx)
> > +{
> > +    CUData& cu = interMode.cu;
> > +    Yuv* predYuv = &interMode.predYuv;
> > +
> > +    const Slice *slice = m_slice;
> > +    uint32_t interDir = cu.m_interDir[0];
> > +
> > +    const int* numRefIdx = slice->m_numRefIdx;
> > +
> > +    MotionData* bestME = interMode.bestME[0];
> > +    PredictionUnit pu(cu, cuGeom, 0);
> > +
> > +    for (uint32_t list = 0; list < 2; list++)
> > +    {
> > +        if (interDir & (1 << list))
> > +        {
> > +            int ref = bestME[list].ref;
> > +            uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
> > +            bits += getTUBits(ref, numRefIdx[list]);
> > +
> > +            int merange = m_param->searchRange;
> > +
> > +            MV mvmin, mvmax, outmv, mvp = interMode.bestME[0][0].mvp;
> > +            MV mv = interMode.bestME[0][0].mv;
> > +
> > +            int satdCost;
> > +            setSearchRange(cu, mv, merange, mvmin, mvmax);
> > +            satdCost = m_me.qPelCompare(&slice->m_mref[list][ref],
> mvmin, mvmax, mvp, mv, outmv, halfpelIdx);
> > +
> > +            /* Get total cost of partition, but only include MV bit
> cost once */
> > +            bits += m_me.bitcost(outmv);
> > +            uint32_t cost = (satdCost - m_me.mvcost(outmv)) +
> m_rdCost.getCost(bits);
> > +
> > +            if (cost < bestME[list].cost)
> > +            {
> > +                bestME[list].mv = outmv;
> > +                bestME[list].cost = cost;
> > +                bestME[list].bits = bits;
> > +            }
> > +        }
> > +    }
> > +
> > +    motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
> > +
> > +    X265_CHECK(interMode.ok(), "inter mode is not ok");
> > +}
> > +
> >  void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx,
> uint32_t lastMode, uint32_t blockBit[3])
> >  {
> >      if (cuMode == SIZE_2Nx2N)
> > diff -r d7b100e51e82 -r 904ac8808858 source/encoder/search.h
> > --- a/source/encoder/search.h Mon May 18 18:24:08 2015 -0500
> > +++ b/source/encoder/search.h Tue May 19 15:03:23 2015 +0530
> > @@ -302,6 +302,7 @@
> >
> >      // estimation inter prediction (non-skip)
> >      void     predInterSearch(Mode& interMode, const CUGeom& cuGeom,
> bool bChromaMC);
> > +    void     qPelRefine(Mode& interMode, const CUGeom& cuGeom, bool
> bChroma, int halfpelIdx);
> >
> >      // encode residual and compute rd-cost for inter mode
> >      void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom&
> cuGeom);
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150520/aaea6a2d/attachment-0001.html>