[x265] [PATCH] analysis: add an additional round of sub-pel refinement for inter 2Nx2N in rd 5 and 6

Thu May 21 17:25:32 CEST 2015

On 05/21, Steve Borho wrote:
> On 05/21, santhoshini at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Santhoshini Sekar<santhoshini at multicorewareinc.com>
> > # Date 1432182660 -19800
> > #      Thu May 21 10:01:00 2015 +0530
> > # Node ID 630b378b744f4bf442839680f5120d7d299d2acd
> > # Parent  dc4fcfc574ade14ecc841797ad08be9753fad58e
> > analysis: add an additional round of sub-pel refinement for inter 2Nx2N in rd 5 and 6
> > 
> > diff -r dc4fcfc574ad -r 630b378b744f source/encoder/analysis.cpp
> > --- a/source/encoder/analysis.cpp	Wed May 20 12:17:44 2015 -0500
> > +++ b/source/encoder/analysis.cpp	Thu May 21 10:01:00 2015 +0530
> > @@ -742,6 +742,24 @@
> >          cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
> >      }
> >  
> > +    /* If zero-residual, do not bother doing subpelRefine */
> > +    bool subpelRefine = !!(md.bestMode->cu.m_predMode[0] & MODE_INTER) && !(md.bestMode->cu.m_mergeFlag[0]) && (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) && (md.bestMode->cu.m_cuDepth[0] == depth);
> 
> these long expressions should be broken into two lines
> 'subpelRefine' is not a very descriptive term, this is rd-refine since
> it is refining based on RD cost
> 
> > +    if (subpelRefine && m_param->rdLevel > 4)
> > +    {
> > +        int hpelDirs = MotionEstimate::hpelDirCount(m_param->subpelRefine);
> > +        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)
> > +            setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, cuGeom));
> > +        uint64_t bcost = md.bestMode->rdCost;
> > +        int bdir = 0;
> > +        for (int i = 1; i <= hpelDirs; i++)
> > +        {
> > +            qPelRefine(*md.bestMode, cuGeom, true, i);
> > +            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
> > +            COPY2_IF_LT(bcost, md.bestMode->rdCost, bdir, i);
> > +        }
> > +        qPelRefine(*md.bestMode, cuGeom, true, bdir);
> > +        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
> > +    }
> >      /* Copy best data to encData CTU and recon */
> >      md.bestMode->cu.copyToPic(depth);
> >      if (md.bestMode != &md.pred[PRED_SPLIT])
> > @@ -1312,6 +1330,24 @@
> >          checkBestMode(*splitPred, depth);
> >      }
> >  
> > +    /* If zero-residual, do not bother doing subpelRefine */
> > +    bool subpelRefine = !!(md.bestMode->cu.m_predMode[0] & MODE_INTER) && !(md.bestMode->cu.m_mergeFlag[0]) && (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) && (md.bestMode->cu.m_cuDepth[0] == depth);
> > +    if (subpelRefine)
> > +    {
> > +        int hpelDirs = MotionEstimate::hpelDirCount(m_param->subpelRefine);
> > +        uint64_t bcost = md.bestMode->rdCost;
> > +        int bdir = 0;
> > +        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)
> > +            setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, cuGeom));
> > +        for (int i = 1; i <= hpelDirs; i++)
> > +        {
> > +            qPelRefine(*md.bestMode, cuGeom, true, i);
> > +            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
> > +            COPY2_IF_LT(bcost, md.bestMode->rdCost, bdir, i);
> > +        }
> > +        qPelRefine(*md.bestMode, cuGeom, true, bdir);
> > +        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
> > +    }
> >      /* Copy best data to encData CTU and recon */
> >      md.bestMode->cu.copyToPic(depth);
> >      if (md.bestMode != &md.pred[PRED_SPLIT])
> > diff -r dc4fcfc574ad -r 630b378b744f source/encoder/motion.cpp
> > --- a/source/encoder/motion.cpp	Wed May 20 12:17:44 2015 -0500
> > +++ b/source/encoder/motion.cpp	Thu May 21 10:01:00 2015 +0530
> > @@ -155,6 +155,11 @@
> >             workload[subme].qpel_iters / 2;
> >  }
> >  
> > +int MotionEstimate::hpelDirCount(int subme)
> > +{
> > +    return workload[subme].hpel_dirs;
> > +}
> > +
> >  MotionEstimate::~MotionEstimate()
> >  {
> >      fencPUYuv.destroy();
> > @@ -1205,6 +1210,49 @@
> >      return bcost;
> >  }
> >  
> > +int MotionEstimate::qPelCompare(ReferencePlanes *ref,
> > +                                   const MV &       mvmin,
> > +                                   const MV &       mvmax,
> > +                                   const MV&        mvp,
> > +                                   const MV&        mv,
> > +                                   MV &             outQMv,
> > +                                   int halfPelIdx)
> > +{
> > +    setMVP(mvp);
> 
> no idea why this function is necessary
> 
> > +    MV qmvmin = mvmin.toQPel();
> > +    MV qmvmax = mvmax.toQPel();
> > +
> > +    MV fmv = mv.roundToFPel();
> > +    fmv = fmv.clipped(qmvmin, qmvmax);
> > +    int bcost = INT_MAX;
> > +    const SubpelWorkload& wl = workload[this->subpelRefine];
> > +
> > +    MV hmv = fmv + square1[halfPelIdx] * 2;
> > +    bcost = subpelCompare(ref, hmv, satd) + mvcost(hmv);
> > +    MV bmv = hmv;
> > +
> > +    for (int iter = 0; iter < wl.qpel_iters; iter++)
> > +    {
> > +        int bdir = 0;
> > +        for (int i = 1; i <= wl.qpel_dirs; i++)
> > +        {
> > +            MV qmv = hmv + square1[i];
> > +            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
> > +            COPY2_IF_LT(bcost, cost, bdir, i);
> > +        }
> > +
> > +        if (bdir)
> > +            bmv += square1[bdir];
> > +        else
> > +            break;
> > +    }
> > +
> > +    x265_emms();
> > +    outQMv = bmv;
> > +    return bcost;
> > +}
> > +
> >  int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
> >  {
> >      intptr_t refStride = ref->lumaStride;
> > diff -r dc4fcfc574ad -r 630b378b744f source/encoder/motion.h
> > --- a/source/encoder/motion.h	Wed May 20 12:17:44 2015 -0500
> > +++ b/source/encoder/motion.h	Thu May 21 10:01:00 2015 +0530
> > @@ -69,6 +69,7 @@
> >  
> >      static void initScales();
> >      static int hpelIterationCount(int subme);
> > +    static int hpelDirCount(int subme);
> >      void init(int method, int refine, int csp);
> >  
> >      /* Methods called at slice setup */
> > @@ -90,6 +91,7 @@
> >      }
> >  
> >      int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
> > +    int qPelCompare(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & mvp, const MV & mv, MV & outQMv, int halfPelIdx);
> >  
> >      int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
> >  
> > diff -r dc4fcfc574ad -r 630b378b744f source/encoder/search.cpp
> > --- a/source/encoder/search.cpp	Wed May 20 12:17:44 2015 -0500
> > +++ b/source/encoder/search.cpp	Thu May 21 10:01:00 2015 +0530
> > @@ -2348,6 +2348,54 @@
> >      interMode.sa8dBits += totalmebits;
> >  }
> >  
> > +void Search::qPelRefine(Mode& interMode, const CUGeom& cuGeom, bool bChromaSA8D, int halfpelIdx)
> > +{
> > +    CUData& cu = interMode.cu;
> > +    Yuv* predYuv = &interMode.predYuv;
> > +
> > +    const Slice *slice = m_slice;
> > +    uint32_t interDir = cu.m_interDir[0];
> > +
> > +    const int* numRefIdx = slice->m_numRefIdx;
> > +
> > +    MotionData* bestME = interMode.bestME[0];
> > +    PredictionUnit pu(cu, cuGeom, 0);
> > +
> > +    for (uint32_t list = 0; list < 2; list++)
> > +    {
> > +        if (interDir & (1 << list))
> > +        {
> > +            int ref = bestME[list].ref;
> > +            uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
> > +            bits += getTUBits(ref, numRefIdx[list]);
> > +
> > +            int merange = m_param->searchRange;
> > +
> > +            MV mvmin, mvmax, outmv, mvp = interMode.bestME[0][0].mvp;
> > +            MV mv = interMode.bestME[0][0].mv;
> > +
> > +            int satdCost;
> > +            setSearchRange(cu, mv, merange, mvmin, mvmax);
> > +            satdCost = m_me.qPelCompare(&slice->m_mref[list][ref], mvmin, mvmax, mvp, mv, outmv, halfpelIdx);
> > +
> > +            /* Get total cost of partition, but only include MV bit cost once */
> > +            bits += m_me.bitcost(outmv);
> > +            uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
> > +
> > +            if (cost < bestME[list].cost)
> > +            {
> > +                bestME[list].mv = outmv;
> > +                bestME[list].cost = cost;
> > +                bestME[list].bits = bits;
> > +            }
> > +        }
> > +    }
> > +    motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
> 
> 1. bidir refinement should be a different feature; it needs to get built
>    into predInterSearch() itself. rd-refine should only be used to
>    refine a unidir prediction.
> 
> 2. Doing all this work to update bestME[] satd and bits costs is a waste
>    of time.
> 
> 3. you're not writing the new MV into the CU, so motionCompensation
>    below is just re-generating the original prediction for this CU,
>    ignoring bestME
> 
> 4. the whole point of rd-refine is to measure the full RD cost at each
>    offset. using SATD to pick a direction in qPelCompare is defeating
>    the point. All this function should be doing for hpel refine is:
> 
>    cu.m_mv[0][0] = interMode.bestME[0][0].mv + square1[halfPelIdx] * 2;
>    motionCompensation(cu, pu, *predYuv, true, true);
> 
>    and qpel refine:
> 
>    cu.m_mv[0][0] = interMode.bestME[0][0].mv + square1[halfPelIdx];
>    motionCompensation(cu, pu, *predYuv, true, true);
> 
>    it probably doesn't need to be a function at all
> 
> 
> At the end, the final refined MV must be fully configured in the CU with:
> 
>    cu.setPUMv(0, cu.m_mv[0][0], 0, 0);
>    cu.m_mvd[0][0] = cu.m_mvd[0][0].mvp - cu.m_mv[0][0];
> 
> And, thinking out loud, m_mvd probably needs to be removed from CUData
> since it can always be calculated by subtracting the MV from its MVP.
> Caching it has no useful value, except to trip us up when we forget to
> update it (we are not even setting it on bidir CUs now)

ignore this bit about m_mvd, I forgot that we're storing mvpIdx but not
the mvp itself, so we have to keep mvd.

the rest still stands, except the example code needs to be:

   int list = cu.m_interDir[0];
   if (list == 3) /* do not refine */

   /* try new hpel offset */
   cu.m_mv[list][0] = interMode.bestME[list][0].mv + square1[halfPelIdx] * 2;
   motionCompensation(cu, pu, *predYuv, true, true);

   /* save final MV */
   cu.setPUMv(list, cu.m_mv[list][0], 0, 0);
   cu.m_mvd[list][0] = bestME[0][list].mvp - cu.m_mv[0][0];

-- 
Steve Borho