[x265] [PATCH] analysis: add an additional round of sub-pel refinement for inter 2Nx2N in rd 5 and 6

Steve Borho steve at borho.org
Tue May 19 18:05:55 CEST 2015


On 05/19, santhoshini at multicorewareinc.com wrote:
> # HG changeset patch
> # User Santhoshini Sekar<santhoshini at multicorewareinc.com>
> # Date 1432028003 -19800
> #      Tue May 19 15:03:23 2015 +0530
> # Node ID 904ac8808858baaeaaa333b5a105af50c1107db0
> # Parent  d7b100e51e828833eee006f1da93e499ac161d28
> analysis: add an additional round of sub-pel refinement for inter 2Nx2N in rd 5 and 6
> 
> diff -r d7b100e51e82 -r 904ac8808858 source/common/cudata.cpp
> --- a/source/common/cudata.cpp	Mon May 18 18:24:08 2015 -0500
> +++ b/source/common/cudata.cpp	Tue May 19 15:03:23 2015 +0530
> @@ -456,6 +456,41 @@
>      memcpy(ctu.m_trCoeff[2] + tmpC2, m_trCoeff[2], sizeof(coeff_t) * tmpC);
>  }
>  
> +void CUData::copyToCU(CUData& ctu) const
> +{
> +    m_partCopy((uint8_t*)ctu.m_qp, (uint8_t*)m_qp);
> +    m_partCopy(ctu.m_log2CUSize, m_log2CUSize);
> +    m_partCopy(ctu.m_lumaIntraDir, m_lumaIntraDir);
> +    m_partCopy(ctu.m_tqBypass, m_tqBypass);
> +    m_partCopy((uint8_t*)ctu.m_refIdx[0], (uint8_t*)m_refIdx[0]);
> +    m_partCopy((uint8_t*)ctu.m_refIdx[1], (uint8_t*)m_refIdx[1]);
> +    m_partCopy(ctu.m_cuDepth, m_cuDepth);
> +    m_partCopy(ctu.m_predMode, m_predMode);
> +    m_partCopy(ctu.m_partSize, m_partSize);
> +    m_partCopy(ctu.m_mergeFlag, m_mergeFlag);
> +    m_partCopy(ctu.m_interDir, m_interDir);
> +    m_partCopy(ctu.m_mvpIdx[0], m_mvpIdx[0]);
> +    m_partCopy(ctu.m_mvpIdx[1], m_mvpIdx[1]);
> +    m_partCopy(ctu.m_tuDepth, m_tuDepth);
> +    m_partCopy(ctu.m_transformSkip[0], m_transformSkip[0]);
> +    m_partCopy(ctu.m_transformSkip[1], m_transformSkip[1]);
> +    m_partCopy(ctu.m_transformSkip[2], m_transformSkip[2]);
> +    m_partCopy(ctu.m_cbf[0], m_cbf[0]);
> +    m_partCopy(ctu.m_cbf[1], m_cbf[1]);
> +    m_partCopy(ctu.m_cbf[2], m_cbf[2]);
> +    m_partCopy(ctu.m_chromaIntraDir, m_chromaIntraDir);
> +
> +    memcpy(ctu.m_mv[0],  m_mv[0],  m_numPartitions * sizeof(MV));
> +    memcpy(ctu.m_mv[1],  m_mv[1],  m_numPartitions * sizeof(MV));
> +    memcpy(ctu.m_mvd[0], m_mvd[0], m_numPartitions * sizeof(MV));
> +    memcpy(ctu.m_mvd[1], m_mvd[1], m_numPartitions * sizeof(MV));
> +
> +    memcpy(ctu.m_trCoeff[0], m_trCoeff[0], sizeof(coeff_t));
> +
> +    memcpy(ctu.m_trCoeff[1], m_trCoeff[1], sizeof(coeff_t));
> +    memcpy(ctu.m_trCoeff[2], m_trCoeff[2], sizeof(coeff_t));
whitespace nit: remove the blank line above, between the m_trCoeff[0] and m_trCoeff[1] copies
> +}
> +
>  /* The reverse of copyToPic, called only by encodeResidue */
>  void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
>  {
> diff -r d7b100e51e82 -r 904ac8808858 source/common/cudata.h
> --- a/source/common/cudata.h	Mon May 18 18:24:08 2015 -0500
> +++ b/source/common/cudata.h	Tue May 19 15:03:23 2015 +0530
> @@ -188,6 +188,7 @@
>      void     copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx);
>      void     setEmptyPart(const CUGeom& childGeom, uint32_t subPartIdx);
>      void     copyToPic(uint32_t depth) const;
> +    void     copyToCU(CUData& ctu) const;
>  
>      /* RD-0 methods called only from encodeResidue */
>      void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
> diff -r d7b100e51e82 -r 904ac8808858 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp	Mon May 18 18:24:08 2015 -0500
> +++ b/source/encoder/analysis.cpp	Tue May 19 15:03:23 2015 +0530
> @@ -739,7 +739,31 @@
>          cuStat.count[depth] += 1;
>          cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
>      }
> +    /* If zero-residual, do not bother doing subpelRefine */
> +    bool subpelRefine = !!(md.bestMode->cu.m_predMode[0] & MODE_INTER) && !(md.bestMode->cu.m_mergeFlag[0]) && (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) && (md.bestMode->cu.m_cuDepth[0] == depth);
> +    if (subpelRefine && m_param->rdLevel > 4)
> +    {
> +        int hpelDirs = MotionEstimate::hpelDirCount(m_param->subpelRefine);
>  
> +        Mode* rdRefine = &md.pred[PRED_RD_REFINE];
> +        rdRefine->initCosts();
> +        rdRefine->cu.initSubCU(parentCTU, cuGeom, qp);
> +        memcpy(rdRefine->bestME[0], md.bestMode->bestME[0], sizeof(MotionData));
> +        if (m_slice->m_sliceType == B_SLICE)
> +            memcpy(&rdRefine->bestME[0][1], &md.bestMode->bestME[0][1], sizeof(MotionData));

using copyToCU() here means cu.initSubCU() was probably a waste of time

> +        md.bestMode->cu.copyToCU(rdRefine->cu);
> +        rdRefine->reconYuv.copyFromYuv(md.bestMode->reconYuv);
> +        rdRefine->predYuv.copyFromYuv(md.bestMode->predYuv);
> +
> +        for (int i = 1; i <= hpelDirs; i++)
> +        {
> +            qPelRefine(*rdRefine, cuGeom, true, i);
> +            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)
> +                setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, cuGeom));

Why is this necessary here? calculateQpforCuSize(parentCTU, cuGeom)
had better return the same 'qp' value that is in bestMode->cu, or else
you are in a lot of trouble. And why do this inside the loop?

> +            encodeResAndCalcRdInterCU(*rdRefine, cuGeom);
> +            checkBestMode(*rdRefine, depth);
> +        }

I fail to see how this works. If hpelDirs is 8, and at i=1 the RD cost
is better than bestMode, then checkBestMode() will point bestMode at
rdRefine. So far so good... but then at i=8 rdRefine could have a worse
cost and you're stuck, because checkBestMode() is not going to go back
to the original mode; there is only one bestMode pointer.

You can't avoid doing an extra encodeResAndCalcRdInterCU() at the end,
so you might as well forget about the extra PRED_RD_REFINE.

    int bcost = bestMode->rdCost;
    int bdir = 0;

    for (int i = 1; i <= hpelDirs; i++)
    {
       qPelRefine(*bestMode, cuGeom, true, i);
       encodeResAndCalcRdInterCU(*bestMode, cuGeom);
       COPY2_IF_LT(bcost, bestMode->rdCost, bdir, i);
    }

    qPelRefine(*bestMode, cuGeom, true, bdir);
    encodeResAndCalcRdInterCU(*bestMode, cuGeom);

>      /* Copy best data to encData CTU and recon */
>      md.bestMode->cu.copyToPic(depth);
>      if (md.bestMode != &md.pred[PRED_SPLIT])
> @@ -1207,7 +1231,31 @@
>          checkDQPForSplitPred(*splitPred, cuGeom);
>          checkBestMode(*splitPred, depth);
>      }
> +    /* If zero-residual, do not bother doing subpelRefine */
> +    bool subpelRefine = !!(md.bestMode->cu.m_predMode[0] & MODE_INTER) && !(md.bestMode->cu.m_mergeFlag[0]) && (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) && (md.bestMode->cu.m_cuDepth[0] == depth);
> +    if (subpelRefine)
> +    {
> +        int hpelDirs = MotionEstimate::hpelDirCount(m_param->subpelRefine);
>  
> +        Mode* rdRefine = &md.pred[PRED_RD_REFINE];
> +        rdRefine->initCosts();
> +        rdRefine->cu.initSubCU(parentCTU, cuGeom, qp);
> +        memcpy(rdRefine->bestME[0], md.bestMode->bestME[0], sizeof(MotionData));
> +        if (m_slice->m_sliceType == B_SLICE)
> +            memcpy(&rdRefine->bestME[0][1], &md.bestMode->bestME[0][1], sizeof(MotionData));
> +        md.bestMode->cu.copyToCU(rdRefine->cu);
> +        rdRefine->reconYuv.copyFromYuv(md.bestMode->reconYuv);
> +        rdRefine->predYuv.copyFromYuv(md.bestMode->predYuv);
> +
> +        for (int i = 1; i <= hpelDirs; i++)
> +        {
> +            qPelRefine(*rdRefine, cuGeom, true, i);
> +            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)
> +                setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, cuGeom));
> +            encodeResAndCalcRdInterCU(*rdRefine, cuGeom);
> +            checkBestMode(*rdRefine, depth);
> +        }
> +    }
>      /* Copy best data to encData CTU and recon */
>      md.bestMode->cu.copyToPic(depth);
>      if (md.bestMode != &md.pred[PRED_SPLIT])
> diff -r d7b100e51e82 -r 904ac8808858 source/encoder/analysis.h
> --- a/source/encoder/analysis.h	Mon May 18 18:24:08 2015 -0500
> +++ b/source/encoder/analysis.h	Tue May 19 15:03:23 2015 +0530
> @@ -59,6 +59,7 @@
>          PRED_nRx2N,
>          PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */
>          PRED_LOSSLESS,  /* lossless encode of best mode */
> +        PRED_RD_REFINE,
>          MAX_PRED_TYPES
>      };
>  
> diff -r d7b100e51e82 -r 904ac8808858 source/encoder/motion.cpp
> --- a/source/encoder/motion.cpp	Mon May 18 18:24:08 2015 -0500
> +++ b/source/encoder/motion.cpp	Tue May 19 15:03:23 2015 +0530
> @@ -155,6 +155,11 @@
>             workload[subme].qpel_iters / 2;
>  }
>  
> +int MotionEstimate::hpelDirCount(int subme)
> +{
> +    return workload[subme].hpel_dirs;
> +}
> +
>  MotionEstimate::~MotionEstimate()
>  {
>      fencPUYuv.destroy();
> @@ -1205,6 +1210,49 @@
>      return bcost;
>  }
>  
> +int MotionEstimate::qPelCompare(ReferencePlanes *ref,
> +                                   const MV &       mvmin,
> +                                   const MV &       mvmax,
> +                                   const MV&        mvp,
> +                                   const MV&        mv,
> +                                   MV &             outQMv,
> +                                   int halfPelIdx)
> +{
> +    setMVP(mvp);
> +
> +    MV qmvmin = mvmin.toQPel();
> +    MV qmvmax = mvmax.toQPel();
> +
> +    MV fmv = mv.roundToFPel();
> +    fmv = fmv.clipped(qmvmin, qmvmax);
> +    int bcost = INT_MAX;
> +    const SubpelWorkload& wl = workload[this->subpelRefine];
> +
> +    MV hmv = fmv + square1[halfPelIdx] * 2;
> +    bcost = subpelCompare(ref, hmv, satd) + mvcost(hmv);
> +    MV bmv = hmv;
> +
> +    for (int iter = 0; iter < wl.qpel_iters; iter++)
> +    {
> +        int bdir = 0;
> +        for (int i = 1; i <= wl.qpel_dirs; i++)
> +        {
> +            MV qmv = hmv + square1[i];
> +            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
> +            COPY2_IF_LT(bcost, cost, bdir, i);
> +        }
> +
> +        if (bdir)
> +            bmv += square1[bdir];
> +        else
> +            break;
> +    }
> +
> +    x265_emms();
> +    outQMv = bmv;
> +    return bcost;
> +}
> +
>  int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
>  {
>      intptr_t refStride = ref->lumaStride;
> diff -r d7b100e51e82 -r 904ac8808858 source/encoder/motion.h
> --- a/source/encoder/motion.h	Mon May 18 18:24:08 2015 -0500
> +++ b/source/encoder/motion.h	Tue May 19 15:03:23 2015 +0530
> @@ -69,6 +69,7 @@
>  
>      static void initScales();
>      static int hpelIterationCount(int subme);
> +    static int hpelDirCount(int subme);
>      void init(int method, int refine, int csp);
>  
>      /* Methods called at slice setup */
> @@ -90,6 +91,7 @@
>      }
>  
>      int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
> +    int qPelCompare(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & mvp, const MV & mv, MV & outQMv, int halfPelIdx);
>  
>      int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
>  
> diff -r d7b100e51e82 -r 904ac8808858 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Mon May 18 18:24:08 2015 -0500
> +++ b/source/encoder/search.cpp	Tue May 19 15:03:23 2015 +0530
> @@ -2299,6 +2299,54 @@
>      interMode.sa8dBits += totalmebits;
>  }
>  
> +void Search::qPelRefine(Mode& interMode, const CUGeom& cuGeom, bool bChromaSA8D, int halfpelIdx)
> +{
> +    CUData& cu = interMode.cu;
> +    Yuv* predYuv = &interMode.predYuv;
> +
> +    const Slice *slice = m_slice;
> +    uint32_t interDir = cu.m_interDir[0];
> +
> +    const int* numRefIdx = slice->m_numRefIdx;
> +
> +    MotionData* bestME = interMode.bestME[0];
> +    PredictionUnit pu(cu, cuGeom, 0);
> +
> +    for (uint32_t list = 0; list < 2; list++)
> +    {
> +        if (interDir & (1 << list))
> +        {
> +            int ref = bestME[list].ref;
> +            uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
> +            bits += getTUBits(ref, numRefIdx[list]);
> +
> +            int merange = m_param->searchRange;
> +
> +            MV mvmin, mvmax, outmv, mvp = interMode.bestME[0][0].mvp;
> +            MV mv = interMode.bestME[0][0].mv;
> +
> +            int satdCost;
> +            setSearchRange(cu, mv, merange, mvmin, mvmax);
> +            satdCost = m_me.qPelCompare(&slice->m_mref[list][ref], mvmin, mvmax, mvp, mv, outmv, halfpelIdx);
> +
> +            /* Get total cost of partition, but only include MV bit cost once */
> +            bits += m_me.bitcost(outmv);
> +            uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
> +
> +            if (cost < bestME[list].cost)
> +            {
> +                bestME[list].mv = outmv;
> +                bestME[list].cost = cost;
> +                bestME[list].bits = bits;
> +            }
> +        }
> +    }
> +
> +    motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
> +
> +    X265_CHECK(interMode.ok(), "inter mode is not ok");
> +}
> +
>  void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
>  {
>      if (cuMode == SIZE_2Nx2N)
> diff -r d7b100e51e82 -r 904ac8808858 source/encoder/search.h
> --- a/source/encoder/search.h	Mon May 18 18:24:08 2015 -0500
> +++ b/source/encoder/search.h	Tue May 19 15:03:23 2015 +0530
> @@ -302,6 +302,7 @@
>  
>      // estimation inter prediction (non-skip)
>      void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC);
> +    void     qPelRefine(Mode& interMode, const CUGeom& cuGeom, bool bChroma, int halfpelIdx);
>  
>      // encode residual and compute rd-cost for inter mode
>      void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list