[x265] [PATCH] cleanup: reduce data size and dependency on MotionEstimate

Steve Borho steve at borho.org
Tue Apr 15 20:26:50 CEST 2014


On Tue, Apr 15, 2014 at 6:30 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1397561438 -28800
> # Node ID dd78d554f78dd785cb8b16a6606b5fe6b6e87e2a
> # Parent  1cf67a7b362d24d292d7cca574cbcfe88a8eb1cb
> cleanup: reduce data size and dependency on MotionEstimate
>
> diff -r 1cf67a7b362d -r dd78d554f78d source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Apr 14 21:26:37 2014 -0500
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Tue Apr 15 19:30:38 2014 +0800
> @@ -111,8 +111,6 @@
>      m_rdCost  = rdCost;
>
>      initTempBuff(cfg->param->internalCsp);
> -    m_me.setSearchMethod(cfg->param->searchMethod);
> -    m_me.setSubpelRefine(cfg->param->subpelRefine);
>
>      /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
>       * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
> @@ -2191,7 +2189,7 @@
>          cu->getCUMvField(REF_PIC_LIST_1)->m_refIdx[m.absPartIdx] = m.mvFieldNeighbours[1 + 2 * mergeCand].refIdx;
>
>          motionCompensation(cu, &m_predTempYuv, REF_PIC_LIST_X, puIdx, true, false);
> -        uint32_t costCand = m_me.bufSATD(m_predTempYuv.getLumaAddr(m.absPartIdx), m_predTempYuv.getStride());
> +        uint32_t costCand = m_me.satd(m_me.fenc, FENC_STRIDE, m_predTempYuv.getLumaAddr(m.absPartIdx), m_predTempYuv.getStride());
>          uint32_t bitsCand = mergeCand + 1;
>          if (mergeCand == m_cfg->param->maxNumMergeCand - 1)
>          {
> @@ -2314,7 +2312,7 @@
>                      cu->clipMv(mvCand);
>
>                      xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(l, ref)->getPicYuvRec(), partAddr, &mvCand, roiWidth, roiHeight, &m_predTempYuv);
> -                    uint32_t cost = m_me.bufSAD(m_predTempYuv.getLumaAddr(partAddr), m_predTempYuv.getStride());
> +                    uint32_t cost = m_me.sad(m_me.fenc, FENC_STRIDE, m_predTempYuv.getLumaAddr(partAddr), m_predTempYuv.getStride());
>                      cost = m_rdCost->calcRdSADCost(cost, MVP_IDX_BITS);
>
>                      if (bestCost > cost)
> @@ -2328,11 +2326,11 @@
>
>                  int merange = m_cfg->param->searchRange;
>                  xSetSearchRange(cu, mvp, merange, mvmin, mvmax);
> -                int satdCost = m_me.motionEstimate(m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
> +                int satdCost = m_me.motionEstimate(m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv, m_cfg->param->searchMethod, m_cfg->param->subpelRefine);
>
>                  /* Get total cost of partition, but only include MV bit cost once */
> -                bits += m_me.bitcost(outmv);
> -                uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost->getCost(bits);
> +                bits += m_me.bitcost(outmv, mvp);
> +                uint32_t cost = (satdCost - m_me.mvcost(outmv, mvp)) + m_rdCost->getCost(bits);
>
>                  /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
>                  xCheckBestMVP(&amvpInfo[l][ref], outmv, mvp, mvpIdx, bits, cost);
> @@ -2368,7 +2366,7 @@
>
>              int partEnum = partitionFromSizes(roiWidth, roiHeight);
>              primitives.pixelavg_pp[partEnum](avg, roiWidth, pred0, m_predYuv[0].getStride(), pred1, m_predYuv[1].getStride(), 32);
> -            int satdCost = m_me.bufSATD(avg, roiWidth);
> +            int satdCost = m_me.satd(m_me.fenc, FENC_STRIDE, avg, roiWidth);
>
>              bidirBits = list[0].bits + list[1].bits + listSelBits[2] - (listSelBits[0] + listSelBits[1]);
>              bidirCost = satdCost + m_rdCost->getCost(bidirBits);
> @@ -2397,17 +2395,15 @@
>                  intptr_t refStride = m_mref[0][0]->lumaStride;
>
>                  primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, refStride, ref1, refStride, 32);
> -                satdCost = m_me.bufSATD(avg, roiWidth);
> +                satdCost = m_me.satd(m_me.fenc, FENC_STRIDE, avg, roiWidth);
>
>                  MV mvp0 = list[0].mvp;
>                  int mvpIdx0 = list[0].mvpIdx;
> -                m_me.setMVP(mvp0);
> -                uint32_t bits0 = list[0].bits - m_me.bitcost(list[0].mv) + m_me.bitcost(mvzero);
> +                uint32_t bits0 = list[0].bits - m_me.bitcost(list[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
>
>                  MV mvp1 = list[1].mvp;
>                  int mvpIdx1 = list[1].mvpIdx;
> -                m_me.setMVP(mvp1);
> -                uint32_t bits1 = list[1].bits - m_me.bitcost(list[1].mv) + m_me.bitcost(mvzero);
> +                uint32_t bits1 = list[1].bits - m_me.bitcost(list[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
>
>                  uint32_t cost = satdCost + m_rdCost->getCost(bits0) + m_rdCost->getCost(bits1);
>
> @@ -2556,9 +2552,8 @@
>  {
>      assert(amvpInfo->m_mvCand[outMvpIdx] == mvPred);
>
> -    m_me.setMVP(mvPred);
>      int bestMvpIdx = outMvpIdx;
> -    int mvBitsOrig = m_me.bitcost(mv) + MVP_IDX_BITS;
> +    int mvBitsOrig = m_me.bitcost(mv, mvPred) + MVP_IDX_BITS;
>      int bestMvBits = mvBitsOrig;
>
>      for (int mvpIdx = 0; mvpIdx < AMVP_MAX_NUM_CANDS; mvpIdx++)
> @@ -2566,8 +2561,7 @@
>          if (mvpIdx == outMvpIdx)
>              continue;
>
> -        m_me.setMVP(amvpInfo->m_mvCand[mvpIdx]);
> -        int mvbits = m_me.bitcost(mv) + MVP_IDX_BITS;
> +        int mvbits = m_me.bitcost(mv, amvpInfo->m_mvCand[mvpIdx]) + MVP_IDX_BITS;
>
>          if (mvbits < bestMvBits)
>          {
> diff -r 1cf67a7b362d -r dd78d554f78d source/encoder/bitcost.h
> --- a/source/encoder/bitcost.h  Mon Apr 14 21:26:37 2014 -0500
> +++ b/source/encoder/bitcost.h  Tue Apr 15 19:30:38 2014 +0800
> @@ -35,36 +35,26 @@
>  {
>  public:
>
> -    BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0) {}
> +    BitCost() : m_cost(NULL) {}
>
>      void setQP(unsigned int qp);
>
> -    void setMVP(const MV& mvp)                      { m_mvp = mvp; m_cost_mvx = m_cost - mvp.x; m_cost_mvy = m_cost - mvp.y; }
> -
>      // return bit cost of motion vector difference, multiplied by lambda
> -    inline uint16_t mvcost(const MV& mv) const      { return m_cost_mvx[mv.x] + m_cost_mvy[mv.y]; }
> +    inline uint16_t mvcost(const MV mv, const MV mvp) const      { return m_cost[mv.x - mvp.x] + m_cost[mv.y - mvp.y]; }
>
>      // return bit cost of motion vector difference, without lambda
> -    inline uint16_t bitcost(const MV& mv) const
> +    inline uint16_t bitcost(const MV mv, const MV mvp) const
>      {
> -        return (uint16_t)(s_bitsizes[(abs(mv.x - m_mvp.x) << 1) + !!(mv.x < m_mvp.x)] +
> -                          s_bitsizes[(abs(mv.y - m_mvp.y) << 1) + !!(mv.y < m_mvp.y)] + 0.5f);
> +        return (uint16_t)(s_bitsizes[(abs(mv.x - mvp.x) << 1) + !!(mv.x < mvp.x)] +
> +                          s_bitsizes[(abs(mv.y - mvp.y) << 1) + !!(mv.y < mvp.y)] + 0.5f);
>      }
>
>      static void destroy();
>
>  protected:
>
> -    uint16_t *m_cost_mvx;
> -
> -    uint16_t *m_cost_mvy;
> -

This seems like an odd trade-off to me: it saves 16 bytes of data but adds
two extra subtract operations to every MV cost evaluation.

Does this change measure well as a performance improvement?

-- 
Steve Borho


More information about the x265-devel mailing list