[x265] [PATCH] weightp: pass struct to tryCommonDenom() to cache motion compensated reference planes for reuse

Steve Borho steve at borho.org
Fri Feb 21 03:29:20 CET 2014


On Thu, Feb 20, 2014 at 6:27 AM, <kavitha at multicorewareinc.com> wrote:
>
> # HG changeset patch
> # User Kavitha Sampath <kavitha at multicorewareinc.com>
> # Date 1392899156 -19800
> #      Thu Feb 20 17:55:56 2014 +0530
> # Node ID a359217a5a5dbd2c8183c41dbf3f7193a09e0269
> # Parent  3389061b75a486e004409ab628c46fed39d03b72
> weightp: pass struct to tryCommonDenom() to cache motion compensated reference planes for reuse
>
> diff -r 3389061b75a4 -r a359217a5a5d source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp       Wed Feb 19 17:03:21 2014 -0600
> +++ b/source/encoder/weightPrediction.cpp       Thu Feb 20 17:55:56 2014 +0530
> @@ -32,6 +32,13 @@
>  using namespace x265;
>
>  namespace weightp {
> +
> +struct cache
> +{
> +    pixel    *mcRef[2][MAX_NUM_REF][3];
> +    uint32_t weightcost[2][MAX_NUM_REF][3];


the weight costs are presumably changing each time the function is
called with different denoms, I don't think they can be cached.

>
> +};
> +
>  /* make a motion compensated copy of lowres ref into mcout with the same stride.
>   * The borders of mcout are not extended */
>  void mcLuma(pixel *    mcout,
> @@ -213,7 +220,7 @@
>  bool tryCommonDenom(TComSlice&     slice,
>                      x265_param&    param,
>                      wpScalingParam wp[2][MAX_NUM_REF][3],
> -                    pixel *        temp,
> +                    cache&         cacheData,
>                      int            indenom)
>  {
>      TComPic *pic = slice.getPic();
> @@ -221,12 +228,6 @@
>      Lowres& fenc = pic->m_lowres;
>      int curPoc = slice.getPOC();
>
> -    /* caller provides temp space for two full-pel planes. Split it
> -     * in half for motion compensation of the reference and then the
> -     * weighting */
> -    pixel *mcTemp = temp;
> -    pixel *weightTemp = temp + picorig->getStride() * picorig->getHeight();
> -
>      int log2denom[3] = { indenom };
>      int csp = picorig->m_picCsp;
>      int hshift = CHROMA_H_SHIFT(csp);
> @@ -327,6 +328,8 @@
>                  pixel *fref;
>                  int    origstride, frefstride;
>                  int    width, height;
> +                pixel* &buf = cacheData.mcRef[list][ref][yuv];
> +                uint32_t& origscore = cacheData.weightcost[list][ref][yuv];
>                  switch (yuv)
>                  {
>                  case 0:
> @@ -338,8 +341,12 @@
>
>                      if (bMotionCompensate)
>                      {
> -                        mcLuma(mcTemp, refLowres, mvCosts, fenc.intraCost, mvs);
> -                        fref = mcTemp;
> +                        if (!buf)
> +                        {
> +                            buf = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
> +                            mcLuma(buf, refLowres, mvCosts, fenc.intraCost, mvs);
> +                        }
> +                        fref = buf;
>                      }
>                      break;
>
> @@ -359,8 +366,12 @@
>
>                      if (bMotionCompensate)
>                      {
> -                        mcChroma(mcTemp, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> -                        fref = mcTemp;
> +                        if (!buf)
> +                        {
> +                            buf = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
> +                            mcChroma(buf, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> +                        }
> +                        fref = buf;
>                      }
>                      break;
>
> @@ -373,8 +384,12 @@
>
>                      if (bMotionCompensate)
>                      {
> -                        mcChroma(mcTemp, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> -                        fref = mcTemp;
> +                        if (!buf)
> +                        {
> +                            buf = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
> +                            mcChroma(buf, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> +                        }
> +                        fref = buf;


looks good up to here

>
>                      }
>                      break;
>
> @@ -388,10 +403,14 @@
>                  int mindenom = w.log2WeightDenom;
>                  int minscale = w.inputWeight;
>                  int minoff = 0;
> +                pixel *weightTemp = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());


this buffer should be in the cache, you only need one buffer;
allocated to the max of the chroma and luma plane sizes

>
>
> -                uint32_t origscore = weightCost(orig, origstride, fref, frefstride, weightTemp, width, height, NULL);
> -                if (!origscore)
> -                    continue;
> +                if (origscore == 0)
> +                {
> +                    origscore = weightCost(orig, origstride, fref, frefstride, weightTemp, width, height, NULL);
> +                    if (!origscore)
> +                        continue;
> +                }
>
>                  uint32_t minscore = origscore;
>                  bool bFound = false;
> @@ -437,6 +456,7 @@
>                              break;
>                      }
>                  }
> +                X265_FREE(weightTemp);
>
>                  // if chroma denoms diverged, we must start over
>                  if (mindenom < log2denom[yuv])
> @@ -481,48 +501,38 @@
>      wpScalingParam wp[2][MAX_NUM_REF][3];
>      int numPredDir = slice.isInterP() ? 1 : 2;
>
> -    /* TODO: perf - collect some of this data into a struct which is passed to
> -     * tryCommonDenom() to avoid recalculating some data.  Motion compensated
> -     * reference planes can be cached this way */
> -
> -    TComPicYuv *orig = slice.getPic()->getPicYuvOrg();
> -    pixel *temp = X265_MALLOC(pixel, 2 * orig->getStride() * orig->getHeight());
> -    if (temp)
> +    weightp::cache cacheData;
> +    memset(&cacheData, 0, sizeof(cacheData));
> +    int denom = slice.getNumRefIdx(REF_PIC_LIST_0) > 3 ? 7 : 6;
> +    do
>      {
> -        int denom = slice.getNumRefIdx(REF_PIC_LIST_0) > 3 ? 7 : 6;
> -        do
> -        {
> -            /* reset weight states */
> -            for (int list = 0; list < numPredDir; list++)
> -            {
> -                for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
> -                {
> -                    SET_WEIGHT(wp[list][ref][0], false, 1 << denom, denom, 0);
> -                    SET_WEIGHT(wp[list][ref][1], false, 1 << denom, denom, 0);
> -                    SET_WEIGHT(wp[list][ref][2], false, 1 << denom, denom, 0);
> -                }
> -            }
> -
> -            if (weightp::tryCommonDenom(slice, param, wp, temp, denom))
> -                break;
> -            denom--; // decrement to satisfy the range limitation
> -        }
> -        while (denom > 0);
> -
> -        X265_FREE(temp);
> -    }
> -
> -    if (param.logLevel >= X265_LOG_DEBUG)
> -    {
> -        char buf[1024];
> -        int p = 0;
> -        bool bWeighted = false;
> -
> -        p = sprintf(buf, "poc: %d weights:", slice.getPOC());
> +        /* reset weight states */
>          for (int list = 0; list < numPredDir; list++)
>          {
>              for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
>              {
> +                SET_WEIGHT(wp[list][ref][0], false, 1 << denom, denom, 0);
> +                SET_WEIGHT(wp[list][ref][1], false, 1 << denom, denom, 0);
> +                SET_WEIGHT(wp[list][ref][2], false, 1 << denom, denom, 0);
> +            }
> +        }
> +
> +        if (weightp::tryCommonDenom(slice, param, wp, cacheData, denom))
> +            break;
> +        denom--; // decrement to satisfy the range limitation
> +    }
> +    while (denom > 0);
> +
> +    char buf[1024];
> +    int  p = 0;
> +    bool bWeighted = false;
> +    p = sprintf(buf, "poc: %d weights:", slice.getPOC());
> +    for (int list = 0; list < numPredDir; list++)
> +    {
> +        for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
> +        {
> +            if (param.logLevel >= X265_LOG_DEBUG)
> +            {
>                  wpScalingParam* w = &wp[list][ref][0];
>                  if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
>                  {
> @@ -537,13 +547,18 @@
>                      p += sprintf(buf + p, "]");
>                  }
>              }
> +            for (int yuv = 0; yuv < 3; yuv++)
> +            {
> +                if(cacheData.mcRef[list][ref][yuv])
> +                    X265_FREE(cacheData.mcRef[list][ref][yuv]);


X265_FREE() handles NULL pointers just fine, there's no reason to
check the pointer first.

white-space.

>
> +            }
>          }
> -        if (bWeighted)
> -        {
> -            if (p < 80) // pad with spaces to ensure progress line overwritten
> -                sprintf(buf + p, "%*s", 80-p, " ");
> -            x265_log(&param, X265_LOG_DEBUG, "%s\n", buf);
> -        }
> +    }


this is doing a lot of string operations even when logging is
disabled, just to avoid a second series of for() loops around the
indexes, which I think is a poor trade-off.

>
> +    if (param.logLevel >= X265_LOG_DEBUG && bWeighted)
> +    {
> +        if (p < 80) // pad with spaces to ensure progress line overwritten
> +            sprintf(buf + p, "%*s", 80-p, " ");
> +        x265_log(&param, X265_LOG_DEBUG, "%s\n", buf);
>      }
>      slice.setWpScaling(wp);
>  }
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel




-- 
Steve Borho


More information about the x265-devel mailing list