[x265] [PATCH] weightp: pass struct to tryCommonDenom() to cache motion compensated reference planes for reuse
Steve Borho
steve at borho.org
Fri Feb 21 03:29:20 CET 2014
On Thu, Feb 20, 2014 at 6:27 AM, <kavitha at multicorewareinc.com> wrote:
>
> # HG changeset patch
> # User Kavitha Sampath <kavitha at multicorewareinc.com>
> # Date 1392899156 -19800
> # Thu Feb 20 17:55:56 2014 +0530
> # Node ID a359217a5a5dbd2c8183c41dbf3f7193a09e0269
> # Parent 3389061b75a486e004409ab628c46fed39d03b72
> weightp: pass struct to tryCommonDenom() to cache motion compensated reference planes for reuse
>
> diff -r 3389061b75a4 -r a359217a5a5d source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp Wed Feb 19 17:03:21 2014 -0600
> +++ b/source/encoder/weightPrediction.cpp Thu Feb 20 17:55:56 2014 +0530
> @@ -32,6 +32,13 @@
> using namespace x265;
>
> namespace weightp {
> +
> +struct cache
> +{
> + pixel *mcRef[2][MAX_NUM_REF][3];
> + uint32_t weightcost[2][MAX_NUM_REF][3];
the weight costs are presumably changing each time the function is
called with different denoms, I don't think they can be cached.
>
> +};
> +
> /* make a motion compensated copy of lowres ref into mcout with the same stride.
> * The borders of mcout are not extended */
> void mcLuma(pixel * mcout,
> @@ -213,7 +220,7 @@
> bool tryCommonDenom(TComSlice& slice,
> x265_param& param,
> wpScalingParam wp[2][MAX_NUM_REF][3],
> - pixel * temp,
> + cache& cacheData,
> int indenom)
> {
> TComPic *pic = slice.getPic();
> @@ -221,12 +228,6 @@
> Lowres& fenc = pic->m_lowres;
> int curPoc = slice.getPOC();
>
> - /* caller provides temp space for two full-pel planes. Split it
> - * in half for motion compensation of the reference and then the
> - * weighting */
> - pixel *mcTemp = temp;
> - pixel *weightTemp = temp + picorig->getStride() * picorig->getHeight();
> -
> int log2denom[3] = { indenom };
> int csp = picorig->m_picCsp;
> int hshift = CHROMA_H_SHIFT(csp);
> @@ -327,6 +328,8 @@
> pixel *fref;
> int origstride, frefstride;
> int width, height;
> + pixel* &buf = cacheData.mcRef[list][ref][yuv];
> + uint32_t& origscore = cacheData.weightcost[list][ref][yuv];
> switch (yuv)
> {
> case 0:
> @@ -338,8 +341,12 @@
>
> if (bMotionCompensate)
> {
> - mcLuma(mcTemp, refLowres, mvCosts, fenc.intraCost, mvs);
> - fref = mcTemp;
> + if (!buf)
> + {
> + buf = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
> + mcLuma(buf, refLowres, mvCosts, fenc.intraCost, mvs);
> + }
> + fref = buf;
> }
> break;
>
> @@ -359,8 +366,12 @@
>
> if (bMotionCompensate)
> {
> - mcChroma(mcTemp, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> - fref = mcTemp;
> + if (!buf)
> + {
> + buf = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
> + mcChroma(buf, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> + }
> + fref = buf;
> }
> break;
>
> @@ -373,8 +384,12 @@
>
> if (bMotionCompensate)
> {
> - mcChroma(mcTemp, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> - fref = mcTemp;
> + if (!buf)
> + {
> + buf = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
> + mcChroma(buf, fref, fenc, frefstride, mvCosts, fenc.intraCost, mvs, height, width, csp);
> + }
> + fref = buf;
looks good up to here
>
> }
> break;
>
> @@ -388,10 +403,14 @@
> int mindenom = w.log2WeightDenom;
> int minscale = w.inputWeight;
> int minoff = 0;
> + pixel *weightTemp = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
this buffer should be in the cache, you only need one buffer;
allocated to the max of the chroma and luma plane sizes
>
>
> - uint32_t origscore = weightCost(orig, origstride, fref, frefstride, weightTemp, width, height, NULL);
> - if (!origscore)
> - continue;
> + if (origscore == 0)
> + {
> + origscore = weightCost(orig, origstride, fref, frefstride, weightTemp, width, height, NULL);
> + if (!origscore)
> + continue;
> + }
>
> uint32_t minscore = origscore;
> bool bFound = false;
> @@ -437,6 +456,7 @@
> break;
> }
> }
> + X265_FREE(weightTemp);
>
> // if chroma denoms diverged, we must start over
> if (mindenom < log2denom[yuv])
> @@ -481,48 +501,38 @@
> wpScalingParam wp[2][MAX_NUM_REF][3];
> int numPredDir = slice.isInterP() ? 1 : 2;
>
> - /* TODO: perf - collect some of this data into a struct which is passed to
> - * tryCommonDenom() to avoid recalculating some data. Motion compensated
> - * reference planes can be cached this way */
> -
> - TComPicYuv *orig = slice.getPic()->getPicYuvOrg();
> - pixel *temp = X265_MALLOC(pixel, 2 * orig->getStride() * orig->getHeight());
> - if (temp)
> + weightp::cache cacheData;
> + memset(&cacheData, 0, sizeof(cacheData));
> + int denom = slice.getNumRefIdx(REF_PIC_LIST_0) > 3 ? 7 : 6;
> + do
> {
> - int denom = slice.getNumRefIdx(REF_PIC_LIST_0) > 3 ? 7 : 6;
> - do
> - {
> - /* reset weight states */
> - for (int list = 0; list < numPredDir; list++)
> - {
> - for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
> - {
> - SET_WEIGHT(wp[list][ref][0], false, 1 << denom, denom, 0);
> - SET_WEIGHT(wp[list][ref][1], false, 1 << denom, denom, 0);
> - SET_WEIGHT(wp[list][ref][2], false, 1 << denom, denom, 0);
> - }
> - }
> -
> - if (weightp::tryCommonDenom(slice, param, wp, temp, denom))
> - break;
> - denom--; // decrement to satisfy the range limitation
> - }
> - while (denom > 0);
> -
> - X265_FREE(temp);
> - }
> -
> - if (param.logLevel >= X265_LOG_DEBUG)
> - {
> - char buf[1024];
> - int p = 0;
> - bool bWeighted = false;
> -
> - p = sprintf(buf, "poc: %d weights:", slice.getPOC());
> + /* reset weight states */
> for (int list = 0; list < numPredDir; list++)
> {
> for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
> {
> + SET_WEIGHT(wp[list][ref][0], false, 1 << denom, denom, 0);
> + SET_WEIGHT(wp[list][ref][1], false, 1 << denom, denom, 0);
> + SET_WEIGHT(wp[list][ref][2], false, 1 << denom, denom, 0);
> + }
> + }
> +
> + if (weightp::tryCommonDenom(slice, param, wp, cacheData, denom))
> + break;
> + denom--; // decrement to satisfy the range limitation
> + }
> + while (denom > 0);
> +
> + char buf[1024];
> + int p = 0;
> + bool bWeighted = false;
> + p = sprintf(buf, "poc: %d weights:", slice.getPOC());
> + for (int list = 0; list < numPredDir; list++)
> + {
> + for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
> + {
> + if (param.logLevel >= X265_LOG_DEBUG)
> + {
> wpScalingParam* w = &wp[list][ref][0];
> if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
> {
> @@ -537,13 +547,18 @@
> p += sprintf(buf + p, "]");
> }
> }
> + for (int yuv = 0; yuv < 3; yuv++)
> + {
> + if(cacheData.mcRef[list][ref][yuv])
> + X265_FREE(cacheData.mcRef[list][ref][yuv]);
X265_FREE() handles NULL pointers just fine, there's no reason to
check the pointer first.
white-space.
>
> + }
> }
> - if (bWeighted)
> - {
> - if (p < 80) // pad with spaces to ensure progress line overwritten
> - sprintf(buf + p, "%*s", 80-p, " ");
> - x265_log(¶m, X265_LOG_DEBUG, "%s\n", buf);
> - }
> + }
this is doing a lot of string operations even when logging is
disabled, just to avoid a second series of for() loops around the
indexes, which I think is a poor trade-off.
>
> + if (param.logLevel >= X265_LOG_DEBUG && bWeighted)
> + {
> + if (p < 80) // pad with spaces to ensure progress line overwritten
> + sprintf(buf + p, "%*s", 80-p, " ");
> + x265_log(¶m, X265_LOG_DEBUG, "%s\n", buf);
> }
> slice.setWpScaling(wp);
> }
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list