[x265] [PATCH] weightp: weight only one reference; do not cache mc refs anymore

kavitha at multicorewareinc.com kavitha at multicorewareinc.com
Sun Mar 9 13:02:34 CET 2014


# HG changeset patch
# User Kavitha Sampath <kavitha at multicorewareinc.com>
# Date 1394365741 -19800
#      Sun Mar 09 17:19:01 2014 +0530
# Node ID efb0ff5f607b70cb6c728bec3f61709b87626606
# Parent  93861c42b879798134bb200ff633f6492a7ff376
weightp: weight only one reference; do not cache mc refs anymore

diff -r 93861c42b879 -r efb0ff5f607b source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Fri Mar 07 22:54:00 2014 -0600
+++ b/source/encoder/frameencoder.cpp	Sun Mar 09 17:19:01 2014 +0530
@@ -453,7 +453,9 @@
     //------------------------------------------------------------------------------
     //  Weighted Prediction parameters estimation.
     //------------------------------------------------------------------------------
-    if ((slice->getSliceType() == P_SLICE && slice->getPPS()->getUseWP()) || (slice->getSliceType() == B_SLICE && slice->getPPS()->getWPBiPred()))
+    bool weightpSet = slice->getSliceType() == P_SLICE && slice->getPPS()->getUseWP();
+    bool weightbSet = slice->getSliceType() == B_SLICE && slice->getPPS()->getWPBiPred();
+    if (weightpSet || weightbSet)
     {
         assert(slice->getPPS()->getUseWP());
         weightAnalyse(*slice, *m_cfg->param);
@@ -466,7 +468,7 @@
         for (int ref = 0; ref < slice->getNumRefIdx(l); ref++)
         {
             wpScalingParam *w = NULL;
-            if ((slice->isInterP() && slice->getPPS()->getUseWP() && slice->m_weightPredTable[l][ref][0].bPresentFlag))
+            if (weightpSet && !ref && slice->m_weightPredTable[l][ref][0].bPresentFlag)
             {
                 w = slice->m_weightPredTable[l][ref];
                 slice->m_numWPRefs++;
diff -r 93861c42b879 -r efb0ff5f607b source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp	Fri Mar 07 22:54:00 2014 -0600
+++ b/source/encoder/weightPrediction.cpp	Sun Mar 09 17:19:01 2014 +0530
@@ -33,34 +33,10 @@
 using namespace x265;
 namespace weightp {
 
-struct RefData
-{
-    pixel *  mcbuf;
-    pixel *  fref;
-    float    guessScale;
-    float    fencMean;
-    float    refMean;
-    uint32_t unweightedCost;
-};
-
-struct ChannelData
-{
-    pixel* orig;
-    int    stride;
-    int    width;
-    int    height;
-};
-
 struct Cache
 {
-    wpScalingParam wp[2][MAX_NUM_REF][3];
-    RefData        ref[2][MAX_NUM_REF][3];
-    ChannelData    paramset[3];
-
     const int *    intraCost;
-    pixel*         weightTemp;
     int            numPredDir;
-    int            lambda;
     int            csp;
     int            hshift;
     int            vshift;
@@ -191,6 +167,7 @@
  * pixels have unreliable availability */
 uint32_t weightCost(pixel *         fenc,
                     pixel *         ref,
+                    pixel *         weightTemp,
                     int             stride,
                     const Cache &   cache,
                     int             width,
@@ -208,9 +185,9 @@
         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
         int pwidth = ((width + 15) >> 4) << 4;
 
-        primitives.weight_pp(ref, cache.weightTemp, stride, stride, pwidth, height,
+        primitives.weight_pp(ref, weightTemp, stride, stride, pwidth, height,
                              weight, round << correction, denom + correction, offset);
-        ref = cache.weightTemp;
+        ref = weightTemp;
     }
 
     uint32_t cost = 0;
@@ -241,154 +218,24 @@
     return cost;
 }
 
-bool tryCommonDenom(TComSlice& slice, Cache& cache, int indenom)
-{
-    int log2denom[3] = { indenom };
-    const float epsilon = 1.f / 128.f;
-
-    /* reset weight states */
-    for (int list = 0; list < cache.numPredDir; list++)
-    {
-        for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
-        {
-            SET_WEIGHT(cache.wp[list][ref][0], false, 1 << indenom, indenom, 0);
-            SET_WEIGHT(cache.wp[list][ref][1], false, 1 << indenom, indenom, 0);
-            SET_WEIGHT(cache.wp[list][ref][2], false, 1 << indenom, indenom, 0);
-        }
-    }
-
-    int numWeighted = 0;
-    for (int list = 0; list < cache.numPredDir; list++)
-    {
-        for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
-        {
-            wpScalingParam *fw = cache.wp[list][ref];
-
-            for (int yuv = 1; yuv < 3; yuv++)
-            {
-                /* Ensure that the denominators of cb and cr are same */
-                RefData *rd = &cache.ref[list][ref][yuv];
-                fw[yuv].setFromWeightAndOffset((int)(rd->guessScale * (1 << log2denom[1]) + 0.5), 0, log2denom[1]);
-                log2denom[1] = X265_MIN(log2denom[1], (int)fw[yuv].log2WeightDenom);
-            }
-            log2denom[2] = log2denom[1];
-
-            bool bWeightRef = false;
-            for (int yuv = 0; yuv < 3; yuv++)
-            {
-                RefData *rd = &cache.ref[list][ref][yuv];
-                ChannelData *p = &cache.paramset[yuv];
-                if (yuv && !fw[0].bPresentFlag)
-                {
-                    fw[1].inputWeight = 1 << fw[1].log2WeightDenom;
-                    fw[2].inputWeight = 1 << fw[2].log2WeightDenom;
-                    break;
-                }
-
-                x265_emms();
-                /* Early termination */
-                float meanDiff = rd->refMean < rd->fencMean ? rd->fencMean - rd->refMean : rd->refMean - rd->fencMean;
-                float guessVal = rd->guessScale > 1.f ? rd->guessScale - 1.f : 1.f - rd->guessScale;
-                if ((meanDiff < 0.5f && guessVal < epsilon) || !rd->unweightedCost)
-                    continue;
-
-                wpScalingParam w;
-                w.setFromWeightAndOffset((int)(rd->guessScale * (1 << log2denom[yuv]) + 0.5), 0, log2denom[yuv]);
-                int mindenom = w.log2WeightDenom;
-                int minscale = w.inputWeight;
-                int minoff = 0;
-
-                uint32_t origscore = rd->unweightedCost;
-                uint32_t minscore = origscore;
-                bool bFound = false;
-                static const int sD = 4; // scale distance
-                static const int oD = 2; // offset distance
-                for (int is = minscale - sD; is <= minscale + sD; is++)
-                {
-                    int deltaWeight = is - (1 << mindenom);
-                    if (deltaWeight > 127 || deltaWeight <= -128)
-                        continue;
-
-                    int curScale = is;
-                    int curOffset = (int)(rd->fencMean - rd->refMean * curScale / (1 << mindenom) + 0.5f);
-                    if (curOffset < -128 || curOffset > 127)
-                    {
-                        /* Rescale considering the constraints on curOffset. We do it in this order
-                         * because scale has a much wider range than offset (because of denom), so
-                         * it should almost never need to be clamped. */
-                        curOffset = Clip3(-128, 127, curOffset);
-                        curScale = (int)((1 << mindenom) * (rd->fencMean - curOffset) / rd->refMean + 0.5f);
-                        curScale = Clip3(0, 127, curScale);
-                    }
-
-                    for (int ioff = curOffset - oD; (ioff <= (curOffset + oD)) && (ioff < 127); ioff++)
-                    {
-                        if (yuv)
-                        {
-                            int pred = (128 - ((128 * curScale) >> (mindenom)));
-                            int deltaOffset = ioff - pred; // signed 10bit
-                            if (deltaOffset < -512 || deltaOffset > 511)
-                                continue;
-                            ioff = Clip3(-128, 127, (deltaOffset + pred)); // signed 8bit
-                        }
-                        else
-                        {
-                            ioff = Clip3(-128, 127, ioff);
-                        }
-
-                        SET_WEIGHT(w, true, curScale, mindenom, ioff);
-                        uint32_t s = weightCost(p->orig, rd->fref, p->stride, cache, p->width, p->height, &w, !yuv) +
-                                     sliceHeaderCost(&w, cache.lambda, !!yuv);
-                        COPY4_IF_LT(minscore, s, minscale, curScale, minoff, ioff, bFound, true);
-                        if (minoff == curOffset - oD && ioff != curOffset - oD)
-                            break;
-                    }
-                }
-
-                // if chroma denoms diverged, we must start over
-                if (mindenom < log2denom[yuv])
-                    return false;
-
-                if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
-                {
-                    fw[yuv].bPresentFlag = false;
-                    fw[yuv].inputWeight = 1 << fw[yuv].log2WeightDenom;
-                }
-                else
-                {
-                    SET_WEIGHT(fw[yuv], true, minscale, mindenom, minoff);
-                    bWeightRef = true;
-                }
-            }
-
-            if (bWeightRef)
-            {
-                // Make sure both chroma channels match
-                if (fw[1].bPresentFlag != fw[2].bPresentFlag)
-                {
-                    if (fw[1].bPresentFlag)
-                        fw[2] = fw[1];
-                    else
-                        fw[1] = fw[2];
-                }
-
-                if (++numWeighted >= 8)
-                    return true;
-            }
-        }
-    }
-
-    return true;
-}
-
-void prepareRef(Cache& cache, TComSlice& slice, x265_param& param)
+void tryCommonDenom(TComSlice&     slice,
+                    x265_param&    param,
+                    wpScalingParam wp[2][MAX_NUM_REF][3],
+                    pixel *        temp,
+                    int            indenom)
 {
     TComPic *pic = slice.getPic();
     TComPicYuv *picorig = pic->getPicYuvOrg();
     Lowres& fenc = pic->m_lowres;
 
-    cache.weightTemp = X265_MALLOC(pixel, picorig->getStride() * picorig->getHeight());
-    cache.lambda = (int) x265_lambda2_non_I[slice.getSliceQp()];
+    /* caller provides temp space for two full-pel planes. Split it
+     * in half for motion compensation of the reference and then the
+     * weighting */
+    pixel *mcbuf = temp;
+    pixel *weightTemp = temp + picorig->getStride() * picorig->getHeight();
+
+    weightp::Cache cache;
+    memset(&cache, 0, sizeof(cache));
     cache.intraCost = fenc.intraCost;
     cache.lowresWidthInCU = fenc.width >> 3;
     cache.lowresHeightInCU = fenc.lines >> 3;
@@ -396,6 +243,7 @@
     cache.hshift = CHROMA_H_SHIFT(cache.csp);
     cache.vshift = CHROMA_V_SHIFT(cache.csp);
 
+    int lambda = (int) x265_lambda2_non_I[slice.getSliceQp()];
     int curPoc = slice.getPOC();
     int numpixels[3];
     int w = ((picorig->getWidth()  + 15) >> 4) << 4;
@@ -407,137 +255,228 @@
     numpixels[1] = numpixels[2] = w * h;
 
     cache.numPredDir = slice.isInterP() ? 1 : 2;
+    int chromadenom = indenom;
+    const float epsilon = 1.f / 128.f;
+
+    /* reset weight states */
     for (int list = 0; list < cache.numPredDir; list++)
     {
         for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
         {
-            TComPic *refPic = slice.getRefPic(list, ref);
-            Lowres& refLowres = refPic->m_lowres;
+            SET_WEIGHT(wp[list][ref][0], false, 1 << indenom, indenom, 0);
+            SET_WEIGHT(wp[list][ref][1], false, 1 << indenom, indenom, 0);
+            SET_WEIGHT(wp[list][ref][2], false, 1 << indenom, indenom, 0);
+        }
+    }
 
-            MV *mvs = NULL;
-            bool bMotionCompensate = false;
+    for (int list = 0; list < cache.numPredDir; list++)
+    {
+        wpScalingParam *fw = wp[list][0];
+        TComPic *refPic = slice.getRefPic(list, 0);
+        Lowres& refLowres = refPic->m_lowres;
 
-            /* test whether POC distance is within range for lookahead structures */
-            int diffPoc = abs(curPoc - refPic->getPOC());
-            if (diffPoc <= param.bframes + 1)
+        MV *mvs = NULL;
+        bool bMotionCompensate = false;
+
+        /* test whether POC distance is within range for lookahead structures */
+        int diffPoc = abs(curPoc - refPic->getPOC());
+        if (diffPoc <= param.bframes + 1)
+        {
+            mvs = fenc.lowresMvs[list][diffPoc - 1];
+            /* test whether this motion search was performed by lookahead */
+            if (mvs[0].x != 0x7FFF)
             {
-                mvs = fenc.lowresMvs[list][diffPoc - 1];
-                /* test whether this motion search was performed by lookahead */
-                if (mvs[0].x != 0x7FFF)
+                bMotionCompensate = true;
+
+                /* reference chroma planes must be extended prior to being
+                    * used as motion compensation sources */
+                if (!refPic->m_bChromaPlanesExtended)
                 {
-                    bMotionCompensate = true;
-
-                    /* reference chroma planes must be extended prior to being
-                     * used as motion compensation sources */
-                    if (!refPic->m_bChromaPlanesExtended)
-                    {
-                        refPic->m_bChromaPlanesExtended = true;
-                        TComPicYuv *refyuv = refPic->getPicYuvOrg();
-                        int stride = refyuv->getCStride();
-                        int width = refyuv->getWidth() >> cache.hshift;
-                        int height = refyuv->getHeight() >> cache.vshift;
-                        int marginX = refyuv->getChromaMarginX();
-                        int marginY = refyuv->getChromaMarginY();
-                        extendPicBorder(refyuv->getCbAddr(), stride, width, height, marginX, marginY);
-                        extendPicBorder(refyuv->getCrAddr(), stride, width, height, marginX, marginY);
-                    }
+                    refPic->m_bChromaPlanesExtended = true;
+                    TComPicYuv *refyuv = refPic->getPicYuvOrg();
+                    int stride = refyuv->getCStride();
+                    int width = refyuv->getWidth() >> cache.hshift;
+                    int height = refyuv->getHeight() >> cache.vshift;
+                    int marginX = refyuv->getChromaMarginX();
+                    int marginY = refyuv->getChromaMarginY();
+                    extendPicBorder(refyuv->getCbAddr(), stride, width, height, marginX, marginY);
+                    extendPicBorder(refyuv->getCrAddr(), stride, width, height, marginX, marginY);
                 }
             }
-            for (int yuv = 0; yuv < 3; yuv++)
+        }
+
+        /* prepare estimates */
+        float guessScale[3], fencMean[3], refMean[3];
+        for (int yuv = 0; yuv < 3; yuv++)
+        {
+            uint64_t fencVar = fenc.wp_ssd[yuv] + !refLowres.wp_ssd[yuv];
+            uint64_t refVar  = refLowres.wp_ssd[yuv] + !refLowres.wp_ssd[yuv];
+            if (fencVar && refVar)
+                guessScale[yuv] = Clip3(-2.f, 1.8f, std::sqrt((float)fencVar / refVar));
+            else
+                guessScale[yuv] = 1.8f;
+            fencMean[yuv] = (float)fenc.wp_sum[yuv] / (numpixels[yuv]) / (1 << (X265_DEPTH - 8));
+            refMean[yuv]  = (float)refLowres.wp_sum[yuv] / (numpixels[yuv]) / (1 << (X265_DEPTH - 8));
+
+            /* Ensure that the denominators of cb and cr are same */
+            if (yuv)
             {
-                /* prepare inputs to weight analysis */
-                RefData *rd = &cache.ref[list][ref][yuv];
-                ChannelData *p = &cache.paramset[yuv];
-
-                x265_emms();
-                uint64_t fencVar = fenc.wp_ssd[yuv] + !refLowres.wp_ssd[yuv];
-                uint64_t refVar  = refLowres.wp_ssd[yuv] + !refLowres.wp_ssd[yuv];
-                if (fencVar && refVar)
-                    rd->guessScale = Clip3(-2.f, 1.8f, std::sqrt((float)fencVar / refVar));
-                else
-                    rd->guessScale = 1.8f;
-                rd->fencMean = (float)fenc.wp_sum[yuv] / (numpixels[yuv]) / (1 << (X265_DEPTH - 8));
-                rd->refMean  = (float)refLowres.wp_sum[yuv] / (numpixels[yuv]) / (1 << (X265_DEPTH - 8));
-
-                switch (yuv)
-                {
-                case 0:
-                    p->orig = fenc.lowresPlane[0];
-                    p->stride = fenc.lumaStride;
-                    p->width = fenc.width;
-                    p->height = fenc.lines;
-                    rd->fref = refLowres.lowresPlane[0];
-                    if (bMotionCompensate)
-                    {
-                        rd->mcbuf = X265_MALLOC(pixel, p->stride * p->height);
-                        if (rd->mcbuf)
-                        {
-                            mcLuma(rd->mcbuf, refLowres, mvs);
-                            rd->fref = rd->mcbuf;
-                        }
-                    }
-                    break;
-
-                case 1:
-                    p->orig = picorig->getCbAddr();
-                    p->stride = picorig->getCStride();
-                    rd->fref = refPic->getPicYuvOrg()->getCbAddr();
-
-                    /* Clamp the chroma dimensions to the nearest multiple of
-                     * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
-                     * blocks and weightCost measures 8x8 blocks. This
-                     * potentially ignores some edge pixels, but simplifies the
-                     * logic and prevents reading uninitialized pixels. Lowres
-                     * planes are border extended and require no clamping. */
-                    p->width =  ((picorig->getWidth()  >> 4) << 4) >> cache.hshift;
-                    p->height = ((picorig->getHeight() >> 4) << 4) >> cache.vshift;
-                    if (bMotionCompensate)
-                    {
-                        rd->mcbuf = X265_MALLOC(pixel, p->stride * p->height);
-                        if (rd->mcbuf)
-                        {
-                            mcChroma(rd->mcbuf, rd->fref, p->stride, mvs, cache, p->height, p->width);
-                            rd->fref = rd->mcbuf;
-                        }
-                    }
-                    break;
-
-                case 2:
-                    rd->fref = refPic->getPicYuvOrg()->getCrAddr();
-                    p->orig = picorig->getCrAddr();
-                    p->stride = picorig->getCStride();
-                    p->width =  ((picorig->getWidth()  >> 4) << 4) >> cache.hshift;
-                    p->height = ((picorig->getHeight() >> 4) << 4) >> cache.vshift;
-                    if (bMotionCompensate)
-                    {
-                        rd->mcbuf = X265_MALLOC(pixel, p->stride * p->height);
-                        if (rd->mcbuf)
-                        {
-                            mcChroma(rd->mcbuf, rd->fref, p->stride, mvs, cache, p->height, p->width);
-                            rd->fref = rd->mcbuf;
-                        }
-                    }
-                    break;
-
-                default:
-                    return;
-                }
-                rd->unweightedCost = weightCost(p->orig, rd->fref, p->stride, cache, p->width, p->height, NULL, !yuv);
+                fw[yuv].setFromWeightAndOffset((int)(guessScale[yuv] * (1 << chromadenom) + 0.5), 0, chromadenom);
+                chromadenom = X265_MIN(chromadenom, (int)fw[yuv].log2WeightDenom);
             }
         }
-    }
-}
 
-void tearDown(Cache& cache, TComSlice& slice)
-{
-    X265_FREE(cache.weightTemp);
-    for (int list = 0; list < cache.numPredDir; list++)
-    {
-        for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
+        bool bWeightRef = false;
+        for (int yuv = 0; yuv < 3; yuv++)
         {
-            for (int yuv = 0; yuv < 3; yuv++)
+            if (yuv && !fw[0].bPresentFlag)
             {
-                X265_FREE(cache.ref[list][ref][yuv].mcbuf);
+                fw[1].inputWeight = 1 << fw[1].log2WeightDenom;
+                fw[2].inputWeight = 1 << fw[2].log2WeightDenom;
+                break;
+            }
+
+            x265_emms();
+            /* Early termination */
+            float meanDiff = refMean[yuv] < fencMean[yuv] ? fencMean[yuv] - refMean[yuv] : refMean[yuv] - fencMean[yuv];
+            float guessVal = guessScale[yuv] > 1.f ? guessScale[yuv] - 1.f : 1.f - guessScale[yuv];
+            if (meanDiff < 0.5f && guessVal < epsilon)
+                continue;
+
+            /* prepare inputs to weight analysis */
+            pixel *orig;
+            pixel *fref;
+            int    stride;
+            int    width, height;
+            switch (yuv)
+            {
+            case 0:
+                orig = fenc.lowresPlane[0];
+                stride = fenc.lumaStride;
+                width = fenc.width;
+                height = fenc.lines;
+                fref = refLowres.lowresPlane[0];
+                if (bMotionCompensate)
+                {
+                    mcLuma(mcbuf, refLowres, mvs);
+                    fref = mcbuf;
+                }
+                break;
+
+            case 1:
+                orig = picorig->getCbAddr();
+                stride = picorig->getCStride();
+                fref = refPic->getPicYuvOrg()->getCbAddr();
+
+                /* Clamp the chroma dimensions to the nearest multiple of
+                    * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
+                    * blocks and weightCost measures 8x8 blocks. This
+                    * potentially ignores some edge pixels, but simplifies the
+                    * logic and prevents reading uninitialized pixels. Lowres
+                    * planes are border extended and require no clamping. */
+                width =  ((picorig->getWidth()  >> 4) << 4) >> cache.hshift;
+                height = ((picorig->getHeight() >> 4) << 4) >> cache.vshift;
+                if (bMotionCompensate)
+                {
+                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
+                    fref = mcbuf;
+                }
+                break;
+
+            case 2:
+                fref = refPic->getPicYuvOrg()->getCrAddr();
+                orig = picorig->getCrAddr();
+                stride = picorig->getCStride();
+                width =  ((picorig->getWidth()  >> 4) << 4) >> cache.hshift;
+                height = ((picorig->getHeight() >> 4) << 4) >> cache.vshift;
+                if (bMotionCompensate)
+                {
+                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
+                    fref = mcbuf;
+                }
+                break;
+
+            default:
+                return;
+            }
+
+            wpScalingParam w;
+            w.setFromWeightAndOffset((int)(guessScale[yuv] * (1 << fw[yuv].log2WeightDenom) + 0.5), 0, fw[yuv].log2WeightDenom);
+            int mindenom = w.log2WeightDenom;
+            int minscale = w.inputWeight;
+            int minoff = 0;
+
+            uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !yuv);
+            if (!origscore)
+                continue;
+
+            uint32_t minscore = origscore;
+            bool bFound = false;
+            static const int sD = 4; // scale distance
+            static const int oD = 2; // offset distance
+            for (int is = minscale - sD; is <= minscale + sD; is++)
+            {
+                int deltaWeight = is - (1 << mindenom);
+                if (deltaWeight > 127 || deltaWeight <= -128)
+                    continue;
+
+                int curScale = is;
+                int curOffset = (int)(fencMean[yuv] - refMean[yuv] * curScale / (1 << mindenom) + 0.5f);
+                if (curOffset < -128 || curOffset > 127)
+                {
+                    /* Rescale considering the constraints on curOffset. We do it in this order
+                        * because scale has a much wider range than offset (because of denom), so
+                        * it should almost never need to be clamped. */
+                    curOffset = Clip3(-128, 127, curOffset);
+                    curScale = (int)((1 << mindenom) * (fencMean[yuv] - curOffset) / refMean[yuv] + 0.5f);
+                    curScale = Clip3(0, 127, curScale);
+                }
+
+                for (int ioff = curOffset - oD; (ioff <= (curOffset + oD)) && (ioff < 127); ioff++)
+                {
+                    if (yuv)
+                    {
+                        int pred = (128 - ((128 * curScale) >> (mindenom)));
+                        int deltaOffset = ioff - pred; // signed 10bit
+                        if (deltaOffset < -512 || deltaOffset > 511)
+                            continue;
+                        ioff = Clip3(-128, 127, (deltaOffset + pred)); // signed 8bit
+                    }
+                    else
+                    {
+                        ioff = Clip3(-128, 127, ioff);
+                    }
+
+                    SET_WEIGHT(w, true, curScale, mindenom, ioff);
+                    uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &w, !yuv) +
+                                    sliceHeaderCost(&w, lambda, !!yuv);
+                    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, ioff, bFound, true);
+                    if (minoff == curOffset - oD && ioff != curOffset - oD)
+                        break;
+                }
+            }
+
+            if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
+            {
+                fw[yuv].bPresentFlag = false;
+                fw[yuv].inputWeight = 1 << fw[yuv].log2WeightDenom;
+            }
+            else
+            {
+                SET_WEIGHT(fw[yuv], true, minscale, mindenom, minoff);
+                bWeightRef = true;
+            }
+        }
+
+        if (bWeightRef)
+        {
+            // Make sure both chroma channels match
+            if (fw[1].bPresentFlag != fw[2].bPresentFlag)
+            {
+                if (fw[1].bPresentFlag)
+                    fw[2] = fw[1];
+                else
+                    fw[1] = fw[2];
             }
         }
     }
@@ -547,35 +486,30 @@
 namespace x265 {
 void weightAnalyse(TComSlice& slice, x265_param& param)
 {
-    weightp::Cache cache;
-    memset(&cache, 0, sizeof(cache));
+    wpScalingParam wp[2][MAX_NUM_REF][3];
+    int numPredDir = slice.isInterP() ? 1 : 2;
+    TComPicYuv *orig = slice.getPic()->getPicYuvOrg();
+    pixel *temp = X265_MALLOC(pixel, 2 * orig->getStride() * orig->getHeight());
 
-    prepareRef(cache, slice, param);
-    if (cache.weightTemp)
+    if (temp)
     {
         int denom = slice.getNumRefIdx(REF_PIC_LIST_0) > 3 ? 7 : 6;
-        do
-        {
-            if (weightp::tryCommonDenom(slice, cache, denom))
-                break;
-            denom--; // decrement to satisfy the range limitation 
-        }
-        while (denom > 0);
+        weightp::tryCommonDenom(slice, param, wp, temp, denom);
+        X265_FREE(temp);
     }
     else
     {
-        for (int list = 0; list < cache.numPredDir; list++)
+        for (int list = 0; list < numPredDir; list++)
         {
             for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
             {
-                SET_WEIGHT(cache.wp[list][ref][0], false, 1, 0, 0);
-                SET_WEIGHT(cache.wp[list][ref][1], false, 1, 0, 0);
-                SET_WEIGHT(cache.wp[list][ref][2], false, 1, 0, 0);
+                SET_WEIGHT(wp[list][ref][0], false, 1, 0, 0);
+                SET_WEIGHT(wp[list][ref][1], false, 1, 0, 0);
+                SET_WEIGHT(wp[list][ref][2], false, 1, 0, 0);
             }
         }
     }
-    tearDown(cache, slice);
-    slice.setWpScaling(cache.wp);
+    slice.setWpScaling(wp);
 
     if (param.logLevel >= X265_LOG_FULL)
     {
@@ -584,23 +518,20 @@
         bool bWeighted = false;
 
         p = sprintf(buf, "poc: %d weights:", slice.getPOC());
-        for (int list = 0; list < cache.numPredDir; list++)
+        for (int list = 0; list < numPredDir; list++)
         {
-            for (int ref = 0; ref < slice.getNumRefIdx(list); ref++)
+            wpScalingParam* w = &wp[list][0][0];
+            if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
             {
-                wpScalingParam* w = &cache.wp[list][ref][0];
-                if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
-                {
-                    bWeighted = true;
-                    p += sprintf(buf + p, " [L%d:R%d ", list, ref);
-                    if (w[0].bPresentFlag)
-                        p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
-                    if (w[1].bPresentFlag)
-                        p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
-                    if (w[2].bPresentFlag)
-                        p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
-                    p += sprintf(buf + p, "]");
-                }
+                bWeighted = true;
+                p += sprintf(buf + p, " [L%d:R0 ", list);
+                if (w[0].bPresentFlag)
+                    p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
+                if (w[1].bPresentFlag)
+                    p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
+                if (w[2].bPresentFlag)
+                    p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
+                p += sprintf(buf + p, "]");
             }
         }
 


More information about the x265-devel mailing list