[x265] [PATCH] slicetype: CuTree Implementation for AQ RateControl

Gopu Govindaswamy gopu at multicorewareinc.com
Wed Nov 27 12:50:57 CET 2013


# HG changeset patch
# User Gopu Govindaswamy <gopu at multicorewareinc.com>
# Date 1385553019 -19800
# Node ID 24b3e80a86f7511e7a17cb2cf57fb6bcad993603
# Parent  ece323e1b6035c125b8d1e892e02cca84917990b
slicetype: CuTree Implementation for AQ RateControl

Added the following methods to slicetype for the CuTree implementation:
1. cuTree - entry point for CuTree
2. estimateCUPropagate and estimateCUPropagateCost - calculate the propagate cost for each CU
3. cuTreeFinish - set the qpOffset using the precomputed propagateCost, weightedCostDelta and lookahead costs

Added a cuTree option to param->rc; it is disabled by default because cuTree is still under construction.

diff -r ece323e1b603 -r 24b3e80a86f7 source/common/common.cpp
--- a/source/common/common.cpp	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/common/common.cpp	Wed Nov 27 17:20:19 2013 +0530
@@ -216,6 +216,7 @@
     param->rc.qp = 32;
     param->rc.aqMode = X265_AQ_NONE;
     param->rc.aqStrength = 1.0;
+    param->rc.cuTree = 0;
 
     /* Quality Measurement Metrics */
     param->bEnablePsnr = 1;
diff -r ece323e1b603 -r 24b3e80a86f7 source/common/lowres.cpp
--- a/source/common/lowres.cpp	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/common/lowres.cpp	Wed Nov 27 17:20:19 2013 +0530
@@ -47,10 +47,13 @@
     {
         qpAqOffset = (double*)x265_malloc(sizeof(double) * cuCount);
         invQscaleFactor = (int*)x265_malloc(sizeof(int) * cuCount);
-        if (!qpAqOffset || !invQscaleFactor)
+        qpOffset = (double*)x265_malloc(sizeof(double) * cuCount);
+        if (!qpAqOffset || !invQscaleFactor || !qpOffset)
             *aqMode = 0;
     }
 
+    propagateCost = (uint16_t*)x265_malloc(sizeof(uint16_t) * cuCount);
+
     /* allocate lowres buffers */
     for (int i = 0; i < 4; i++)
     {
@@ -111,6 +114,8 @@
 
     X265_FREE(qpAqOffset);
     X265_FREE(invQscaleFactor);
+    X265_FREE(qpOffset);
+    X265_FREE(propagateCost);
 }
 
 // (re) initialize lowres state
diff -r ece323e1b603 -r 24b3e80a86f7 source/common/lowres.h
--- a/source/common/lowres.h	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/common/lowres.h	Wed Nov 27 17:20:19 2013 +0530
@@ -128,6 +128,10 @@
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
+    double*   qpOffset;
+
+    uint16_t* propagateCost;
+    double    weightedCostDelta[X265_BFRAME_MAX+2];
 
     void create(TComPicYuv *orig, int bframes, int32_t *aqMode);
     void destroy(int bframes);
diff -r ece323e1b603 -r 24b3e80a86f7 source/encoder/ratecontrol.h
--- a/source/encoder/ratecontrol.h	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/encoder/ratecontrol.h	Wed Nov 27 17:20:19 2013 +0530
@@ -44,6 +44,14 @@
     double qRceq;
 };
 
+#define BASE_FRAME_DURATION 0.04 /* 0.04 == 1/25; not referenced by the code added in this patch */
+
+/* Arbitrary limitations as a sanity check; applied through CLIP_DURATION to 1.0 / frameRate and to the average frame duration. */
+#define MAX_FRAME_DURATION 1.00
+#define MIN_FRAME_DURATION 0.01
+
+#define CLIP_DURATION(f) Clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f) /* presumably clamps f to [MIN, MAX] -- confirm Clip3 argument order */
+
 struct RateControl
 {
     TComSlice *curSlice;      /* all info about the current frame */
diff -r ece323e1b603 -r 24b3e80a86f7 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/encoder/slicetype.cpp	Wed Nov 27 17:20:19 2013 +0530
@@ -31,6 +31,7 @@
 #include "slicetype.h"
 #include "motion.h"
 #include "mv.h"
+#include "ratecontrol.h"
 
 #define LOWRES_COST_MASK  ((1 << 14) - 1)
 #define LOWRES_COST_SHIFT 14
@@ -86,6 +87,8 @@
         lhrows[i].widthInCU = widthInCU;
         lhrows[i].heightInCU = heightInCU;
     }
+
+    scratch = (int*)x265_malloc(widthInCU * sizeof(int));
 }
 
 Lookahead::~Lookahead()
@@ -127,6 +130,8 @@
     {
         x265_free(wbuffer[i]);
     }
+    X265_FREE(scratch);
+
 }
 
 void Lookahead::addPicture(TComPic *pic, int sliceType)
@@ -262,6 +267,7 @@
     Lowres *fenc, *ref;
     fenc = frames[b];
     ref  = frames[p0];
+    int deltaIndex = fenc->frameNum - ref->frameNum;
 
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     const float epsilon = 1.f / 128.f;
@@ -318,6 +324,9 @@
     {
         SET_WEIGHT(w, 1, minscale, mindenom, minoff);
 
+        // set weighted delta cost
+        fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
+
         int offset = w.inputOffset << (X265_DEPTH - 8);
         int scale = w.inputWeight;
         int denom = w.log2WeightDenom;
@@ -906,7 +915,8 @@
 
     if (!framecnt)
     {
-        // TODO: mb-tree
+        if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+            cuTree(frames, 0, bKeyframe);
         return;
     }
 
@@ -1039,8 +1049,9 @@
         num_bframes = 0;
     }
 
-    // TODO if rc.b_mb_tree Enabled the need to call  x264_macroblock_tree currently Ignored the call
-    // if (!cfg->param.bIntraRefresh)
+    if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+        cuTree(frames, X265_MIN(num_frames, cfg->param.keyframeMax), bKeyframe);
+
     for (int j = keyint_limit + 1; j <= num_frames; j += cfg->param.keyframeMax)
     {
         frames[j]->sliceType = X265_TYPE_I;
@@ -1256,3 +1267,239 @@
         rowsCompleted = true;
     }
 }
+
+/* Entry point of the CU-tree pass: walks frames[] backwards from the last non-B frame, propagating each frame's
+ * cost into the frames it references, then turns the accumulated propagateCost into qpOffset via cuTreeFinish(). */
+void Lookahead::cuTree(Lowres **frames, int numframes, bool bintra)
+{
+    const int firstIdx = !bintra;
+    int prevNonB, thisNonB = 1;
+    int numB = 0;
+
+    x265_emms();
+    double totalDuration = 0.0;
+    for (int j = 0; j <= numframes; j++)
+        totalDuration += 1.0 / cfg->param.frameRate;
+    const double averageDuration = totalDuration / (numframes + 1);
+
+    int cur = numframes;
+    const int cuCount = widthInCU * heightInCU;
+    const size_t costBytes = cuCount * sizeof(uint16_t);
+    if (bintra)
+        estimateFrameCost(0, 0, 0, 0);
+
+    while (cur > 0 && frames[cur]->sliceType == X265_TYPE_B)
+        cur--;
+    prevNonB = cur;
+
+    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
+     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
+     * lookahead=0, so that's what's currently implemented. */
+    if (cfg->param.lookaheadDepth)
+    {
+        if (prevNonB < firstIdx)
+            return;
+        memset(frames[prevNonB]->propagateCost, 0, costBytes);
+    }
+    else if (bintra)
+    {
+        memset(frames[0]->propagateCost, 0, costBytes);
+        memcpy(frames[0]->qpOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
+        return;
+    }
+    else
+    {
+        std::swap(frames[prevNonB]->propagateCost, frames[0]->propagateCost);
+        memset(frames[0]->propagateCost, 0, costBytes);
+    }
+
+    while (cur-- > firstIdx)
+    {
+        thisNonB = cur;
+        while (frames[thisNonB]->sliceType == X265_TYPE_B && thisNonB > 0)
+            thisNonB--;
+        if (thisNonB < firstIdx)
+            break;
+
+        estimateFrameCost(thisNonB, prevNonB, prevNonB, 0);
+        memset(frames[thisNonB]->propagateCost, 0, costBytes);
+        numB = prevNonB - thisNonB - 1;
+        if (cfg->param.bpyramid && numB > 1)
+        {
+            /* The middle B frame serves as a reference for the B frames on either side of it. */
+            const int mid = (numB + 1) / 2 + thisNonB;
+            estimateFrameCost(thisNonB, prevNonB, mid, 0);
+            memset(frames[mid]->propagateCost, 0, costBytes);
+            for (; cur > thisNonB; cur--)
+            {
+                if (cur == mid)
+                    continue;
+                const int p0 = cur > mid ? mid : thisNonB;
+                const int p1 = cur < mid ? mid : prevNonB;
+                estimateFrameCost(p0, p1, cur, 0);
+                estimateCUPropagate(frames, averageDuration, p0, p1, cur, 0);
+            }
+            estimateCUPropagate(frames, averageDuration, thisNonB, prevNonB, mid, 1);
+        }
+        else
+        {
+            for (; cur > thisNonB; cur--)
+            {
+                estimateFrameCost(thisNonB, prevNonB, cur, 0);
+                estimateCUPropagate(frames, averageDuration, thisNonB, prevNonB, cur, 0);
+            }
+        }
+        estimateCUPropagate(frames, averageDuration, thisNonB, prevNonB, prevNonB, 1);
+        prevNonB = thisNonB;
+    }
+
+    if (!cfg->param.lookaheadDepth)
+    {
+        estimateFrameCost(0, prevNonB, prevNonB, 0);
+        estimateCUPropagate(frames, averageDuration, 0, prevNonB, prevNonB, 1);
+        std::swap(frames[prevNonB]->propagateCost, frames[0]->propagateCost);
+    }
+
+    cuTreeFinish(frames[prevNonB], averageDuration, prevNonB);
+    if (cfg->param.bpyramid && numB > 1 /* && !h->param.rc.i_vbv_buffer_size */)
+        cuTreeFinish(frames[prevNonB + (numB + 1) / 2], averageDuration, 0);
+}
+
+/* Propagate the costs of frame b back into its references p0 (list 0) and p1 (list 1): each CU's propagate
+ * amount is added to the reference CU(s) that the CU's own lowres motion vector points at.  referenced is
+ * non-zero when frame b is itself a reference and therefore carries incoming propagateCost values. */
+void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
+{
+    uint16_t *refCosts[2] = {frames[p0]->propagateCost, frames[p1]->propagateCost};
+    int distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
+    int bipredWeight = cfg->param.bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
+    MV *mvs[2] = {frames[b]->lowresMvs[0][b - p0 -1], frames[b]->lowresMvs[1][p1 - b - 1]};
+    int bipredWeights[2] = {bipredWeight, 64 - bipredWeight};
+    memset(scratch, 0, widthInCU * sizeof(int));
+    uint16_t *propagate_cost = frames[b]->propagateCost;
+
+    x265_emms();
+    double fpsFactor = CLIP_DURATION(1.0 / cfg->param.frameRate) / CLIP_DURATION(averageDuration);
+
+    /* For non-referenced frames the source costs are always zero, so just memset one row and re-use it. */
+    if (!referenced)
+        memset(frames[b]->propagateCost, 0, widthInCU * sizeof(uint16_t));
+
+    /* The per-CU arrays hold widthInCU entries per row (see propagate_cost below), so that is the row stride. */
+    const int strideInCU = widthInCU;
+    for (int block_y = 0; block_y < heightInCU; block_y++)
+    {
+        int cuIndex = block_y * strideInCU;
+        /* TODO This function go into ASM */
+        estimateCUPropagateCost(scratch, propagate_cost,
+            frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
+            frames[b]->invQscaleFactor + cuIndex, &fpsFactor, widthInCU);
+
+        if (referenced)
+            propagate_cost += widthInCU;
+        for (int block_x = 0; block_x < widthInCU; block_x++, cuIndex++)
+        {
+            int propagate_amount = scratch[block_x];
+            /* Don't propagate for an intra block. */
+            if (propagate_amount <= 0)
+                continue;
+            /* Access width-2 bitfield. */
+            int lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
+            /* Follow the MVs to the previous frame(s). */
+            for (int list = 0; list < 2; list++)
+                if ((lists_used >> list) & 1)
+                {
+#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1)
+                    /* Stay in int: a 16-bit cast before the >> 6 below would drop the product's high bits. */
+                    int listamount = propagate_amount;
+                    /* Apply bipred weighting. */
+                    if (lists_used == 3)
+                        listamount = (listamount * bipredWeights[list] + 32) >> 6;
+
+                    /* Early termination for simple case of mv0; each CU is looked up by its own index. */
+                    if (!mvs[list][cuIndex].notZero())
+                    {
+                        CLIP_ADD(refCosts[list][cuIndex], listamount);
+                        continue;
+                    }
+
+                    /* Signed on purpose: the >= 0 frame-bounds tests below are meaningless on unsigned values. */
+                    int x = mvs[list][cuIndex].x;
+                    int y = mvs[list][cuIndex].y;
+                    int cux = (x >> 5) + block_x;
+                    int cuy = (y >> 5) + block_y;
+                    int idx0 = cux + cuy * strideInCU;
+                    x &= 31;
+                    y &= 31;
+                    int idx0weight = (32 - y) * (32 - x);
+                    int idx1weight = (32 - y) * x;
+                    int idx2weight = y * (32 - x);
+                    int idx3weight = y * x;
+
+                    /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
+                     * be counted. */
+                    if (cux < widthInCU - 1 && cuy < heightInCU - 1 && cux >= 0 && cuy >= 0)
+                    {
+                        CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
+                        CLIP_ADD(refCosts[list][idx0 + 1], (listamount * idx1weight + 512) >> 10);
+                        CLIP_ADD(refCosts[list][idx0 + strideInCU], (listamount * idx2weight + 512) >> 10);
+                        CLIP_ADD(refCosts[list][idx0 + strideInCU + 1], (listamount * idx3weight + 512) >> 10);
+                    }
+                    else /* Check offsets individually */
+                    {
+                        if (cux < widthInCU && cuy < heightInCU && cux >= 0 && cuy >= 0)
+                            CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
+                        if (cux + 1 < widthInCU && cuy < heightInCU && cux + 1 >= 0 && cuy >= 0)
+                            CLIP_ADD(refCosts[list][idx0 + 1], (listamount * idx1weight + 512) >> 10);
+                        if (cux < widthInCU && cuy + 1 < heightInCU && cux >= 0 && cuy + 1 >= 0)
+                            CLIP_ADD(refCosts[list][idx0 + strideInCU], (listamount * idx2weight + 512) >> 10);
+                        if (cux + 1 < widthInCU && cuy + 1 < heightInCU && cux + 1 >= 0 && cuy + 1 >= 0)
+                            CLIP_ADD(refCosts[list][idx0 + strideInCU + 1], (listamount * idx3weight + 512) >> 10);
+                    }
+                }
+        }
+    }
+
+    if(/*h->param.rc.i_vbv_buffer_size &&*/ cfg->param.logLevel && referenced)
+        cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
+}
+
+/* Convert the propagateCost accumulated on frame into per-CU qpOffset values.  ref0Distance selects the
+ * weightedCostDelta entry that is added to the log2 ratio; 0 means no weighted-prediction term is applied.
+ * NOTE(review): this reads entry ref0Distance - 1, while the weight-analysis code in this patch stores at
+ * fenc->frameNum - ref->frameNum without the -1 -- confirm the two indices are meant to line up. */
+void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
+{
+    const int durationScale = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION(1.0 / cfg->param.frameRate) * 256);
+    const bool useDelta = ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0;
+    const double deltaBonus = useDelta ? (1.0 - frame->weightedCostDelta[ref0Distance - 1]) : 0.0;
+
+    /* Allow the strength to be adjusted via qcompress, since the two concepts are very similar. */
+    const double qpStrength = 5.0f * (1.0f - cfg->param.rc.qCompress);
+    const int numCUs = widthInCU * heightInCU;
+    for (int cu = 0; cu < numCUs; cu++)
+    {
+        const int scaledIntra = (frame->intraCost[cu] * frame->invQscaleFactor[cu] + 128) >> 8;
+        if (!scaledIntra)
+            continue; /* qpOffset[cu] is left untouched for a CU with zero scaled intra cost */
+
+        const int scaledPropagate = (frame->propagateCost[cu] * durationScale + 128) >> 8;
+        const double logRatio = X265_LOG2(scaledIntra + scaledPropagate) - X265_LOG2(scaledIntra) + deltaBonus;
+        frame->qpOffset[cu] = frame->qpAqOffset[cu] - qpStrength * logRatio;
+    }
+}
+
+/* Estimate the total amount of influence on future quality that could be had if we were to improve the
+ * reference samples used to inter predict any given CU.  Writes len values to dst; a CU whose intra cost
+ * is zero gets 0 rather than a division by zero followed by an undefined NaN/inf-to-int conversion. */
+void Lookahead::estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len)
+{
+    const double fps = *fpsFactor / 256; /* presumably undoes the 8-bit scale carried by invQscales -- confirm */
+    for (int i = 0; i < len; i++)
+    {
+        const double denom  = (double)intraCosts[i];
+        const double amount = (double)propagateIn[i] + (double)(intraCosts[i] * invQscales[i]) * fps;
+        const double num    = denom - (interCosts[i] & LOWRES_COST_MASK);
+        dst[i] = intraCosts[i] ? (int)(amount * num / denom + 0.5) : 0; /* callers skip amounts <= 0 */
+    }
+}
diff -r ece323e1b603 -r 24b3e80a86f7 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/encoder/slicetype.h	Wed Nov 27 17:20:19 2013 +0530
@@ -85,6 +85,8 @@
     pixel           *wbuffer[4];
     int              paddedLines;
 
+    int             *scratch; // temp buffer
+
     PicList inputQueue;  // input pictures in order received
     PicList outputQueue; // pictures to be encoded, in encode order
 
@@ -116,6 +118,12 @@
 
     void weightsAnalyse(int b, int p0);
     uint32_t weightCostLuma(int b, pixel *src, wpScalingParam *w);
+
+    void cuTree(Lowres **frames, int numframes, bool bintra);
+    void estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
+    void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len);
+    void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
+
 };
 }
 
diff -r ece323e1b603 -r 24b3e80a86f7 source/x265.h
--- a/source/x265.h	Wed Nov 27 00:29:23 2013 -0600
+++ b/source/x265.h	Wed Nov 27 17:20:19 2013 +0530
@@ -597,6 +597,7 @@
 
         int       aqMode;                      ///< Adaptive QP (AQ)
         double    aqStrength;
+        int       cuTree;
     } rc;
 } x265_param;
 


More information about the x265-devel mailing list