[x265-commits] [x265] slicetype: CuTree Implementation for AQ RateControl

Gopu Govindaswamy gopu at multicorewareinc.com
Mon Dec 2 07:22:32 CET 2013


details:   http://hg.videolan.org/x265/rev/c75c3431b108
branches:  
changeset: 5414:c75c3431b108
user:      Gopu Govindaswamy <gopu at multicorewareinc.com>
date:      Mon Dec 02 11:48:10 2013 +0530
description:
slicetype: CuTree Implementation for AQ RateControl

Added the following methods to slicetype for the CuTree implementation:
1. cuTree - entry point for CuTree
2. estimateCUPropagate and estimateCUPropagateCost - calculate the propagate cost for each CU
3. cuTreeFinish - update qpOffset using the precomputed propagateCost, weightedCostDelta and lookahead costs

Added a cuTree option to param->rc and left it disabled by default, since CuTree is still under construction.

diffstat:

 source/common/common.cpp        |    1 +
 source/common/lowres.cpp        |    6 +-
 source/common/lowres.h          |    4 +
 source/encoder/frameencoder.cpp |    4 +
 source/encoder/ratecontrol.h    |    8 +
 source/encoder/slicetype.cpp    |  250 +++++++++++++++++++++++++++++++++++++++-
 source/encoder/slicetype.h      |    7 +
 source/x265.h                   |    1 +
 8 files changed, 278 insertions(+), 3 deletions(-)

diffs (truncated from 426 to 300 lines):

diff -r 189ac76266a9 -r c75c3431b108 source/common/common.cpp
--- a/source/common/common.cpp	Mon Dec 02 00:12:58 2013 -0600
+++ b/source/common/common.cpp	Mon Dec 02 11:48:10 2013 +0530
@@ -219,6 +219,7 @@ void x265_param_default(x265_param *para
     param->rc.qp = 32;
     param->rc.aqMode = X265_AQ_NONE;
     param->rc.aqStrength = 1.0;
+    param->rc.cuTree = 0;
 
     /* Quality Measurement Metrics */
     param->bEnablePsnr = 1;
diff -r 189ac76266a9 -r c75c3431b108 source/common/lowres.cpp
--- a/source/common/lowres.cpp	Mon Dec 02 00:12:58 2013 -0600
+++ b/source/common/lowres.cpp	Mon Dec 02 11:48:10 2013 +0530
@@ -47,9 +47,11 @@ void Lowres::create(TComPicYuv *orig, in
     {
         qpAqOffset = (double*)x265_malloc(sizeof(double) * cuCount);
         invQscaleFactor = (int*)x265_malloc(sizeof(int) * cuCount);
-        if (!qpAqOffset || !invQscaleFactor)
+        qpOffset = (double*)x265_malloc(sizeof(double) * cuCount);
+        if (!qpAqOffset || !invQscaleFactor || !qpOffset)
             *aqMode = 0;
     }
+    propagateCost = (uint16_t*)x265_malloc(sizeof(uint16_t) * cuCount);
 
     /* allocate lowres buffers */
     for (int i = 0; i < 4; i++)
@@ -111,6 +113,8 @@ void Lowres::destroy(int bframes)
 
     X265_FREE(qpAqOffset);
     X265_FREE(invQscaleFactor);
+    X265_FREE(qpOffset);
+    X265_FREE(propagateCost);
 }
 
 // (re) initialize lowres state
diff -r 189ac76266a9 -r c75c3431b108 source/common/lowres.h
--- a/source/common/lowres.h	Mon Dec 02 00:12:58 2013 -0600
+++ b/source/common/lowres.h	Mon Dec 02 11:48:10 2013 +0530
@@ -126,9 +126,13 @@ struct Lowres : public ReferencePlanes
     /* rate control / adaptive quant data */
     double*   qpAqOffset;      // qp Aq offset values for each Cu
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    double*   qpOffset;
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
 
+    uint16_t* propagateCost;
+    double    weightedCostDelta[X265_BFRAME_MAX+2];
+
     void create(TComPicYuv *orig, int bframes, int32_t *aqMode);
     void destroy(int bframes);
     void init(TComPicYuv *orig, int poc, int sliceType, int bframes);
diff -r 189ac76266a9 -r c75c3431b108 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Mon Dec 02 00:12:58 2013 -0600
+++ b/source/encoder/frameencoder.cpp	Mon Dec 02 11:48:10 2013 +0530
@@ -1118,6 +1118,10 @@ int FrameEncoder::calcQpForCu(TComPic *p
             for (int w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++)
             {
                 qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
+                if (pic->getSlice()->isReferenced() && m_cfg->param.rc.cuTree && m_cfg->param.rc.aqMode)
+                    qp_offset += pic->m_lowres.qpOffset[block_x + w + (block_y * maxBlockCols)];
+                else
+                    qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
                 cnt++;
             }
         }
diff -r 189ac76266a9 -r c75c3431b108 source/encoder/ratecontrol.h
--- a/source/encoder/ratecontrol.h	Mon Dec 02 00:12:58 2013 -0600
+++ b/source/encoder/ratecontrol.h	Mon Dec 02 11:48:10 2013 +0530
@@ -36,6 +36,14 @@ class Encoder;
 class TComPic;
 class TEncCfg;
 
+#define BASE_FRAME_DURATION 0.04
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION 1.00
+#define MIN_FRAME_DURATION 0.01
+
+#define CLIP_DURATION(f) Clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
+
 struct RateControlEntry
 {
     int sliceType;
diff -r 189ac76266a9 -r c75c3431b108 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Dec 02 00:12:58 2013 -0600
+++ b/source/encoder/slicetype.cpp	Mon Dec 02 11:48:10 2013 +0530
@@ -31,6 +31,7 @@
 #include "slicetype.h"
 #include "motion.h"
 #include "mv.h"
+#include "ratecontrol.h"
 
 #define LOWRES_COST_MASK  ((1 << 14) - 1)
 #define LOWRES_COST_SHIFT 14
@@ -86,6 +87,8 @@ Lookahead::Lookahead(TEncCfg *_cfg, Thre
         lhrows[i].widthInCU = widthInCU;
         lhrows[i].heightInCU = heightInCU;
     }
+
+    scratch = (int*)x265_malloc(widthInCU * sizeof(int));
 }
 
 Lookahead::~Lookahead()
@@ -127,6 +130,7 @@ void Lookahead::destroy()
     {
         x265_free(wbuffer[i]);
     }
+    X265_FREE(scratch);
 }
 
 void Lookahead::addPicture(TComPic *pic, int sliceType)
@@ -265,6 +269,7 @@ void Lookahead::weightsAnalyse(int b, in
 
     fenc = frames[b];
     ref  = frames[p0];
+    int deltaIndex = fenc->frameNum - ref->frameNum;
 
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     const float epsilon = 1.f / 128.f;
@@ -320,6 +325,8 @@ void Lookahead::weightsAnalyse(int b, in
     else
     {
         SET_WEIGHT(w, 1, minscale, mindenom, minoff);
+        // set weighted delta cost
+        fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
 
         int offset = w.inputOffset << (X265_DEPTH - 8);
         int scale = w.inputWeight;
@@ -912,7 +919,8 @@ void Lookahead::slicetypeAnalyse(bool bK
 
     if (!framecnt)
     {
-        // TODO: mb-tree
+        if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+            cuTree(frames, 0, bKeyframe);
         return;
     }
 
@@ -1045,7 +1053,9 @@ void Lookahead::slicetypeAnalyse(bool bK
         num_bframes = 0;
     }
 
-    // TODO if rc.b_mb_tree Enabled the need to call  x264_macroblock_tree currently Ignored the call
+    if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+        cuTree(frames, X265_MIN(num_frames, cfg->param.keyframeMax), bKeyframe);
+
     // if (!cfg->param.bIntraRefresh)
     for (int j = keyint_limit + 1; j <= num_frames; j += cfg->param.keyframeMax)
     {
@@ -1262,3 +1272,239 @@ void Lookahead::processRow(int row)
         rowsCompleted = true;
     }
 }
+
+void Lookahead::cuTree(Lowres **frames, int numframes, bool bintra)
+{
+    int idx = !bintra;
+    int lastnonb, curnonb = 1;
+    int bframes = 0;
+
+    x265_emms();
+    double totalDuration = 0.0;
+    for (int j = 0; j <= numframes; j++)
+        totalDuration += 1.0 / cfg->param.frameRate;
+    double averageDuration = totalDuration / (numframes + 1);
+
+    int i = numframes;
+    int cuCount = widthInCU * heightInCU;
+
+    if (bintra)
+        estimateFrameCost(0, 0, 0, 0);
+
+    while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
+        i--;
+    lastnonb = i;
+
+    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
+     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
+     * lookahead=0, so that's what's currently implemented. */
+    if (!cfg->param.lookaheadDepth)
+    {
+        if (bintra)
+        {
+            memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
+            memcpy(frames[0]->qpOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
+            return;
+        }
+        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
+        memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
+    }
+    else
+    {
+        if (lastnonb < idx)
+            return;
+        memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
+    }
+
+    while (i-- > idx)
+    {
+        curnonb = i;
+        while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
+            curnonb--;
+        if (curnonb < idx)
+            break;
+
+        estimateFrameCost(curnonb, lastnonb, lastnonb, 0);
+        memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
+        bframes = lastnonb - curnonb - 1;
+        if (cfg->param.bBPyramid && bframes > 1)
+        {
+            int middle = (bframes + 1) / 2 + curnonb;
+            estimateFrameCost(curnonb, lastnonb, middle, 0);
+            memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
+            while (i > curnonb)
+            {
+                int p0 = i > middle ? middle : curnonb;
+                int p1 = i < middle ? middle : lastnonb;
+                if (i != middle)
+                {
+                    estimateFrameCost(p0, p1, i, 0);
+                    estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
+                }
+                i--;
+            }
+            estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
+        }
+        else
+        {
+            while (i > curnonb)
+            {
+                estimateFrameCost(curnonb, lastnonb, i, 0);
+                estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
+                i--;
+            }
+        }
+        estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
+        lastnonb = curnonb;
+    }
+
+    if (!cfg->param.lookaheadDepth)
+    {
+        estimateFrameCost(0, lastnonb, lastnonb, 0);
+        estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
+        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
+    }
+
+    cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
+    if (cfg->param.bBPyramid && bframes > 1 /* && !h->param.rc.i_vbv_buffer_size */)
+        cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
+}
+
+void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
+{
+    uint16_t *refCosts[2] = {frames[p0]->propagateCost, frames[p1]->propagateCost};
+    int distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
+    int bipredWeight = cfg->param.bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
+    MV *mvs[2] = {frames[b]->lowresMvs[0][b - p0 -1], frames[b]->lowresMvs[1][p1 - b - 1]};
+    int bipredWeights[2] = {bipredWeight, 64 - bipredWeight};
+    memset(scratch, 0, widthInCU * sizeof(int));
+
+    uint16_t *propagate_cost = frames[b]->propagateCost;
+
+    x265_emms();
+    double fpsFactor = CLIP_DURATION(1.0 / cfg->param.frameRate) / CLIP_DURATION(averageDuration);
+
+    /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */
+    if (!referenced)
+        memset(frames[b]->propagateCost, 0, widthInCU * sizeof(uint16_t));
+
+    uint16_t StrideInCU = (uint16_t)widthInCU;
+    for (uint16_t block_y = 0; block_y < heightInCU; block_y += 16)
+    {
+        int cuIndex = block_y * StrideInCU;
+        /* TODO This function go into ASM */
+        estimateCUPropagateCost(scratch, propagate_cost,
+            frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
+            frames[b]->invQscaleFactor + cuIndex, &fpsFactor, widthInCU);
+
+        if (referenced)
+            propagate_cost += widthInCU;
+        for (uint16_t block_x = 0; block_x < widthInCU; block_x += 16, cuIndex++)
+        {
+            int propagate_amount = scratch[block_x];
+            /* Don't propagate for an intra block. */
+            if (propagate_amount > 0)
+            {
+                /* Access width-2 bitfield. */
+                int lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
+                /* Follow the MVs to the previous frame(s). */
+                for (uint16_t list = 0; list < 2; list++)
+                    if ((lists_used >> list) & 1)
+                    {
+#define CLIP_ADD(s, x) (s) = X265_MIN((s) + (x),(1 << 16) - 1)
+                        uint16_t listamount = (uint16_t)propagate_amount;
+                        /* Apply bipred weighting. */
+                        if (lists_used == 3)
+                            listamount = (uint16_t)(listamount * bipredWeights[list] + 32) >> 6;
+


More information about the x265-commits mailing list