[x265-commits] [x265] slicetype: CuTree Implementation for AQ RateControl
Gopu Govindaswamy
gopu at multicorewareinc.com
Mon Dec 2 07:22:32 CET 2013
details: http://hg.videolan.org/x265/rev/c75c3431b108
branches:
changeset: 5414:c75c3431b108
user: Gopu Govindaswamy <gopu at multicorewareinc.com>
date: Mon Dec 02 11:48:10 2013 +0530
description:
slicetype: CuTree Implementation for AQ RateControl
Added the following methods to slicetype for the CuTree implementation:
1. cuTree - entry point for CuTree
2. estimateCUPropagate and estimateCUPropagateCost - calculate the CU propagate cost for CUs
3. cuTreeFinish - update the qpOffset using the precomputed propagateCost, weightedCostDelta and lookahead costs
Added the cuTree option to param->rc and left it disabled by default, since cuTree is still under construction.
diffstat:
source/common/common.cpp | 1 +
source/common/lowres.cpp | 6 +-
source/common/lowres.h | 4 +
source/encoder/frameencoder.cpp | 4 +
source/encoder/ratecontrol.h | 8 +
source/encoder/slicetype.cpp | 250 +++++++++++++++++++++++++++++++++++++++-
source/encoder/slicetype.h | 7 +
source/x265.h | 1 +
8 files changed, 278 insertions(+), 3 deletions(-)
diffs (truncated from 426 to 300 lines):
diff -r 189ac76266a9 -r c75c3431b108 source/common/common.cpp
--- a/source/common/common.cpp Mon Dec 02 00:12:58 2013 -0600
+++ b/source/common/common.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -219,6 +219,7 @@ void x265_param_default(x265_param *para
param->rc.qp = 32;
param->rc.aqMode = X265_AQ_NONE;
param->rc.aqStrength = 1.0;
+ param->rc.cuTree = 0;
/* Quality Measurement Metrics */
param->bEnablePsnr = 1;
diff -r 189ac76266a9 -r c75c3431b108 source/common/lowres.cpp
--- a/source/common/lowres.cpp Mon Dec 02 00:12:58 2013 -0600
+++ b/source/common/lowres.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -47,9 +47,11 @@ void Lowres::create(TComPicYuv *orig, in
{
qpAqOffset = (double*)x265_malloc(sizeof(double) * cuCount);
invQscaleFactor = (int*)x265_malloc(sizeof(int) * cuCount);
- if (!qpAqOffset || !invQscaleFactor)
+ qpOffset = (double*)x265_malloc(sizeof(double) * cuCount);
+ if (!qpAqOffset || !invQscaleFactor || !qpOffset)
*aqMode = 0;
}
+ propagateCost = (uint16_t*)x265_malloc(sizeof(uint16_t) * cuCount);
/* allocate lowres buffers */
for (int i = 0; i < 4; i++)
@@ -111,6 +113,8 @@ void Lowres::destroy(int bframes)
X265_FREE(qpAqOffset);
X265_FREE(invQscaleFactor);
+ X265_FREE(qpOffset);
+ X265_FREE(propagateCost);
}
// (re) initialize lowres state
diff -r 189ac76266a9 -r c75c3431b108 source/common/lowres.h
--- a/source/common/lowres.h Mon Dec 02 00:12:58 2013 -0600
+++ b/source/common/lowres.h Mon Dec 02 11:48:10 2013 +0530
@@ -126,9 +126,13 @@ struct Lowres : public ReferencePlanes
/* rate control / adaptive quant data */
double* qpAqOffset; // qp Aq offset values for each Cu
int* invQscaleFactor; // qScale values for qp Aq Offsets
+ double* qpOffset;
uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
uint64_t wp_sum[3];
+ uint16_t* propagateCost;
+ double weightedCostDelta[X265_BFRAME_MAX+2];
+
void create(TComPicYuv *orig, int bframes, int32_t *aqMode);
void destroy(int bframes);
void init(TComPicYuv *orig, int poc, int sliceType, int bframes);
diff -r 189ac76266a9 -r c75c3431b108 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Mon Dec 02 00:12:58 2013 -0600
+++ b/source/encoder/frameencoder.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -1118,6 +1118,10 @@ int FrameEncoder::calcQpForCu(TComPic *p
for (int w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++)
{
qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
+ if (pic->getSlice()->isReferenced() && m_cfg->param.rc.cuTree && m_cfg->param.rc.aqMode)
+ qp_offset += pic->m_lowres.qpOffset[block_x + w + (block_y * maxBlockCols)];
+ else
+ qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
cnt++;
}
}
diff -r 189ac76266a9 -r c75c3431b108 source/encoder/ratecontrol.h
--- a/source/encoder/ratecontrol.h Mon Dec 02 00:12:58 2013 -0600
+++ b/source/encoder/ratecontrol.h Mon Dec 02 11:48:10 2013 +0530
@@ -36,6 +36,14 @@ class Encoder;
class TComPic;
class TEncCfg;
+#define BASE_FRAME_DURATION 0.04
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION 1.00
+#define MIN_FRAME_DURATION 0.01
+
+#define CLIP_DURATION(f) Clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
+
struct RateControlEntry
{
int sliceType;
diff -r 189ac76266a9 -r c75c3431b108 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Mon Dec 02 00:12:58 2013 -0600
+++ b/source/encoder/slicetype.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -31,6 +31,7 @@
#include "slicetype.h"
#include "motion.h"
#include "mv.h"
+#include "ratecontrol.h"
#define LOWRES_COST_MASK ((1 << 14) - 1)
#define LOWRES_COST_SHIFT 14
@@ -86,6 +87,8 @@ Lookahead::Lookahead(TEncCfg *_cfg, Thre
lhrows[i].widthInCU = widthInCU;
lhrows[i].heightInCU = heightInCU;
}
+
+ scratch = (int*)x265_malloc(widthInCU * sizeof(int));
}
Lookahead::~Lookahead()
@@ -127,6 +130,7 @@ void Lookahead::destroy()
{
x265_free(wbuffer[i]);
}
+ X265_FREE(scratch);
}
void Lookahead::addPicture(TComPic *pic, int sliceType)
@@ -265,6 +269,7 @@ void Lookahead::weightsAnalyse(int b, in
fenc = frames[b];
ref = frames[p0];
+ int deltaIndex = fenc->frameNum - ref->frameNum;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
const float epsilon = 1.f / 128.f;
@@ -320,6 +325,8 @@ void Lookahead::weightsAnalyse(int b, in
else
{
SET_WEIGHT(w, 1, minscale, mindenom, minoff);
+ // set weighted delta cost
+ fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
int offset = w.inputOffset << (X265_DEPTH - 8);
int scale = w.inputWeight;
@@ -912,7 +919,8 @@ void Lookahead::slicetypeAnalyse(bool bK
if (!framecnt)
{
- // TODO: mb-tree
+ if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+ cuTree(frames, 0, bKeyframe);
return;
}
@@ -1045,7 +1053,9 @@ void Lookahead::slicetypeAnalyse(bool bK
num_bframes = 0;
}
- // TODO if rc.b_mb_tree Enabled the need to call x264_macroblock_tree currently Ignored the call
+ if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+ cuTree(frames, X265_MIN(num_frames, cfg->param.keyframeMax), bKeyframe);
+
// if (!cfg->param.bIntraRefresh)
for (int j = keyint_limit + 1; j <= num_frames; j += cfg->param.keyframeMax)
{
@@ -1262,3 +1272,239 @@ void Lookahead::processRow(int row)
rowsCompleted = true;
}
}
+
+void Lookahead::cuTree(Lowres **frames, int numframes, bool bintra) // CuTree entry point: walks frames[] backwards from the last non-B frame, accumulating propagateCost, then calls cuTreeFinish()
+{
+ int idx = !bintra; // lowest frame index visited by the main loop: 0 when bintra is set, 1 otherwise
+ int lastnonb, curnonb = 1;
+ int bframes = 0; // count of frames between curnonb and lastnonb; the last value is reused after the loop
+
+ x265_emms();
+ double totalDuration = 0.0;
+ for (int j = 0; j <= numframes; j++)
+ totalDuration += 1.0 / cfg->param.frameRate; // every frame contributes the same 1/frameRate duration
+ double averageDuration = totalDuration / (numframes + 1); // mean over the numframes + 1 entries summed above
+
+ int i = numframes; // start scanning from the last frame handed in
+ int cuCount = widthInCU * heightInCU; // number of entries cleared/copied in the per-CU arrays below
+
+ if (bintra)
+ estimateFrameCost(0, 0, 0, 0); // all three frame indices are 0: frame 0 is costed against itself
+
+ while (i > 0 && frames[i]->sliceType == X265_TYPE_B) // skip trailing B frames
+ i--;
+ lastnonb = i; // index of the last non-B frame (or 0 if the scan reached the front)
+
+ /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
+ * be applied to the end of a lookahead buffer of any size. However, it's most needed when
+ * lookahead=0, so that's what's currently implemented. */
+ if (!cfg->param.lookaheadDepth)
+ {
+ if (bintra)
+ {
+ memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); // nothing propagates into frame 0 here
+ memcpy(frames[0]->qpOffset, frames[0]->qpAqOffset, cuCount * sizeof(double)); // qpOffset becomes a plain copy of the AQ offsets
+ return;
+ }
+ std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); // exchanges the two buffer pointers, not their contents
+ memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); // clear the buffer now owned by frame 0
+ }
+ else
+ {
+ if (lastnonb < idx) // no non-B frame at or above idx: nothing to process
+ return;
+ memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); // the last non-B frame starts with zero propagated cost
+ }
+
+ while (i-- > idx) // one iteration per [curnonb, lastnonb] span, moving towards frame idx
+ {
+ curnonb = i;
+ while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0) // step back to the nearest non-B frame at or before i
+ curnonb--;
+ if (curnonb < idx)
+ break;
+
+ estimateFrameCost(curnonb, lastnonb, lastnonb, 0); // cost lastnonb with curnonb passed as p0
+ memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); // reset curnonb's accumulator before propagating into it
+ bframes = lastnonb - curnonb - 1; // frames strictly between the two non-B frames
+ if (cfg->param.bBPyramid && bframes > 1)
+ {
+ int middle = (bframes + 1) / 2 + curnonb; // index of the middle frame of the span
+ estimateFrameCost(curnonb, lastnonb, middle, 0);
+ memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t)); // middle also accumulates cost, so clear it first
+ while (i > curnonb)
+ {
+ int p0 = i > middle ? middle : curnonb; // frames after middle use middle as p0, the rest use curnonb
+ int p1 = i < middle ? middle : lastnonb; // frames before middle use middle as p1, the rest use lastnonb
+ if (i != middle) // middle itself is handled after this loop
+ {
+ estimateFrameCost(p0, p1, i, 0);
+ estimateCUPropagate(frames, averageDuration, p0, p1, i, 0); // last argument is 'referenced': 0 for these frames
+ }
+ i--;
+ }
+ estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1); // middle is propagated with referenced = 1
+ }
+ else
+ {
+ while (i > curnonb) // each frame between curnonb and lastnonb, with those two as p0/p1
+ {
+ estimateFrameCost(curnonb, lastnonb, i, 0);
+ estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0); // referenced = 0
+ i--;
+ }
+ }
+ estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1); // finally propagate lastnonb itself, with referenced = 1
+ lastnonb = curnonb; // the next span ends where this one began
+ }
+
+ if (!cfg->param.lookaheadDepth)
+ {
+ estimateFrameCost(0, lastnonb, lastnonb, 0);
+ estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
+ std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); // second pointer swap, mirroring the one done before the loop
+ }
+
+ cuTreeFinish(frames[lastnonb], averageDuration, lastnonb); // per the commit description, cuTreeFinish updates qpOffset from the accumulated costs
+ if (cfg->param.bBPyramid && bframes > 1 /* && !h->param.rc.i_vbv_buffer_size */)
+ cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0); // also finish the middle frame of the last processed span
+}
+
+void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
+{
+ uint16_t *refCosts[2] = {frames[p0]->propagateCost, frames[p1]->propagateCost};
+ int distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
+ int bipredWeight = cfg->param.bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
+ MV *mvs[2] = {frames[b]->lowresMvs[0][b - p0 -1], frames[b]->lowresMvs[1][p1 - b - 1]};
+ int bipredWeights[2] = {bipredWeight, 64 - bipredWeight};
+ memset(scratch, 0, widthInCU * sizeof(int));
+
+ uint16_t *propagate_cost = frames[b]->propagateCost;
+
+ x265_emms();
+ double fpsFactor = CLIP_DURATION(1.0 / cfg->param.frameRate) / CLIP_DURATION(averageDuration);
+
+ /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */
+ if (!referenced)
+ memset(frames[b]->propagateCost, 0, widthInCU * sizeof(uint16_t));
+
+ uint16_t StrideInCU = (uint16_t)widthInCU;
+ for (uint16_t block_y = 0; block_y < heightInCU; block_y += 16)
+ {
+ int cuIndex = block_y * StrideInCU;
+ /* TODO This function go into ASM */
+ estimateCUPropagateCost(scratch, propagate_cost,
+ frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
+ frames[b]->invQscaleFactor + cuIndex, &fpsFactor, widthInCU);
+
+ if (referenced)
+ propagate_cost += widthInCU;
+ for (uint16_t block_x = 0; block_x < widthInCU; block_x += 16, cuIndex++)
+ {
+ int propagate_amount = scratch[block_x];
+ /* Don't propagate for an intra block. */
+ if (propagate_amount > 0)
+ {
+ /* Access width-2 bitfield. */
+ int lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
+ /* Follow the MVs to the previous frame(s). */
+ for (uint16_t list = 0; list < 2; list++)
+ if ((lists_used >> list) & 1)
+ {
+#define CLIP_ADD(s, x) (s) = X265_MIN((s) + (x),(1 << 16) - 1)
+ uint16_t listamount = (uint16_t)propagate_amount;
+ /* Apply bipred weighting. */
+ if (lists_used == 3)
+ listamount = (uint16_t)(listamount * bipredWeights[list] + 32) >> 6;
+
More information about the x265-commits
mailing list