[x265] [PATCH] slicetype: CuTree Implementation for AQ RateControl
Gopu Govindaswamy
gopu at multicorewareinc.com
Mon Dec 2 07:18:19 CET 2013
# HG changeset patch
# User Gopu Govindaswamy <gopu at multicorewareinc.com>
# Date 1385965090 -19800
# Node ID fbfc3fa2141d942c3a7f04429f3124e16712658f
# Parent e83550d5f10d7cb950db34cbb96f476ce8b0ab2f
slicetype: CuTree Implementation for AQ RateControl
Added the following methods to slicetype for the CuTree implementation:
1. cuTree - entry point for CuTree
2. estimateCUPropagate and estimateCUPropagateCost - calculate the propagate cost of each CU
3. cuTreeFinish - update qpOffset using the precomputed propagate cost, weightedCostDelta and the lookahead costs
Added a cuTree option to param->rc; it is disabled by default because cuTree is still under construction.
diff -r e83550d5f10d -r fbfc3fa2141d source/common/common.cpp
--- a/source/common/common.cpp Sun Dec 01 19:44:27 2013 -0600
+++ b/source/common/common.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -219,6 +219,7 @@
param->rc.qp = 32;
param->rc.aqMode = X265_AQ_NONE;
param->rc.aqStrength = 1.0;
+ param->rc.cuTree = 0;
/* Quality Measurement Metrics */
param->bEnablePsnr = 1;
diff -r e83550d5f10d -r fbfc3fa2141d source/common/lowres.cpp
--- a/source/common/lowres.cpp Sun Dec 01 19:44:27 2013 -0600
+++ b/source/common/lowres.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -47,9 +47,11 @@
{
qpAqOffset = (double*)x265_malloc(sizeof(double) * cuCount);
invQscaleFactor = (int*)x265_malloc(sizeof(int) * cuCount);
- if (!qpAqOffset || !invQscaleFactor)
+ qpOffset = (double*)x265_malloc(sizeof(double) * cuCount);
+ if (!qpAqOffset || !invQscaleFactor || !qpOffset)
*aqMode = 0;
}
+ propagateCost = (uint16_t*)x265_malloc(sizeof(uint16_t) * cuCount);
/* allocate lowres buffers */
for (int i = 0; i < 4; i++)
@@ -111,6 +113,8 @@
X265_FREE(qpAqOffset);
X265_FREE(invQscaleFactor);
+ X265_FREE(qpOffset);
+ X265_FREE(propagateCost);
}
// (re) initialize lowres state
diff -r e83550d5f10d -r fbfc3fa2141d source/common/lowres.h
--- a/source/common/lowres.h Sun Dec 01 19:44:27 2013 -0600
+++ b/source/common/lowres.h Mon Dec 02 11:48:10 2013 +0530
@@ -126,9 +126,13 @@
/* rate control / adaptive quant data */
double* qpAqOffset; // qp Aq offset values for each Cu
int* invQscaleFactor; // qScale values for qp Aq Offsets
+ double* qpOffset;
uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
uint64_t wp_sum[3];
+ uint16_t* propagateCost;
+ double weightedCostDelta[X265_BFRAME_MAX+2];
+
void create(TComPicYuv *orig, int bframes, int32_t *aqMode);
void destroy(int bframes);
void init(TComPicYuv *orig, int poc, int sliceType, int bframes);
diff -r e83550d5f10d -r fbfc3fa2141d source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Sun Dec 01 19:44:27 2013 -0600
+++ b/source/encoder/frameencoder.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -1117,7 +1117,11 @@
{
for (int w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++)
{
- qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
+ qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
+ if (pic->getSlice()->isReferenced() && m_cfg->param.rc.cuTree && m_cfg->param.rc.aqMode)
+ qp_offset += pic->m_lowres.qpOffset[block_x + w + (block_y * maxBlockCols)];
+ else
+ qp_offset += pic->m_lowres.qpAqOffset[block_x + w + (block_y * maxBlockCols)];
cnt++;
}
}
diff -r e83550d5f10d -r fbfc3fa2141d source/encoder/ratecontrol.h
--- a/source/encoder/ratecontrol.h Sun Dec 01 19:44:27 2013 -0600
+++ b/source/encoder/ratecontrol.h Mon Dec 02 11:48:10 2013 +0530
@@ -36,6 +36,14 @@
class TComPic;
class TEncCfg;
+#define BASE_FRAME_DURATION 0.04
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION 1.00
+#define MIN_FRAME_DURATION 0.01
+
+#define CLIP_DURATION(f) Clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
+
struct RateControlEntry
{
int sliceType;
diff -r e83550d5f10d -r fbfc3fa2141d source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Sun Dec 01 19:44:27 2013 -0600
+++ b/source/encoder/slicetype.cpp Mon Dec 02 11:48:10 2013 +0530
@@ -31,6 +31,7 @@
#include "slicetype.h"
#include "motion.h"
#include "mv.h"
+#include "ratecontrol.h"
#define LOWRES_COST_MASK ((1 << 14) - 1)
#define LOWRES_COST_SHIFT 14
@@ -86,6 +87,8 @@
lhrows[i].widthInCU = widthInCU;
lhrows[i].heightInCU = heightInCU;
}
+
+ scratch = (int*)x265_malloc(widthInCU * sizeof(int));
}
Lookahead::~Lookahead()
@@ -127,6 +130,7 @@
{
x265_free(wbuffer[i]);
}
+ X265_FREE(scratch);
}
void Lookahead::addPicture(TComPic *pic, int sliceType)
@@ -265,6 +269,7 @@
fenc = frames[b];
ref = frames[p0];
+ int deltaIndex = fenc->frameNum - ref->frameNum;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
const float epsilon = 1.f / 128.f;
@@ -320,6 +325,8 @@
else
{
SET_WEIGHT(w, 1, minscale, mindenom, minoff);
+ // set weighted delta cost
+ fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
int offset = w.inputOffset << (X265_DEPTH - 8);
int scale = w.inputWeight;
@@ -912,7 +919,8 @@
if (!framecnt)
{
- // TODO: mb-tree
+ if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+ cuTree(frames, 0, bKeyframe);
return;
}
@@ -1045,7 +1053,9 @@
num_bframes = 0;
}
- // TODO if rc.b_mb_tree Enabled the need to call x264_macroblock_tree currently Ignored the call
+ if (cfg->param.rc.cuTree && cfg->param.rc.aqMode)
+ cuTree(frames, X265_MIN(num_frames, cfg->param.keyframeMax), bKeyframe);
+
// if (!cfg->param.bIntraRefresh)
for (int j = keyint_limit + 1; j <= num_frames; j += cfg->param.keyframeMax)
{
@@ -1262,3 +1272,239 @@
rowsCompleted = true;
}
}
+
+/* Entry point of CU-tree: walk the lookahead frames from the last non-B frame
+ * back towards the start, and for each mini-GOP call estimateFrameCost() and
+ * estimateCUPropagate() so that propagateCost is accumulated on the reference
+ * frames. cuTreeFinish() then turns the accumulated cost into qpOffset.
+ *
+ * frames    - lookahead frame list; frames[0..numframes] are read
+ * numframes - index of the last frame to consider
+ * bintra    - true when frames[0] is handled as an intra frame */
+void Lookahead::cuTree(Lowres **frames, int numframes, bool bintra)
+{
+    const int firstFrame = bintra ? 0 : 1;
+    int prevRef;
+    int curRef = 1;
+    int numB = 0;
+
+    x265_emms();
+
+    /* every frame has the same duration here, but the average is still formed
+     * by summation so the value handed to the helpers is unchanged */
+    const double frameDuration = 1.0 / cfg->param.frameRate;
+    double totalDuration = 0.0;
+    for (int j = 0; j <= numframes; j++)
+        totalDuration += frameDuration;
+    const double averageDuration = totalDuration / (numframes + 1);
+
+    int i = numframes;
+    const int cuCount = widthInCU * heightInCU;
+    const size_t costBytes = cuCount * sizeof(uint16_t);
+
+    if (bintra)
+        estimateFrameCost(0, 0, 0, 0);
+
+    /* skip trailing B frames to find the last non-B frame */
+    for (; i > 0 && frames[i]->sliceType == X265_TYPE_B; i--)
+    {
+    }
+    prevRef = i;
+
+    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
+     * be applied to the end of a lookahead buffer of any size. However, it's most needed when
+     * lookahead=0, so that's what's currently implemented. */
+    if (cfg->param.lookaheadDepth)
+    {
+        if (prevRef < firstFrame)
+            return;
+        memset(frames[prevRef]->propagateCost, 0, costBytes);
+    }
+    else
+    {
+        if (bintra)
+        {
+            memset(frames[0]->propagateCost, 0, costBytes);
+            memcpy(frames[0]->qpOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
+            return;
+        }
+        std::swap(frames[prevRef]->propagateCost, frames[0]->propagateCost);
+        memset(frames[0]->propagateCost, 0, costBytes);
+    }
+
+    while (i > firstFrame)
+    {
+        i--;
+
+        /* locate the previous non-B frame */
+        curRef = i;
+        while (frames[curRef]->sliceType == X265_TYPE_B && curRef > 0)
+            curRef--;
+        if (curRef < firstFrame)
+            break;
+
+        estimateFrameCost(curRef, prevRef, prevRef, 0);
+        memset(frames[curRef]->propagateCost, 0, costBytes);
+        numB = prevRef - curRef - 1;
+
+        if (cfg->param.bBPyramid && numB > 1)
+        {
+            /* the middle B frame is referenced by the B frames on either side of it */
+            const int middle = (numB + 1) / 2 + curRef;
+            estimateFrameCost(curRef, prevRef, middle, 0);
+            memset(frames[middle]->propagateCost, 0, costBytes);
+            for (; i > curRef; i--)
+            {
+                if (i == middle)
+                    continue;
+                const int p0 = i > middle ? middle : curRef;
+                const int p1 = i < middle ? middle : prevRef;
+                estimateFrameCost(p0, p1, i, 0);
+                estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
+            }
+            estimateCUPropagate(frames, averageDuration, curRef, prevRef, middle, 1);
+        }
+        else
+        {
+            for (; i > curRef; i--)
+            {
+                estimateFrameCost(curRef, prevRef, i, 0);
+                estimateCUPropagate(frames, averageDuration, curRef, prevRef, i, 0);
+            }
+        }
+
+        estimateCUPropagate(frames, averageDuration, curRef, prevRef, prevRef, 1);
+        prevRef = curRef;
+    }
+
+    if (!cfg->param.lookaheadDepth)
+    {
+        estimateFrameCost(0, prevRef, prevRef, 0);
+        estimateCUPropagate(frames, averageDuration, 0, prevRef, prevRef, 1);
+        std::swap(frames[prevRef]->propagateCost, frames[0]->propagateCost);
+    }
+
+    cuTreeFinish(frames[prevRef], averageDuration, prevRef);
+    if (cfg->param.bBPyramid && numB > 1 /* && !h->param.rc.i_vbv_buffer_size */)
+        cuTreeFinish(frames[prevRef + (numB + 1) / 2], averageDuration, 0);
+}
+
+/* Push the propagate cost of frame b onto its references p0 (list 0) and p1 (list 1).
+ *
+ * For each CU row, estimateCUPropagateCost() writes into `scratch` the amount every
+ * CU of that row passes on. Each positive amount is then added to the propagateCost
+ * of the reference frame(s) named by the list bits of lowresCosts, at the position
+ * the CU's lowres motion vector points to, split across the (up to) four CUs the
+ * displaced block overlaps.
+ *
+ * frames          - lookahead frame list
+ * averageDuration - average frame duration, used to scale the cost by frame duration
+ * p0, p1, b       - indices into frames of the two references and the current frame
+ * referenced      - non-zero when frame b's own propagateCost is read as input */
+void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
+{
+#define CLIP_ADD(s, x) (s) = X265_MIN((s) + (x),(1 << 16) - 1)
+    uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
+    int distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
+    int bipredWeight = cfg->param.bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
+    MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] };
+    int bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
+    memset(scratch, 0, widthInCU * sizeof(int));
+
+    uint16_t *propagate_cost = frames[b]->propagateCost;
+
+    x265_emms();
+    double fpsFactor = CLIP_DURATION(1.0 / cfg->param.frameRate) / CLIP_DURATION(averageDuration);
+
+    /* For non-referenced frames the source costs are always zero, so just memset one row and re-use it. */
+    if (!referenced)
+        memset(frames[b]->propagateCost, 0, widthInCU * sizeof(uint16_t));
+
+    /* Coordinates are plain ints: motion vectors may be negative and the
+     * bounds checks below rely on seeing negative CU positions. */
+    const int strideInCU = widthInCU;
+    for (int block_y = 0; block_y < heightInCU; block_y++)
+    {
+        int cuIndex = block_y * strideInCU;
+        /* TODO This function go into ASM */
+        estimateCUPropagateCost(scratch, propagate_cost,
+                                frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
+                                frames[b]->invQscaleFactor + cuIndex, &fpsFactor, widthInCU);
+
+        if (referenced)
+            propagate_cost += widthInCU;
+
+        /* scratch holds one entry per CU of this row, so walk the row CU by CU */
+        for (int block_x = 0; block_x < widthInCU; block_x++, cuIndex++)
+        {
+            int propagate_amount = scratch[block_x];
+            /* Don't propagate for an intra block. */
+            if (propagate_amount <= 0)
+                continue;
+
+            /* Access width-2 bitfield. */
+            int lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
+            /* Follow the MVs to the previous frame(s). */
+            for (int list = 0; list < 2; list++)
+            {
+                if (!((lists_used >> list) & 1))
+                    continue;
+
+                int listamount = propagate_amount;
+                /* Apply bipred weighting (shift the full-width product, then narrow). */
+                if (lists_used == 3)
+                    listamount = (listamount * bipredWeights[list] + 32) >> 6;
+
+                /* motion vector of this CU, not of the first CU in the frame */
+                MV &mv = mvs[list][cuIndex];
+
+                /* Early termination for simple case of mv0. */
+                if (!mv.notZero())
+                {
+                    CLIP_ADD(refCosts[list][cuIndex], listamount);
+                    continue;
+                }
+
+                int x = mv.x;
+                int y = mv.y;
+                int cux = (x >> 5) + block_x;
+                int cuy = (y >> 5) + block_y;
+                int idx0 = cux + cuy * strideInCU;
+                int idx1 = idx0 + 1;
+                int idx2 = idx0 + strideInCU;
+                int idx3 = idx0 + strideInCU + 1;
+                x &= 31;
+                y &= 31;
+                int idx0weight = (32 - y) * (32 - x);
+                int idx1weight = (32 - y) * x;
+                int idx2weight = y * (32 - x);
+                int idx3weight = y * x;
+
+                /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
+                 * be counted. */
+                if (cux < widthInCU - 1 && cuy < heightInCU - 1 && cux >= 0 && cuy >= 0)
+                {
+                    CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
+                    CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
+                    CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
+                    CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
+                }
+                else /* Check offsets individually */
+                {
+                    if (cux < widthInCU && cuy < heightInCU && cux >= 0 && cuy >= 0)
+                        CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
+                    if (cux + 1 < widthInCU && cuy < heightInCU && cux + 1 >= 0 && cuy >= 0)
+                        CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
+                    if (cux < widthInCU && cuy + 1 < heightInCU && cux >= 0 && cuy + 1 >= 0)
+                        CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
+                    if (cux + 1 < widthInCU && cuy + 1 < heightInCU && cux + 1 >= 0 && cuy + 1 >= 0)
+                        CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
+                }
+            }
+        }
+    }
+
+    /* The early finish only applies when a lookahead exists; the original tested
+     * logLevel here, which is unrelated to rate control. */
+    if (/*h->param.rc.i_vbv_buffer_size &&*/ cfg->param.lookaheadDepth && referenced)
+        cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
+}
+
+/* Convert the propagate cost accumulated on `frame` into per-CU QP offsets.
+ *
+ * For every CU, qpOffset = qpAqOffset - strength * log2_ratio, where log2_ratio is
+ * log2((intra + propagate) / intra) plus an optional weighted-prediction term.
+ *
+ * frame           - frame whose qpOffset array is written
+ * averageDuration - average frame duration, used to scale propagateCost
+ * ref0Distance    - when non-zero, weightedCostDelta[ref0Distance - 1] is applied */
+void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
+{
+    /* 8.8 fixed-point ratio, rounded to nearest: plain truncation would turn a
+     * ratio of 0.99999... (floating-point noise) into 255 instead of 256 */
+    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION(1.0 / cfg->param.frameRate) * 256 + 0.5);
+    double weightdelta = 0.0;
+
+    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
+        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
+
+    /* Allow the strength to be adjusted via qcompress, since the two
+     * concepts are very similar. */
+    int cuCount = widthInCU * heightInCU;
+    double strength = 5.0f * (1.0f - cfg->param.rc.qCompress);
+    for (int cuIndex = 0; cuIndex < cuCount; cuIndex++)
+    {
+        int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
+        if (intracost)
+        {
+            int propagate_cost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
+            double log2_ratio = X265_LOG2(intracost + propagate_cost) - X265_LOG2(intracost) + weightdelta;
+            frame->qpOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio;
+        }
+        else
+        {
+            /* nothing to scale against: fall back to the plain AQ offset so the
+             * entry is never left holding uninitialised memory */
+            frame->qpOffset[cuIndex] = frame->qpAqOffset[cuIndex];
+        }
+    }
+}
+
+/* Estimate the total amount of influence on future quality that could be had if we
+ * were to improve the reference samples used to inter predict any given macroblock.
+ *
+ * dst         - output, one propagate amount per CU (len entries)
+ * propagateIn - propagate cost already accumulated on each CU
+ * intraCosts  - intra cost of each CU
+ * interCosts  - inter cost of each CU; only the LOWRES_COST_MASK bits are the cost
+ * invQscales  - per-CU inverse qscale factor applied to the intra cost
+ * fpsFactor   - pointer to the frame-duration scale factor
+ * len         - number of CUs to process */
+void Lookahead::estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len)
+{
+    const double fps = *fpsFactor / 256;
+    for (int i = 0; i < len; i++)
+    {
+        /* the intra cost is the denominator below; without it the quotient is
+         * NaN/inf and converting that to int is undefined, so propagate nothing */
+        if (intraCosts[i] <= 0)
+        {
+            dst[i] = 0;
+            continue;
+        }
+
+        /* widen before multiplying so the product cannot overflow int32 */
+        double intraCost = (double)intraCosts[i] * invQscales[i];
+        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
+        double propagateNum = (double)intraCosts[i] - (interCosts[i] & LOWRES_COST_MASK);
+        double propagateDenom = (double)intraCosts[i];
+        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
+    }
+}
diff -r e83550d5f10d -r fbfc3fa2141d source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Sun Dec 01 19:44:27 2013 -0600
+++ b/source/encoder/slicetype.h Mon Dec 02 11:48:10 2013 +0530
@@ -92,6 +92,8 @@
int curb, curp0, curp1;
bool rowsCompleted;
+ int *scratch; // temp buffer
+
LookaheadRow* lhrows;
Lookahead(TEncCfg *, ThreadPool *);
@@ -116,6 +118,11 @@
void weightsAnalyse(int b, int p0);
uint32_t weightCostLuma(int b, pixel *src, wpScalingParam *w);
+
+ void cuTree(Lowres **frames, int numframes, bool bintra);
+ void estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
+ void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len);
+ void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
};
}
diff -r e83550d5f10d -r fbfc3fa2141d source/x265.h
--- a/source/x265.h Sun Dec 01 19:44:27 2013 -0600
+++ b/source/x265.h Mon Dec 02 11:48:10 2013 +0530
@@ -597,6 +597,7 @@
int vbvMaxBitrate;
int vbvBufferSize;
double vbvBufferInit;
+ int cuTree;
} rc;
} x265_param;
More information about the x265-devel
mailing list