[x265] [PATCH][MASTER] Support for Decoding-Energy-Rate-Distortion Optimization (DERDO)
Christian Herglotz
christian.herglotz at fau.de
Thu Oct 22 15:07:55 CEST 2020
--
Dr.-Ing. Christian Herglotz
Chair of Multimedia Communications and Signal Processing
Friedrich-Alexander University Erlangen-Nürnberg
Cauerstr. 7, D-91058 Erlangen, Germany
Tel. +49 9131 85-27117
-------------- next part --------------
From 969268805cbd2571856a0da6ee286b623143b4da Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:21:09 +0200
Subject: [PATCH 04/14] DERDO: Add new variable for decoding energy to cudata
and define function to return the motion vector.
---
source/common/cudata.cpp | 2 ++
source/common/cudata.h | 3 +++
2 files changed, 5 insertions(+)
diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp
index 19281dee2..29d961f01 100644
--- a/source/common/cudata.cpp
+++ b/source/common/cudata.cpp
@@ -273,6 +273,8 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, const x26
m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
m_trCoeff[1] = m_trCoeff[0] + sizeL;
m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+ m_dpartDecEnergy = 0;
+
for (int i = 0; i < 3; i++)
m_fAc_den[i] = m_fDc_den[i] = 0;
}
diff --git a/source/common/cudata.h b/source/common/cudata.h
index 8397f0568..5f2a2c5a6 100644
--- a/source/common/cudata.h
+++ b/source/common/cudata.h
@@ -96,6 +96,7 @@ struct MVField
{
MV mv;
int refIdx;
+ MV getMv () const {return mv;}
};
// Structure that keeps the neighbour's MV information.
@@ -229,6 +230,8 @@ public:
uint64_t* m_collectCURd;
uint32_t* m_collectCUVariance;
uint32_t* m_collectCUCount;
+ uint32_t m_dpartDecEnergy;
+
CUData();
--
2.20.1.windows.1
-------------- next part --------------
From b51e801b5479cf329f20a98fc963f74db2e7a473 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:24:04 +0200
Subject: [PATCH 05/14] DERDO: Counter for coefficient features in quant_c
function
---
source/common/dct.cpp | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/source/common/dct.cpp b/source/common/dct.cpp
index b102b6e31..2777c5057 100644
--- a/source/common/dct.cpp
+++ b/source/common/dct.cpp
@@ -661,7 +661,7 @@ static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCo
}
}
-static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff, bool countVal, uint32_t& ldVal)
{
X265_CHECK(qBits >= 8, "qBits less than 8\n");
X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
@@ -676,8 +676,19 @@ static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t*
int tmplevel = abs(level) * quantCoeff[blockpos];
level = ((tmplevel + add) >> qBits);
deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
- if (level)
- ++numSig;
+ if (level)
+ {
+ if (countVal)
+ {
+ int absCoeff = level;
+ for (; absCoeff > 1; absCoeff = absCoeff >> 1) {
+ if (absCoeff < 4 && (absCoeff & 1)) //consider second last bit
+ ldVal = ldVal + 1; //middle between two logs for more accuracy, using ~ double the value and divide by two later
+ ldVal = ldVal + 2;
+ }
+ }
+ ++numSig;
+ }
level *= sign;
qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
}
--
2.20.1.windows.1
-------------- next part --------------
From f38783ee6805b182e5fcdf58e136f453b9508b29 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:25:58 +0200
Subject: [PATCH 06/14] DERDO: Add functions to return absolute values of
motion vectors.
---
source/common/mv.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/source/common/mv.h b/source/common/mv.h
index 191090cb9..6bcf940cb 100644
--- a/source/common/mv.h
+++ b/source/common/mv.h
@@ -47,6 +47,8 @@ public:
MV() {}
MV(int64_t w) : word(w) {}
MV(int32_t _x, int32_t _y) : x(_x), y(_y) {}
+ int getAbsHor () const { return abs( x ); }
+ int getAbsVer () const { return abs( y ); }
MV& operator =(uint64_t w) { word = w; return *this; }
--
2.20.1.windows.1
-------------- next part --------------
From 251e1cd38f87773d8012923b2447b3a69f21afe3 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:30:20 +0200
Subject: [PATCH 07/14] DERDO: Adding flags and tuning definitions in param.cpp
---
source/common/param.cpp | 51 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 50 insertions(+), 1 deletion(-)
diff --git a/source/common/param.cpp b/source/common/param.cpp
index 47a7a7c47..c653a5706 100755
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -258,6 +258,9 @@ void x265_param_default(x265_param* param)
param->minVbvFullness = 50;
param->maxVbvFullness = 80;
param->rc.rfConstant = 28;
+
+ param->derdo = false;
+
param->rc.bitrate = 0;
param->rc.qCompress = 0.6;
param->rc.ipFactor = 1.4f;
@@ -323,6 +326,38 @@ void x265_param_default(x265_param* param)
param->confWinRightOffset = 0;
param->confWinBottomOffset = 0;
+ //Intra Prediction decoding energies
+ param->sSpecificDecEnergies.e_intra32x32 = 416>>3;
+ param->sSpecificDecEnergies.e_intra16x16 = 134 >> 3;
+ param->sSpecificDecEnergies.e_intra8x8 = 57 >> 3;
+ param->sSpecificDecEnergies.e_intra4x4 = 20 >> 3;
+ // Transform coding decoding energies
+ param->sSpecificDecEnergies.e_trans32x32 = 1049 >> 3;
+ param->sSpecificDecEnergies.e_trans16x16 = 140 >> 3;
+ param->sSpecificDecEnergies.e_trans8x8 = 27 >> 3;
+ param->sSpecificDecEnergies.e_trans4x4 = 16 >> 3;
+ param->sSpecificDecEnergies.e_IntraCUs = 25 >> 3;
+ // Coefficient coding decoding energies
+ param->sSpecificDecEnergies.e_coeff = 1;
+ param->sSpecificDecEnergies.e_val = 1;
+ // Skipped CU decoding energies
+ param->sSpecificDecEnergies.e_PBslice = 187 >> 3;
+ param->sSpecificDecEnergies.e_skip64x64 = 1650 >> 3;
+ param->sSpecificDecEnergies.e_skip8x82x32 = 427 >> 3;
+ param->sSpecificDecEnergies.e_skip16x16 = 127 >> 3;
+ param->sSpecificDecEnergies.e_skip8x8 = 40 >> 3;
+ // Inter coded CU decoding energies
+ param->sSpecificDecEnergies.e_inter64x64 = 1880 >> 3;
+ param->sSpecificDecEnergies.e_inter32x32 = 471 >> 3;
+ param->sSpecificDecEnergies.e_inter32x326x16 = 136 >> 3;
+ param->sSpecificDecEnergies.e_inter8x8 = 49 >> 3;
+ param->sSpecificDecEnergies.e_fracpel = 1;
+ param->sSpecificDecEnergies.e_bi = 1; // number of bipredicted SCUs
+ // In-loop filter decoding energies
+ param->sSpecificDecEnergies.e_Bs = 1;
+ param->sSpecificDecEnergies.e_SAOY = 1175 >> 3;
+ param->sSpecificDecEnergies.e_SAOC = 369 >> 3;
+
param->bEmitVUITimingInfo = 1;
param->bEmitVUIHRDInfo = 1;
param->bOptQpPPS = 0;
@@ -621,6 +656,17 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
else if (!strcmp(tune, "vmaf")) /*Adding vmaf for x265 + SVT-HEVC integration support*/
{
/*vmaf is under development, currently x265 won't support vmaf*/
+ }
+ else if (!strcmp(tune, "eedecode")) /*Adding energy efficient decoding tuning by LMS, Erlangen*/
+ {
+ param->bEnableLoopFilter = 0;
+ param->bEnableWeightedPred = 0;
+ param->bEnableWeightedBiPred = 0;
+ param->bIntraInBFrames = 0;
+ param->derdo = true;
+ param->rc.aqStrength = 0.0;
+ param->psyRd = 0.0;
+ param->psyRdoq = 0.0;
}
else
return -1;
@@ -1014,6 +1060,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value)
else
p->psyRdoq = 0.0;
}
+ OPT("derdo") p->derdo = atobool(value);
OPT("rd-refine") p->bEnableRdRefine = atobool(value);
OPT("signhide") p->bEnableSignHiding = atobool(value);
OPT("b-intra") p->bIntraInBFrames = atobool(value);
@@ -1192,7 +1239,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value)
OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
- OPT("uhd-bd") p->uhdBluray = atobool(value);
+ //OPT("uhd-bd") p->uhdBluray = atobool(value);
else
bExtraParams = true;
@@ -2127,6 +2174,7 @@ char *x265_param2string(x265_param* p, int padx, int pady)
s += sprintf(s, " rdpenalty=%d", p->rdPenalty);
s += sprintf(s, " psy-rd=%.2f", p->psyRd);
s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq);
+ BOOL(p->derdo, "derdo");
BOOL(p->bEnableRdRefine, "rd-refine");
BOOL(p->bLossless, "lossless");
s += sprintf(s, " cbqpoffs=%d", p->cbQpOffset);
@@ -2510,6 +2558,7 @@ void x265_copy_params(x265_param* dst, x265_param* src)
dst->rc.bEnableConstVbv = src->rc.bEnableConstVbv;
dst->rc.hevcAq = src->rc.hevcAq;
dst->rc.qpAdaptationRange = src->rc.qpAdaptationRange;
+ dst->derdo = src->derdo;
dst->vui.aspectRatioIdc = src->vui.aspectRatioIdc;
dst->vui.sarWidth = src->vui.sarWidth;
--
2.20.1.windows.1
-------------- next part --------------
From 7f8620298ed650eddc3044fdd332f82eba00183c Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:31:37 +0200
Subject: [PATCH 08/14] DERDO: Primitive declaration for coefficient counting
in primitives.h
---
source/common/primitives.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 0b52f84de..34a2774cb 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -156,7 +156,7 @@ typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t*
typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff, bool countLdVal, uint32_t& ldVal);
typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
--
2.20.1.windows.1
-------------- next part --------------
From f242757353b05456a4d96e578f2b5def4f5e1b7a Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:33:35 +0200
Subject: [PATCH 09/14] DERDO: Definitions and corrected declarations for
coefficient counting in quant.h and quant.cpp
---
source/common/quant.cpp | 21 +++++++++++++++------
source/common/quant.h | 8 ++++----
2 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/source/common/quant.cpp b/source/common/quant.cpp
index 93462f51a..3197df414 100644
--- a/source/common/quant.cpp
+++ b/source/common/quant.cpp
@@ -395,7 +395,7 @@ uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSi
}
uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
- coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
+ coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip, bool countLdVal, uint32_t& ldVal)
{
const uint32_t sizeIdx = log2TrSize - 2;
@@ -452,7 +452,8 @@ uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencS
}
if (m_rdoqLevel)
- return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
+ return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy, countLdVal, ldVal);
+
else
{
int deltaU[32 * 32];
@@ -466,7 +467,7 @@ uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencS
int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
int numCoeff = 1 << (log2TrSize * 2);
- uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff);
+ uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff, countLdVal, ldVal);
if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled)
{
@@ -607,7 +608,7 @@ void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiSt
/* Rate distortion optimized quantization for entropy coding engines using
* probability models like CABAC */
template<uint32_t log2TrSize>
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy, bool countLdVal, uint32_t& ldVal)
{
const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
@@ -1261,7 +1262,15 @@ uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, ui
int blkPos = codeParams.scan[pos];
int level = dstCoeff[blkPos];
numSig += (level != 0);
-
+ if (level && countLdVal)
+ {
+ int absCoeff = abs(level);
+ for (; absCoeff > 1; absCoeff = absCoeff >> 1) {
+ if (absCoeff<4 && (absCoeff & 1)) //consider second last bit
+ ldVal = ldVal + 1; //middle between two logs for more accuracy, using ~ double the value and divide by two later
+ ldVal = ldVal + 2;
+ }
+ }
uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
}
@@ -1418,7 +1427,7 @@ uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, ui
lastCG = 0;
}
}
-
+ ldVal >>= 1;
return numSig;
}
diff --git a/source/common/quant.h b/source/common/quant.h
index 21ec217db..4bcfd2b1c 100644
--- a/source/common/quant.h
+++ b/source/common/quant.h
@@ -106,8 +106,8 @@ public:
/* CU setup */
void setQPforQuant(const CUData& ctu, int qp);
- uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
- uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
+ uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
+ uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip, bool countLdVal, uint32_t& ldVal);
void invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
@@ -155,10 +155,10 @@ protected:
uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
template<uint32_t log2TrSize>
- uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+ uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy, bool countLdVal, uint32_t& ldVal);
public:
- typedef uint32_t (Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+ typedef uint32_t(Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy, bool countLdVal, uint32_t& ldVal);
private:
static rdoQuant_t rdoQuant_func[NUM_CU_DEPTH];
--
2.20.1.windows.1
-------------- next part --------------
From 6ece16d0c5e0ec519ef6218035071b28e8a3e874 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:35:48 +0200
Subject: [PATCH 10/14] DERDO: Energy estimation and cost calculation functions
for analysis.cpp
---
source/encoder/analysis.cpp | 111 ++++++++++++++++++++++++++++++++++--
1 file changed, 106 insertions(+), 5 deletions(-)
diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp
index aabf386ca..0f49ad29d 100644
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -1382,12 +1382,18 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom&
}
nextContext->store(splitPred->contexts);
+ if(m_rdCost.m_decEnergyRD)
+ splitPred->sa8dBits += splitPred->decEnergy;
+
if (mightNotSplit)
addSplitFlagCost(*splitPred, cuGeom.depth);
else if (m_param->rdLevel > 1)
updateModeCost(*splitPred);
else
splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
+
+ if(m_rdCost.m_decEnergyRD)
+ splitPred->sa8dBits -= splitPred->decEnergy;
}
/* If analysis mode is simple do not Evaluate other modes */
if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
@@ -2828,7 +2834,32 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
}
- tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
+
+ if(m_rdCost.m_decEnergyRD)
+ {
+ bool biPred = false;
+ if (candDir[i] == 3) // consider biprediction
+ {
+ tempPred->decEnergy += this->m_param->sSpecificDecEnergies.e_bi * ((pu.width*pu.height)>>4);
+ biPred = true;
+ }
+
+ // Fractional pel calculation
+ int decEnergyL0 = this->calcFracpelDecodingEnergySearch(candMvField[i][0].mv, pu.width, pu.height);
+ int decEnergyL1 = this->calcFracpelDecodingEnergySearch(candMvField[i][1].mv, pu.width, pu.height);
+
+ if (biPred)
+ tempPred->decEnergy += decEnergyL0 + decEnergyL1;
+ else if (candDir[i] & 1)
+ tempPred->decEnergy += decEnergyL0;
+ else
+ tempPred->decEnergy += decEnergyL1;
+
+ tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits+tempPred->decEnergy);
+ }
+ else
+ tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
+
if (tempPred->sa8dCost < bestPred->sa8dCost)
{
@@ -2972,6 +3003,25 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
{
/* if the best prediction has CBF (not a skip) then try merge with residual */
+ /* Code for DERDO*/
+ bool biPred = false;
+ if (candDir[i] == 3) // consider biprediction
+ {
+ tempPred->decEnergy += this->m_param->sSpecificDecEnergies.e_bi * ((pu.width*pu.height)>>4);
+ biPred = true;
+ }
+
+ // Fractional pel calculation
+ int decEnergyL0 = this->calcFracpelDecodingEnergySearch(candMvField[i][0].mv, pu.width, pu.height);
+ int decEnergyL1 = this->calcFracpelDecodingEnergySearch(candMvField[i][1].mv, pu.width, pu.height);
+
+ if (biPred)
+ tempPred->decEnergy += decEnergyL0 + decEnergyL1;
+ else if (candDir[i] & 1)
+ tempPred->decEnergy += decEnergyL0;
+ else
+ tempPred->decEnergy += decEnergyL1;
+ /* END code for DERDO*/
encodeResAndCalcRdInterCU(*tempPred, cuGeom);
hasCbf = tempPred->cu.getQtRootCbf(0);
foundCbf0Merge = !hasCbf;
@@ -2998,6 +3048,33 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
tempPred->predYuv.copyFromYuv(bestPred->predYuv);
}
+ /* Code for DERDO: add this merge candidate's estimated decoding energy to tempPred before the skip-CU RD evaluation below */
+ bool biPred = false;
+ if (candDir[i] == 3) // candDir == 3: candidate is bi-predicted, charge e_bi once per 16 samples of the PU
+ {
+ tempPred->decEnergy += this->m_param->sSpecificDecEnergies.e_bi * ((pu.width*pu.height)>>4);
+ biPred = true;
+ }
+ // Fractional pel calculation: one estimate per reference list, L0 from candMvField[i][0] and L1 from candMvField[i][1]
+ int decEnergyL0 = this->calcFracpelDecodingEnergySearch(candMvField[i][0].mv, pu.width, pu.height);
+ int decEnergyL1 = this->calcFracpelDecodingEnergySearch(candMvField[i][1].mv, pu.width, pu.height);
+ if (biPred)
+ tempPred->decEnergy += decEnergyL0 + decEnergyL1;
+ else if (candDir[i] & 1)
+ tempPred->decEnergy += decEnergyL0;
+ else
+ tempPred->decEnergy += decEnergyL1;
+ if (m_param->bEnableLoopFilter)
+ {
+ int nBorders = 0;
+ if (tempPred->cu.m_cuPelX != 0 && tempPred->cu.m_cuPelX % 8 == 0)
+ nBorders = 1;
+ if (tempPred->cu.m_cuPelY != 0 && tempPred->cu.m_cuPelY % 8 == 0)
+ nBorders = nBorders+1;
+ tempPred->decEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3-nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+ }
+ /* END code for DERDO*/
+
encodeResAndCalcRdSkipCU(*tempPred);
if (tempPred->rdCost < bestPred->rdCost)
@@ -3058,6 +3135,8 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
}
predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);
+ interMode.predDecEnergy = interMode.decEnergy;
+
/* predInterSearch sets interMode.sa8dBits */
const Yuv& fencYuv = *interMode.fencYuv;
Yuv& predYuv = interMode.predYuv;
@@ -3068,7 +3147,11 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
}
- interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
+
+ if(m_rdCost.m_decEnergyRD)
+ interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits+interMode.decEnergy);
+ else
+ interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
{
@@ -3123,6 +3206,7 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
}
predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
+ interMode.predDecEnergy = interMode.decEnergy;
/* predInterSearch sets interMode.sa8dBits, but this is ignored */
encodeResAndCalcRdInterCU(interMode, cuGeom);
@@ -3177,6 +3261,8 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom&
cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
cu.m_mergeFlag[0] = 0;
+ bidir2Nx2N.decEnergy = inter2Nx2N.predDecEnergy;
+
/* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
cu.setPUMv(0, bestME[0].mv, 0, 0);
cu.m_mvd[0][0] = bestME[0].mv - mvp0;
@@ -3186,6 +3272,9 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom&
PredictionUnit pu(cu, cuGeom, 0);
motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
+ bidir2Nx2N.decEnergy += ((pu.width*pu.height)>>4)*m_param->sSpecificDecEnergies.e_bi;
+
+
int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
@@ -3241,13 +3330,20 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom&
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
-
+ int zDecEnergy = ((pu.width*pu.height)>>4)*m_param->sSpecificDecEnergies.e_bi;
+ zDecEnergy += this->calcFracpelDecodingEnergySearch(bestME[0].mv, pu.width, pu.height );
+ zDecEnergy += this->calcFracpelDecodingEnergySearch(bestME[1].mv, pu.width, pu.height );
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);
uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
- zcost = zsa8d + m_rdCost.getCost(zbits);
+
+ if(m_rdCost.m_decEnergyRD)
+ zcost = zsa8d + m_rdCost.getCost(zbits+ zDecEnergy);
+ else
+ zcost = zsa8d + m_rdCost.getCost(zbits);
+
if (zcost < bidir2Nx2N.sa8dCost)
{
@@ -3416,7 +3512,12 @@ void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
else if (m_param->rdLevel <= 1)
{
mode.sa8dBits++;
- mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
+ if(m_rdCost.m_decEnergyRD)
+ mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits+mode.decEnergy);
+ else
+ mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
}
else
{
--
2.20.1.windows.1
-------------- next part --------------
From 9e29018311f85f9825ed788fcea327f19e810b80 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:36:48 +0200
Subject: [PATCH 11/14] DERDO: Introducing DERDO-cost calculation flag and
modified lambda calculation.
---
source/encoder/rdcost.h | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/source/encoder/rdcost.h b/source/encoder/rdcost.h
index 1bd4dd696..9d1aaffee 100644
--- a/source/encoder/rdcost.h
+++ b/source/encoder/rdcost.h
@@ -44,6 +44,9 @@ public:
uint32_t m_ssimRd;
int m_qp; /* QP used to configure lambda, may be higher than QP_MAX_SPEC but <= QP_MAX_MAX */
+ uint32_t m_decEnergyRD;
+
+
void setPsyRdScale(double scale) { m_psyRdBase = (uint32_t)floor(65536.0 * scale * 0.33); }
void setSsimRd(int ssimRd) { m_ssimRd = ssimRd; };
@@ -51,6 +54,13 @@ public:
{
x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
m_qp = qp;
+
+ if(slice.m_param->derdo)
+ m_decEnergyRD = true;
+ else
+ m_decEnergyRD = false;
+
+
setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
/* Scale PSY RD factor by a slice type factor */
@@ -92,8 +102,13 @@ public:
void setLambda(double lambda2, double lambda)
{
- m_lambda2 = (uint64_t)floor(256.0 * lambda2);
- m_lambda = (uint64_t)floor(256.0 * lambda);
+ if(m_decEnergyRD){
+ m_lambda2 = (uint64_t)floor(224.0 * lambda2);
+ m_lambda = (uint64_t)floor(224.0 * lambda);
+ }else{
+ m_lambda2 = (uint64_t)floor(256.0 * lambda2);
+ m_lambda = (uint64_t)floor(256.0 * lambda);
+ }
}
inline uint64_t calcRdCost(sse_t distortion, uint32_t bits) const
--
2.20.1.windows.1
-------------- next part --------------
From 5f9cb95413deb6adbd95f121bdb4989f86b829d3 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:44:21 +0200
Subject: [PATCH 12/14] DERDO: Cost calculations and function definitions for
counting coefficients + fractional pels in search.cpp and search.h.
---
source/encoder/search.cpp | 809 ++++++++++++++++++++++++++++++++++----
source/encoder/search.h | 30 +-
2 files changed, 746 insertions(+), 93 deletions(-)
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index dab11fc79..ebc48d792 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -313,6 +313,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
bool bEnableRDOQ = !!m_param->rdoqLevel;
+ int dSingleDecEnergyY = 0;
+ uint32_t TrSize = 64 >> fullDepth;
+
/* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
{
@@ -356,9 +359,36 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+ uint32_t ldVal = 0;
+ uint32_t numSig;
+ if(m_rdCost.m_decEnergyRD){
+ if (m_param->bEnableLoopFilter)
+ {
+ int nBorders = 0;
+ if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+ nBorders = 1;
+ if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+ nBorders = nBorders + 1;
+ fullCost.decEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+ }
+ switch (fullDepth) {
+ case 1: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 2: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 3: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 4: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ }
+ numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSig)
{
+ dSingleDecEnergyY = calcCoeffDecodingEnergy(numSig, ldVal, TrSize);
+ fullCost.decEnergy += dSingleDecEnergyY;
+
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
@@ -415,7 +445,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
fullCost.bits *= 4;
-
+ if(m_rdCost.m_decEnergyRD){
+ fullCost.bits += fullCost.decEnergy;
+ }
if (m_rdCost.m_psyRd)
{
fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
@@ -431,6 +463,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
}
else
fullCost.rdcost = MAX_INT64;
+ if(m_rdCost.m_decEnergyRD){
+ fullCost.bits -= fullCost.decEnergy;
+ }
if (mightSplit)
{
@@ -466,13 +501,18 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
m_entropyCoder.resetBits();
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
-
+ if(m_rdCost.m_decEnergyRD){
+ splitCost.bits += splitCost.decEnergy;
+ }
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else if(m_rdCost.m_ssimRd)
splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else
splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
+ if(m_rdCost.m_decEnergyRD){
+ splitCost.bits -= splitCost.decEnergy;
+ }
}
if (splitCost.rdcost < fullCost.rdcost)
@@ -481,7 +521,8 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
outCost.distortion += splitCost.distortion;
outCost.bits += splitCost.bits;
outCost.energy += splitCost.energy;
- return;
+ outCost.decEnergy += splitCost.decEnergy;
+ return;
}
else
{
@@ -500,7 +541,7 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
intptr_t picStride = reconPic->m_stride;
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
-
+ outCost.decEnergy += fullCost.decEnergy;
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.bits += fullCost.bits;
@@ -560,6 +601,8 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
uint64_t tmpCost;
uint32_t tmpEnergy = 0;
+ double tmpDecEnergy = 0;
+
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
@@ -567,7 +610,31 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
+ uint32_t ldVal = 0;
+ uint32_t numSig;
+ if(m_rdCost.m_decEnergyRD){ // energy is collected in tmpDecEnergy: that is the value costed via tmpBits and copied to fullCost.decEnergy when this candidate wins
+ if (m_param->bEnableLoopFilter)
+ {
+ int nBorders = 0; // number of CU edges (left, top) lying on the 8-pixel grid and not on the picture border
+ if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+ nBorders = 1;
+ if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+ nBorders = nBorders + 1;
+ tmpDecEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Deblocking filter cost for the top and left boundaries, scaled by CU size (64 >> depth) and the number of filtered edges
+ }
+ switch (fullDepth) { // intra prediction energy by depth; depth 0 and depths above 4 add nothing
+ case 1: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 2: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 3: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 4: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ }
+ numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
@@ -635,6 +702,10 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
if (!useTSkip)
m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
+ if(m_rdCost.m_decEnergyRD){
+ tmpBits += tmpDecEnergy;
+ }
+
if (m_rdCost.m_psyRd)
{
tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
@@ -648,6 +719,10 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
else
tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
+ if(m_rdCost.m_decEnergyRD){
+ tmpBits -= tmpDecEnergy;
+ }
+
if (tmpCost < fullCost.rdcost)
{
bTSkip = useTSkip;
@@ -656,7 +731,8 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
fullCost.distortion = tmpDist;
fullCost.bits = tmpBits;
fullCost.energy = tmpEnergy;
- }
+ fullCost.decEnergy = tmpDecEnergy;
+ }
}
if (bTSkip)
@@ -681,6 +757,7 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
outCost.distortion += fullCost.distortion;
outCost.bits += fullCost.bits;
outCost.energy += fullCost.energy;
+ outCost.decEnergy += fullCost.decEnergy;
}
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
@@ -727,7 +804,9 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
intptr_t picStride = reconPic->m_stride;
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+ uint32_t ldVal = 0;
+ uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
@@ -821,6 +900,9 @@ void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDept
CUData& cu = mode.cu;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bEnableRDOQ = !!m_param->rdoqLevel;
+ int dSingleDecEnergyU = 0;
+ int dSingleDecEnergyV = 0;
+ uint32_t ldVal = 0;
if (tuDepth < cu.m_tuDepth[absPartIdx])
{
@@ -840,6 +922,8 @@ void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDept
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
uint32_t tuDepthC = tuDepth;
+ uint32_t TrSizeC = log2TrSizeC < 2 ? 4 : 1 << log2TrSizeC;
+
if (log2TrSizeC < 2)
{
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
@@ -906,9 +990,21 @@ void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDept
primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+ ldVal = 0;
+ uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSig)
{
+ if (chromaId ==1)
+ {
+ dSingleDecEnergyU = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+ outCost.decEnergy += dSingleDecEnergyU;
+ }
+ else if(chromaId == 2)
+ {
+ dSingleDecEnergyV = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+ outCost.decEnergy += dSingleDecEnergyV;
+ }
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
@@ -952,6 +1048,11 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
const uint32_t log2TrSizeC = 2;
uint32_t qtLayer = log2TrSize - 2;
+ int dSingleDecEnergyU = 0; // coefficient decoding energy of the current U block
+ int dSingleDecEnergyV = 0; // coefficient decoding energy of the current V block
+ uint32_t TrSizeC = 1 << log2TrSizeC; // chroma transform size, as in codeIntraChromaQt (log2TrSize is the luma size)
+ uint32_t ldVal = 0;
+
/* At the TU layers above this one, no RDO is performed, only distortion is being measured,
* so the entropy coder is not very accurate. The best we can do is return it in the same
* condition as it arrived, and to do all bit estimates from the same state. */
@@ -999,6 +1100,7 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
sse_t bDist = 0;
uint32_t bCbf = 0;
uint32_t bEnergy = 0;
+ double bDecEnergy = 0;
int bTSkip = 0;
int checkTransformSkip = 1;
@@ -1010,9 +1112,19 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
+ ldVal = 0;
+ uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSig)
{
+ if (chromaId ==1)
+ {
+ dSingleDecEnergyU = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+ }
+ else if(chromaId == 2)
+ {
+ dSingleDecEnergyV = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+ }
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
@@ -1037,8 +1149,18 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
uint32_t tmpBits = 0, tmpEnergy = 0;
+ double tmpDecEnergy = 0;
+
if (numSig)
{
+ if (chromaId ==1)
+ {
+ tmpDecEnergy += dSingleDecEnergyU;
+ }
+ else if(chromaId == 2)
+ {
+ tmpDecEnergy += dSingleDecEnergyV;
+ }
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
m_entropyCoder.resetBits();
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
@@ -1046,6 +1168,9 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
}
uint64_t tmpCost;
+ if(m_rdCost.m_decEnergyRD){
+ tmpBits += tmpDecEnergy;
+ }
if (m_rdCost.m_psyRd)
{
tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
@@ -1059,6 +1184,10 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
else
tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
+ if(m_rdCost.m_decEnergyRD){
+ tmpBits -= tmpDecEnergy;
+ }
+
if (tmpCost < bCost)
{
bCost = tmpCost;
@@ -1066,7 +1195,8 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
bTSkip = useTSkip;
bCbf = !!numSig;
bEnergy = tmpEnergy;
- }
+ bDecEnergy = tmpDecEnergy;
+ }
}
if (bTSkip)
@@ -1085,6 +1215,7 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
outCost.distortion += bDist;
outCost.energy += bEnergy;
+ outCost.decEnergy += bDecEnergy;
}
}
while (tuIterator.isNextSection());
@@ -1134,6 +1265,8 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t ab
CUData& cu = mode.cu;
uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
+ uint32_t ldVal=0;
+
if (tuDepth < cu.m_tuDepth[absPartIdx])
{
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
@@ -1205,7 +1338,9 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t ab
primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+ ldVal = 0;
+ uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
@@ -1244,6 +1379,8 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize
cu.getIntraTUQtDepthRange(tuDepthRange, 0);
intraMode.initCosts();
+ intraMode.decEnergy += m_param->sSpecificDecEnergies.e_IntraCUs;
+
intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
if (m_csp != X265_CSP_I400)
{
@@ -1298,6 +1435,10 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
cu.setPartSizeSubParts(SIZE_2Nx2N);
cu.setPredModeSubParts(MODE_INTRA);
+
+ double intraDecEnergy = 0;
+ double bintraDecEnergy = 0;
+
const uint32_t initTuDepth = 0;
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
@@ -1359,7 +1500,23 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
bmode = mode = DC_IDX;
bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
- bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+ if(m_rdCost.m_decEnergyRD){ // DERDO: the DC mode cost also carries the intra decoding energy of this block size
+ switch(sizeIdx){ // NOTE(review): estIntraPredQT selects the same energies by tuSize (32 -> e_intra32x32); if sizeIdx is log2TrSize - 2 this mapping is inverted - TODO confirm
+ case 0: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 1: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 2: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 3: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ bcost = m_rdCost.calcRdSADCost(bsad, bbits+bintraDecEnergy); // NOTE(review): bintraDecEnergy is a double added to the bit count - confirm the implicit conversion is intended
+ }
+ else
+ bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
// PLANAR
pixel* planar = intraNeighbourBuf[0];
@@ -1370,8 +1527,25 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
mode = PLANAR_IDX;
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
- cost = m_rdCost.calcRdSADCost(sad, bits);
- COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+ if(m_rdCost.m_decEnergyRD){
+ switch(sizeIdx){
+ case 0: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 1: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 2: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 3: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ cost = m_rdCost.calcRdSADCost(sad, bits+intraDecEnergy);
+ COPY5_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits, bintraDecEnergy, intraDecEnergy);
+ }else{
+ cost = m_rdCost.calcRdSADCost(sad, bits);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ }
+
bool allangs = true;
if (primitives.cu[sizeIdx].intra_pred_allangs)
@@ -1383,32 +1557,60 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
allangs = false;
#define TRY_ANGLE(angle) \
- if (allangs) { \
- if (angle < 18) \
+ if (allangs) { \
+ if (angle < 18) \
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
- else \
+ else \
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
- bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
- cost = m_rdCost.calcRdSADCost(sad, bits); \
- } else { \
- int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
- primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
- sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
- bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
- cost = m_rdCost.calcRdSADCost(sad, bits); \
- }
-
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
+ if(m_rdCost.m_decEnergyRD){ \
+ switch(sizeIdx){ \
+ case 0: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32; \
+ break; \
+ case 1: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16; \
+ break; \
+ case 2: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8; \
+ break; \
+ case 3: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4; \
+ break; } \
+ cost = m_rdCost.calcRdSADCost(sad, bits+ intraDecEnergy); \
+ }\
+ else \
+ cost = m_rdCost.calcRdSADCost(sad, bits); \
+ } else { \
+ int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
+ primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
+ sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
+ if(m_rdCost.m_decEnergyRD){ \
+ switch(sizeIdx){ \
+ case 0: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32; \
+ break; \
+ case 1: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16; \
+ break; \
+ case 2: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8; \
+ break; \
+ case 3: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4; \
+ break;} \
+ cost = m_rdCost.calcRdSADCost(sad, bits+ intraDecEnergy); \
+ } \
+ else \
+ cost = m_rdCost.calcRdSADCost(sad, bits); \
+ }
if (m_param->bEnableFastIntra)
{
int asad = 0;
uint32_t lowmode, highmode, amode = 5, abits = 0;
+
+ double aintraDecEnergy = 0;
uint64_t acost = MAX_INT64;
/* pick the best angle, sampling at distance of 5 */
for (mode = 5; mode < 35; mode += 5)
{
TRY_ANGLE(mode);
- COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+ COPY5_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
}
/* refine best angle at distance 2, then distance 1 */
@@ -1419,27 +1621,32 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
TRY_ANGLE(lowmode);
- COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+ COPY5_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
TRY_ANGLE(highmode);
- COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ COPY5_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
}
if (amode == 33)
{
TRY_ANGLE(34);
- COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+ COPY5_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
}
+ COPY5_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits, bintraDecEnergy, aintraDecEnergy);
+
- COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
}
else // calculate and search all intra prediction angles for lowest cost
{
for (mode = 2; mode < 35; mode++)
{
TRY_ANGLE(mode);
- COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ COPY5_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits, bintraDecEnergy, intraDecEnergy);
+
}
}
@@ -1449,6 +1656,8 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
intraMode.distortion = bsad;
intraMode.sa8dCost = bcost;
intraMode.sa8dBits = bbits;
+
+ intraMode.decEnergy = bintraDecEnergy;
}
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
@@ -1469,7 +1678,7 @@ void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
Cost icosts;
codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
extractIntraResultQT(cu, *reconYuv, 0, 0);
-
+ intraMode.decEnergy += icosts.decEnergy;
intraMode.lumaDistortion = icosts.distortion;
if (m_csp != X265_CSP_I400)
{
@@ -1530,6 +1739,9 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
{
uint32_t bmode = 0;
+ double intraDecEnergy = 0;
+ double bintraDecEnergy = 0;
+
if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
bmode = intraMode.cu.m_lumaIntraDir[puIdx];
else
@@ -1572,7 +1784,24 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
- modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
+
+
+ if(m_rdCost.m_decEnergyRD){
+ switch(tuSize){
+ case 32: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 16: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 8: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 4: bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits+bintraDecEnergy);
+ }
+ else
+ modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
+
// PLANAR
pixel* planar = intraNeighbourBuf[0];
@@ -1582,8 +1811,24 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
- modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
- COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
+
+ if(m_rdCost.m_decEnergyRD){
+ switch(tuSize){
+ case 32: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 16: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 8: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 4: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits+ intraDecEnergy);
+ COPY2_IF_LT(bcost, modeCosts[PLANAR_IDX], bintraDecEnergy, intraDecEnergy);
+ }else{
+ modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
+ }
// angular predictions
if (primitives.cu[sizeIdx].intra_pred_allangs)
@@ -1597,8 +1842,25 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
else
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
- modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
- COPY1_IF_LT(bcost, modeCosts[mode]);
+
+ if(m_rdCost.m_decEnergyRD) {
+ switch(tuSize){
+ case 32: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 16: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 8: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 4: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits + intraDecEnergy);
+ COPY2_IF_LT(bcost, modeCosts[mode], bintraDecEnergy, intraDecEnergy);
+ }else{
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[mode]);
+ }
+
}
}
else
@@ -1609,8 +1871,25 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
- modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
- COPY1_IF_LT(bcost, modeCosts[mode]);
+
+ if(m_rdCost.m_decEnergyRD) {
+ switch(tuSize){
+ case 32: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+ break;
+ case 16: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+ break;
+ case 8: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+ break;
+ case 4: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+ break;
+ }
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits + intraDecEnergy );
+ COPY2_IF_LT(bcost, modeCosts[mode], bintraDecEnergy, intraDecEnergy);
+ }else{
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[mode]);
+ }
+
}
}
@@ -1661,6 +1940,7 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
+ intraMode.decEnergy += icosts.decEnergy;
extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
@@ -1825,8 +2105,17 @@ sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
+
+
+ if(m_rdCost.m_decEnergyRD){
+ bits += outCost.decEnergy;
+ }
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
- : m_rdCost.calcRdCost(outCost.distortion, bits);
+ : m_rdCost.calcRdCost(outCost.distortion, bits);
+ if(m_rdCost.m_decEnergyRD){
+ bits -= outCost.decEnergy;
+ }
+
if (cost < bestCost)
{
@@ -1888,7 +2177,7 @@ sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
}
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
-uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
+uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m, int& decEnergy)
{
X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
@@ -1943,12 +2232,22 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const Predict
motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
+ int candDecEnergy = 0; // decoding energy of this merge candidate only, so candidates do not accumulate into each other
+ if (m_rdCost.m_decEnergyRD) { // NOTE(review): reads m_interDir[0], not [pu.puAbsPartIdx] - confirm it describes the candidate under test
+ if (cu.m_interDir[0] == 1 || cu.m_interDir[0] == 3)
+ candDecEnergy += calcFracpelDecodingEnergySearch(cu.m_mv[0][pu.puAbsPartIdx], pu.width, pu.height);
+ if (cu.m_interDir[0] == 2 || cu.m_interDir[0] == 3)
+ candDecEnergy += calcFracpelDecodingEnergySearch(cu.m_mv[1][pu.puAbsPartIdx], pu.width, pu.height);
+ if (cu.m_interDir[0] == 3)
+ candDecEnergy += ((pu.width * pu.height) >> 4 ) * m_param->sSpecificDecEnergies.e_bi; // extra bi-prediction cost, scaled by PU area / 16
+ }
uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
if (m_me.bChromaSATD)
costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
- costCand = costCand + m_rdCost.getCost(bitsCand);
+ costCand = costCand + m_rdCost.getCost(bitsCand + candDecEnergy); // candDecEnergy is 0 when DERDO is off
if (costCand < outCost)
{
outCost = costCand;
+ decEnergy = candDecEnergy; // report the energy of the winning candidate to the caller
@@ -2199,8 +2498,17 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
MergeData merge;
memset(&merge, 0, sizeof(merge));
bool useAsMVP = false;
+
+ int dCuDecEnergy = 0; // decoding energy summed over all PUs of this CU
+
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
+ int decEnergy[2]={0, 0}; // energy of the best uni-directional candidate per list
+ int decEnergyTempL0[MAX_NUM_REF] = {0}; // zero-initialised: the list-1 search reads entries that list 0 may never have written
+ int decEnergyBi = 0; // energy of the chosen bi-directional candidate
+
+ int dinterPUenergy = 0;
+
MotionData* bestME = interMode.bestME[puIdx];
PredictionUnit pu(cu, cuGeom, puIdx);
m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
@@ -2218,7 +2526,10 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
useAsMVP = true;
}
/* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
- uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
+ int dMergeDecEnergy = 0;
+ uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge, dMergeDecEnergy);
+
+
bestME[0].cost = MAX_UINT;
bestME[1].cost = MAX_UINT;
@@ -2232,6 +2543,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
{
for (int list = 0; list < numPredDir; list++)
{
+ int partDecEnergy = 0;
+
int ref = -1;
if (useAsMVP)
@@ -2268,6 +2581,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
int satdCost;
if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
mvpIn = bestME[list].mv;
+ if(list == 1)
+ partDecEnergy=decEnergyTempL0[ref];
if (useAsMVP && m_param->mvRefine > 1)
{
MV bestmv, mvpSel[3];
@@ -2309,7 +2624,16 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
/* Get total cost of partition, but only include MV bit cost once */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
- uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+
+ uint32_t cost;
+ if(m_rdCost.m_decEnergyRD){
+ partDecEnergy += calcFracpelDecodingEnergySearch( outmv, pu.width, pu.height);
+ cost = (satdCost - mvCost) + m_rdCost.getCost(bits+partDecEnergy);
+ }
+ else
+ cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+
+
/* Refine MVP selection, updates: mvpIdx, bits, cost */
if (!(m_param->analysisMultiPassRefine || useAsMVP))
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
@@ -2323,11 +2647,19 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
mvpIdx = !mvpIdx;
uint32_t origOutBits = bits;
bits = origOutBits + diffBits;
- cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
+ if(m_rdCost.m_decEnergyRD)
+ cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits + partDecEnergy);
+ else
+ cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
+
}
mvp = amvp[mvpIdx];
}
+ if(list == 0)
+ decEnergyTempL0[ref] = partDecEnergy;
+
+
if (cost < bestME[list].cost)
{
bestME[list].mv = outmv;
@@ -2393,7 +2725,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
for (int ref = 0; ref < numRefIdx[list]; ref++)
{
ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
-
+ int partDecEnergy= 0;
if (!(refMask & (1 << ref)))
{
ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
@@ -2446,7 +2778,13 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
/* Get total cost of partition, but only include MV bit cost once */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
- uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+ uint32_t cost;
+ if(m_rdCost.m_decEnergyRD){
+ partDecEnergy += calcFracpelDecodingEnergySearch( outmv, pu.width, pu.height);
+ cost = (satdCost - mvCost) + m_rdCost.getCost(bits+ partDecEnergy);
+ }
+ else
+ cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
/* Update LowresMVP to best AMVP cand*/
if (bLowresMVP)
updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
@@ -2454,6 +2792,9 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
/* Refine MVP selection, updates: mvpIdx, bits, cost */
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
+ if(list == 0)
+ decEnergyTempL0[ref] = partDecEnergy;
+
if (cost < bestME[list].cost)
{
bestME[list].mv = outmv;
@@ -2463,6 +2804,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
bestME[list].cost = cost;
bestME[list].bits = bits;
bestME[list].mvCost = mvCost;
+ decEnergy[list] = partDecEnergy;
+
}
}
/* the second list ref bits start at bit 16 */
@@ -2479,6 +2822,12 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{
+
+ int partDecEnergy = 0;
+ int dBiEnergyTemp = ((pu.width*pu.height)>>4)*m_param->sSpecificDecEnergies.e_bi;
+
+
+
bidir[0] = bestME[0];
bidir[1] = bestME[1];
@@ -2512,6 +2861,18 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
+ if(m_rdCost.m_decEnergyRD){
+ int RefPOCa = cu.m_slice->m_refPOCList[0][(int8_t)bidir[0].ref]; // POC of the list-0 reference picture
+ int RefPOCb = cu.m_slice->m_refPOCList[1][(int8_t)bidir[1].ref]; // POC of the list-1 reference picture
+ if (!(RefPOCa == RefPOCb)) // Both lists reference different pictures. NOTE(review): the motion vectors are not compared here - confirm whether identical-motion detection should also require bidir[0].mv == bidir[1].mv
+ {
+ partDecEnergy += dBiEnergyTemp;
+ partDecEnergy += calcFracpelDecodingEnergySearch(bidir[0].mv, pu.width, pu.height);
+ partDecEnergy += calcFracpelDecodingEnergySearch(bidir[1].mv, pu.width, pu.height);
+ }
+ bidirCost += m_rdCost.getCost(partDecEnergy);
+ }
+ decEnergyBi = partDecEnergy; // keep this candidate's energy for the CU total; otherwise it is only set when the zero-MV candidate wins
bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{
@@ -2563,6 +2924,20 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
+ partDecEnergy = 0; // start over: the zero-MV candidate must not inherit the energy of the candidate evaluated before it
+ if(m_rdCost.m_decEnergyRD){
+ int RefPOCa = cu.m_slice->m_refPOCList[0][(int8_t)bidir[0].ref]; // POC of the list-0 reference picture
+ int RefPOCb = cu.m_slice->m_refPOCList[1][(int8_t)bidir[1].ref]; // POC of the list-1 reference picture
+ if (!(RefPOCa == RefPOCb && mvp0 == mvp1)) // NOTE(review): compares the MV predictors, while both candidate MVs are mvzero - confirm whether the POC comparison alone is intended
+ {
+ partDecEnergy += dBiEnergyTemp;
+ partDecEnergy += calcFracpelDecodingEnergySearch(mvzero, pu.width, pu.height); // the candidate MV is mvzero; the reference index is not a motion vector
+ partDecEnergy += calcFracpelDecodingEnergySearch(mvzero, pu.width, pu.height);
+ }
+ cost += m_rdCost.getCost(partDecEnergy);
+ }
+
+
if (cost < bidirCost)
{
bidir[0].mv = mvzero;
@@ -2573,6 +2948,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
bidir[1].mvpIdx = mvpIdx1;
bidirCost = cost;
bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ decEnergyBi = partDecEnergy;
}
}
}
@@ -2589,6 +2965,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
totalmebits += merge.bits;
+ dCuDecEnergy += dMergeDecEnergy;
}
else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
@@ -2607,6 +2984,11 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
+
+ dCuDecEnergy += decEnergyBi+ dinterPUenergy;
+
+
+
}
else if (bestME[0].cost <= bestME[1].cost)
{
@@ -2623,6 +3005,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
totalmebits += bestME[0].bits;
+ dCuDecEnergy += decEnergy[0]+dinterPUenergy;
+
}
else
{
@@ -2639,10 +3023,13 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
totalmebits += bestME[1].bits;
+ dCuDecEnergy += decEnergy[1]+ dinterPUenergy;
+
}
motionCompensation(cu, pu, *predYuv, true, bChromaMC);
}
+ interMode.decEnergy += dCuDecEnergy;
interMode.sa8dBits += totalmebits;
}
@@ -2807,7 +3194,13 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
interMode.coeffBits = 0;
interMode.totalBits = interMode.mvBits + skipFlagBits;
- if (m_rdCost.m_psyRd)
+ switch (depth) {
+ case 0: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip64x64; break;
+ case 1: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip8x82x32; break;
+ case 2: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip16x16; break;
+ case 3: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip8x8; break;
+ }
+ if (m_rdCost.m_psyRd)
interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
else if(m_rdCost.m_ssimRd)
interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
@@ -2848,6 +3241,33 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
memset(&m_cacheTU, 0, sizeof(TUInfoCache));
Cost costs;
+ int32_t inter0Energy = 0;
+ int32_t skipEnergy = 0;
+ if (m_rdCost.m_decEnergyRD) {
+ switch (depth) {
+ case 0: inter0Energy = this->m_param->sSpecificDecEnergies.e_inter64x64;
+ skipEnergy = this->m_param->sSpecificDecEnergies.e_skip64x64;break;
+ case 1: inter0Energy = this->m_param->sSpecificDecEnergies.e_inter32x32;
+ skipEnergy = this->m_param->sSpecificDecEnergies.e_skip8x82x32;break;
+ case 2: inter0Energy = this->m_param->sSpecificDecEnergies.e_inter32x326x16;
+ skipEnergy = this->m_param->sSpecificDecEnergies.e_skip16x16;break;
+ case 3: inter0Energy = this->m_param->sSpecificDecEnergies.e_inter8x8;
+ skipEnergy = this->m_param->sSpecificDecEnergies.e_skip8x8;break;
+ }
+
+ costs.decEnergy += inter0Energy;
+ if (m_param->bEnableLoopFilter)
+ {
+ int nBorders = 0;
+ if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+ nBorders = 1;
+ if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+ nBorders = nBorders + 1;
+ int addecEnergy = nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+ skipEnergy += addecEnergy;
+ inter0Energy += addecEnergy;
+ }
+ }
if (m_limitTU & X265_TU_LIMIT_NEIGH)
{
/* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
@@ -2881,7 +3301,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
m_entropyCoder.resetBits();
m_entropyCoder.codeQtRootCbfZero();
uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
-
+ cbf0Bits += (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) ? skipEnergy : inter0Energy;
uint32_t cbf0Energy; uint64_t cbf0Cost;
if (m_rdCost.m_psyRd)
{
@@ -2900,6 +3320,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
{
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
+ costs.decEnergy = (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) ? skipEnergy : inter0Energy;
}
}
@@ -2969,6 +3390,10 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
interMode.lumaDistortion = bestLumaDist;
interMode.coeffBits = coeffBits;
interMode.mvBits = mvBits;
+ interMode.decEnergy += costs.decEnergy;
+
+
+
cu.m_distortion[0] = interMode.distortion;
updateModeCost(interMode);
checkDQP(interMode, cuGeom);
@@ -3017,7 +3442,9 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
uint32_t strideResiY = resiYuv.m_size;
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
- uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+ uint32_t ldVal=0;
+ uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSigY)
{
@@ -3051,7 +3478,9 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
- uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
+ ldVal = 0;
+ uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSigU)
{
m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
@@ -3065,7 +3494,9 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
- uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
+ ldVal=0;
+ uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+
if (numSigV)
{
m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
@@ -3110,11 +3541,12 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
}
}
}
-
-uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
+uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId, double decEnergy)
{
uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
-
+ if(m_rdCost.m_decEnergyRD){
+ nullBits += decEnergy;
+ }
if (m_rdCost.m_psyRd)
return m_rdCost.calcPsyRdCost(dist, nullBits, energy);
else if(m_rdCost.m_ssimRd)
@@ -3165,6 +3597,10 @@ bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint
uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
splitCost.bits += splitCbfBits;
+ if(m_rdCost.m_decEnergyRD){
+ splitCost.bits += splitCost.decEnergy;
+ }
+
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else if(m_rdCost.m_ssimRd)
@@ -3172,6 +3608,9 @@ bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint
else
splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
+ if(m_rdCost.m_decEnergyRD)
+ splitCost.bits -= splitCost.decEnergy;
+
return ycbf || ucbf || vcbf;
}
@@ -3244,7 +3683,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
-
+ double dSingleDecEnergyY = 0;
+ double dSingleDecEnergyU = 0;
+ double dSingleDecEnergyV = 0;
+ double dSingleDecEnergy = 0;
+ uint32_t TrSize = 1 << (log2TrSize);
+ uint32_t TrSizeC = 1 << log2TrSizeC;
+ uint32_t ldVal = 0;
m_entropyCoder.store(m_rqt[depth].rqtRoot);
uint32_t trSize = 1 << log2TrSize;
@@ -3274,9 +3719,15 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
- numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+ ldVal=0;
+ numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
+ if (numSig[TEXT_LUMA][0] && m_rdCost.m_decEnergyRD)
+ {
+ dSingleDecEnergyY = calcCoeffDecodingEnergy(numSig[TEXT_LUMA][0], ldVal, TrSize);
+ }
+
m_entropyCoder.resetBits();
if (bSplitPresentFlag && log2TrSize > depthRange[0])
@@ -3316,6 +3767,11 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
+
+ if(m_rdCost.m_decEnergyRD){
+ nzCbfBitsY += dSingleDecEnergyY;
+ }
+
if (m_rdCost.m_psyRd)
{
nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
@@ -3329,6 +3785,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
else
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
+ if(m_rdCost.m_decEnergyRD){
+ nzCbfBitsY -= dSingleDecEnergyY;
+ }
+
if (cu.m_tqBypass[0])
{
singleDist[TEXT_LUMA][0] = nonZeroDistY;
@@ -3354,6 +3814,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
minCost[TEXT_LUMA][0] = nullCostY;
singleDist[TEXT_LUMA][0] = zeroDistY;
singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
+ dSingleDecEnergyY = 0;
}
else
{
@@ -3366,8 +3827,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
}
else
{
- if (checkTransformSkipY)
- minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
+ if (checkTransformSkipY){
+ if(m_rdCost.m_decEnergyRD)
+ minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA, dSingleDecEnergyY);
+ else
+ minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
+ }
+
primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
singleDist[TEXT_LUMA][0] = zeroDistY;
singleBits[TEXT_LUMA][0] = 0;
@@ -3399,9 +3865,22 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+ ldVal = 0;
+ numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+ if (m_rdCost.m_decEnergyRD && numSig[chromaId][tuIterator.section] > 0)
+ {
+ if (chromaId ==1)
+ {
+ dSingleDecEnergyU = calcCoeffDecodingEnergy(numSig[chromaId][tuIterator.section], ldVal, TrSizeC);
+ }
+ else if(chromaId == 2)
+ {
+ dSingleDecEnergyV = calcCoeffDecodingEnergy(numSig[chromaId][tuIterator.section], ldVal, TrSizeC);
+ }
+ }
+
uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
if (cbfFlag[chromaId][tuIterator.section])
m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
@@ -3445,7 +3924,26 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
}
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+
+ if (chromaId ==1)
+ {
+ nzCbfBitsC +=dSingleDecEnergyU;
+ }
+ else if(chromaId == 2)
+ {
+ nzCbfBitsC +=dSingleDecEnergyV;
+ }
+
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+ if (chromaId ==1)
+ {
+ nzCbfBitsC -=dSingleDecEnergyU;
+ }
+ else if(chromaId == 2)
+ {
+ nzCbfBitsC -=dSingleDecEnergyV;
+ }
+
if (cu.m_tqBypass[0])
{
@@ -3477,13 +3975,25 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
minCost[chromaId][tuIterator.section] = singleCostC;
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
+ if (chromaId ==1)
+ dSingleDecEnergy += dSingleDecEnergyU;
+ if (chromaId ==2)
+ dSingleDecEnergy += dSingleDecEnergyV;
}
}
}
else
{
- if (checkTransformSkipC)
- minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
+ if (checkTransformSkipC){
+ if(m_rdCost.m_decEnergyRD){
+ if(chromaId ==1)
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId, dSingleDecEnergyU);
+ else
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId, dSingleDecEnergyV);
+ }else{
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
+ }
+ }
primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
singleBits[chromaId][tuIterator.section] = 0;
singleDist[chromaId][tuIterator.section] = zeroDistC;
@@ -3524,10 +4034,14 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
- uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
+ ldVal = 0;
+ uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true, m_rdCost.m_decEnergyRD, ldVal);
if (numSigTSkipY)
{
+
+ dSingleDecEnergyY = calcCoeffDecodingEnergy(numSigTSkipY, ldVal, TrSize);//( cuGeom.depth , TrSize, TrSize, TEXT_LUMA, m_tsCoeff, MODE_INTER );
+ dSingleDecEnergy += dSingleDecEnergyY;
m_entropyCoder.resetBits();
m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
@@ -3540,18 +4054,33 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
- if (m_rdCost.m_psyRd)
- {
- nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
- singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
- }
- else if(m_rdCost.m_ssimRd)
- {
- nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
- singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+ if(m_rdCost.m_decEnergyRD){
+ if (m_rdCost.m_psyRd)
+ {
+ nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
+ singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY+dSingleDecEnergyY, nonZeroEnergyY);
+ }
+ else if(m_rdCost.m_ssimRd)
+ {
+ nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
+ singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY+dSingleDecEnergyY, nonZeroEnergyY);
+ }
+ else
+ singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY+dSingleDecEnergyY);
+ }else{
+ if (m_rdCost.m_psyRd)
+ {
+ nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
+ singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+ }
+ else if(m_rdCost.m_ssimRd)
+ {
+ nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
+ singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+ }
+ else
+ singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
}
- else
- singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
}
if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
@@ -3601,13 +4130,24 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
+ ldVal = 0;
+ uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true, m_rdCost.m_decEnergyRD, ldVal);
m_entropyCoder.resetBits();
singleBits[chromaId][tuIterator.section] = 0;
if (numSigTSkipC)
{
+ if (m_rdCost.m_decEnergyRD && chromaId ==1)
+ {
+ dSingleDecEnergyU = calcCoeffDecodingEnergy(numSigTSkipC, ldVal, TrSizeC);
+ dSingleDecEnergy += dSingleDecEnergyU;
+ }
+ else if(m_rdCost.m_decEnergyRD && chromaId == 2)
+ {
+ dSingleDecEnergyV = calcCoeffDecodingEnergy(numSigTSkipC, ldVal, TrSizeC);
+ dSingleDecEnergy += dSingleDecEnergyV;
+ }
m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
@@ -3618,6 +4158,18 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
+
+ if(m_rdCost.m_decEnergyRD){
+ if (chromaId ==1)
+ {
+ singleBits[chromaId][tuIterator.section]+= dSingleDecEnergyU;
+ }
+ else if(chromaId == 2)
+ {
+ singleBits[chromaId][tuIterator.section]+= dSingleDecEnergyV;
+ }
+ }
+
if (m_rdCost.m_psyRd)
{
nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
@@ -3630,6 +4182,17 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
}
else
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
+
+ if(m_rdCost.m_decEnergyRD){
+ if (chromaId ==1)
+ {
+ singleBits[chromaId][tuIterator.section]-= dSingleDecEnergyU;
+ }
+ else if(chromaId == 2)
+ {
+ singleBits[chromaId][tuIterator.section]-= dSingleDecEnergyV;
+ }
+ }
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
@@ -3695,6 +4258,21 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
// For that reason, I am collecting individual coefficient bits only.
fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+ if(m_rdCost.m_decEnergyRD){
+ fullCost.decEnergy+=dSingleDecEnergy;
+ if (m_param->bEnableLoopFilter)
+ {
+ int nBorders = 0;
+ if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+ nBorders = 1;
+ if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+ nBorders = nBorders + 1;
+ fullCost.decEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+ }
+
+ fullCost.bits+= fullCost.decEnergy+ outCosts.decEnergy;
+ }
+
fullCost.distortion += singleDist[TEXT_LUMA][0];
fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
@@ -3710,6 +4288,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
else
fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
+ if(m_rdCost.m_decEnergyRD){
+ fullCost.bits-= fullCost.decEnergy + outCosts.decEnergy;
+ }
+
if (m_param->limitTU && bCheckSplit)
{
// Stop recursion if the TU's energy level is minimal
@@ -3773,6 +4355,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
+ splitCost.decEnergy = outCosts.decEnergy;
bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
if (yCbCrCbf || !bCheckFull)
{
@@ -3785,6 +4368,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
if (nextSplit)
{
m_entropyCoder.load(m_rqt[depth].rqtRoot);
+ splitCost.decEnergy = outCosts.decEnergy;
splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
{
@@ -3797,9 +4381,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
}
}
outCosts.distortion += splitCost.distortion;
- outCosts.rdcost += splitCost.rdcost;
- outCosts.bits += splitCost.bits;
- outCosts.energy += splitCost.energy;
+ outCosts.rdcost += splitCost.rdcost;
+ outCosts.bits += splitCost.bits;
+ outCosts.energy += splitCost.energy;
+ outCosts.decEnergy = splitCost.decEnergy;
return;
}
else
@@ -3851,9 +4436,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
}
outCosts.distortion += fullCost.distortion;
- outCosts.rdcost += fullCost.rdcost;
- outCosts.bits += fullCost.bits;
- outCosts.energy += fullCost.energy;
+ outCosts.rdcost += fullCost.rdcost;
+ outCosts.bits += fullCost.bits;
+ outCosts.energy += fullCost.energy;
+ outCosts.decEnergy += fullCost.decEnergy;
}
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
@@ -3989,7 +4575,11 @@ void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
else if (m_param->rdLevel <= 1)
{
mode.sa8dBits++;
- mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+ if(m_rdCost.m_decEnergyRD)
+ mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits + mode.decEnergy );
+ else
+ mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
}
else
{
@@ -4032,7 +4622,11 @@ void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
else if (m_param->rdLevel <= 1)
{
mode.sa8dBits++;
- mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+ if(m_rdCost.m_decEnergyRD)
+ mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits + mode.decEnergy);
+ else
+ mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
}
else
{
@@ -4048,3 +4642,44 @@ void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
}
}
+
+/* Estimates the decoding energy caused by fractional-pel interpolation of one
+ * width x height block. e_fracpel is charged (width * height) >> 4 times for each
+ * MV component whose absolute value is not a multiple of 4 (presumably quarter-pel
+ * units - TODO confirm), plus (6 * height) >> 4 times more when both are fractional. */
+int Search::calcFracpelDecodingEnergySearch( MV cMv, uint32_t width, uint32_t height)
+{
+    const int unitsPerPass = (width * height) >> 4;
+    int energy = 0;
+
+    const bool horFiltered = (cMv.getAbsHor() % 4) != 0;
+    if (horFiltered) // horizontal sub-pel filtering
+        energy += unitsPerPass * m_param->sSpecificDecEnergies.e_fracpel;
+
+    if ((cMv.getAbsVer() % 4) == 0)
+        return energy; // no vertical filtering, nothing more to add
+
+    energy += unitsPerPass * m_param->sSpecificDecEnergies.e_fracpel;
+    if (horFiltered) // additional horizontal filterings if both are applied
+        energy += ((6 * height) >> 4) * m_param->sSpecificDecEnergies.e_fracpel;
+    return energy;
+}
+
+/* Estimates the decoding energy of a coded transform block: e_coeff * numSig, plus
+ * e_val * ldVal / 16, plus a fixed term for width 4/8/16/32 (other widths add nothing). */
+int Search::calcCoeffDecodingEnergy(uint32_t numSig, uint32_t ldVal, int trWidth)
+{
+    int energy = 0;
+    energy += m_param->sSpecificDecEnergies.e_coeff * numSig;
+    energy += m_param->sSpecificDecEnergies.e_val * ldVal / 16;
+    if (trWidth == 32)
+        energy += m_param->sSpecificDecEnergies.e_trans32x32;
+    else if (trWidth == 16)
+        energy += m_param->sSpecificDecEnergies.e_trans16x16;
+    else if (trWidth == 8)
+        energy += m_param->sSpecificDecEnergies.e_trans8x8;
+    else if (trWidth == 4)
+        energy += m_param->sSpecificDecEnergies.e_trans4x4;
+    return energy;
+}
+
diff --git a/source/encoder/search.h b/source/encoder/search.h
index 02bd6e647..1cda9aa0f 100644
--- a/source/encoder/search.h
+++ b/source/encoder/search.h
@@ -127,6 +127,9 @@ struct Mode
uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
uint32_t coeffBits; // Texture bits (DCT Coeffs)
+ uint32_t decEnergy; //DERDO
+ uint32_t predDecEnergy;
+
void initCosts()
{
rdCost = 0;
@@ -141,6 +144,9 @@ struct Mode
totalBits = 0;
mvBits = 0;
coeffBits = 0;
+
+ decEnergy = 0; //DERDO
+ predDecEnergy = 0;
}
void addSubCosts(const Mode& subMode)
@@ -157,6 +163,8 @@ struct Mode
totalBits += subMode.totalBits;
mvBits += subMode.mvBits;
coeffBits += subMode.coeffBits;
+
+ decEnergy += subMode.decEnergy;
}
};
@@ -382,7 +390,8 @@ protected:
uint32_t bits;
sse_t distortion;
uint32_t energy;
- Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
+ uint32_t decEnergy;
+ Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; decEnergy = 0;}
};
struct TUInfoCache
@@ -393,7 +402,7 @@ protected:
Entropy rqtStore[NUM_SUBPART];
} m_cacheTU;
- uint64_t estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId);
+ uint64_t estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId, double decEnergy=0);
bool splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore);
void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], int32_t splitMore = -1);
@@ -423,10 +432,13 @@ protected:
int selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref);
const MV& checkBestMVP(const MV amvpCand[2], const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const;
void setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
- uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
+ uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m, int& decEnergy);
static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
void updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP);
+ int calcFracpelDecodingEnergySearch(MV cMv, uint32_t width, uint32_t height);
+ int calcCoeffDecodingEnergy(uint32_t numSig, uint32_t ldVal, int trWidth);
+
/* intra helper functions */
enum { MAX_RD_INTRA_MODES = 16 };
static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
@@ -434,9 +446,15 @@ protected:
// get most probable luma modes for CU part, and bit cost of all non mpm modes
uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
- void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy)
- : (m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(m.distortion, m.totalBits, m.ssimEnergy)
- : m_rdCost.calcRdCost(m.distortion, m.totalBits)); }
+    /* Recomputes m.rdCost with the psy-RD, SSIM-RD or plain RD formula (chosen by the m_rdCost flags); when m_decEnergyRD is set, m.decEnergy is added to the bit count. */
+    void updateModeCost(Mode& m) const
+    {
+        const uint32_t bits = m_rdCost.m_decEnergyRD ? m.totalBits + m.decEnergy : m.totalBits;
+        m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, bits, m.psyEnergy)
+                 : (m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(m.distortion, bits, m.ssimEnergy) : m_rdCost.calcRdCost(m.distortion, bits));
+    }
+
+
};
}
--
2.20.1.windows.1
-------------- next part --------------
From 1b6be642cb62b06505361ab8e2b2ff6dd70fc536 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:46:29 +0200
Subject: [PATCH 13/14] DERDO: Cost calculations and function modifications for
considering SAO energy.
---
source/encoder/sao.cpp | 62 +++++++++++++++++++++++++++---------------
source/encoder/sao.h | 10 ++++---
2 files changed, 46 insertions(+), 26 deletions(-)
diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp
index 0c46ece53..9498ea6fb 100644
--- a/source/encoder/sao.cpp
+++ b/source/encoder/sao.cpp
@@ -1272,6 +1272,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
memset(m_offset, 0, sizeof(m_offset));
int64_t bestCost = 0;
int64_t rateDist = 0;
+ int64_t decEnergy = 0;
bool bAboveLeftAvail = true;
for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
@@ -1292,7 +1293,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
{
calcSaoStatsCTU(addr, 0);
saoStatsInitialOffset(addr, 0);
- saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+ saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost, decEnergy);
}
}
@@ -1304,7 +1305,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
calcSaoStatsCTU(addr, 1);
calcSaoStatsCTU(addr, 2);
saoStatsInitialOffset(addr, 1);
- saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+ saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost, decEnergy);
}
}
if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
@@ -1314,7 +1315,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
{
if (!allowMerge[mergeIdx])
continue;
-
+ int64_t mergeDecEnergy = 0;
int64_t mergeDist = 0;
for (int plane = 0; plane < planes; plane++)
{
@@ -1329,7 +1330,11 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
int mergeOffset = mergeSrcParam->offset[classIdx];
estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos]);
}
- }
+ if(plane == 0)
+ mergeDecEnergy += (int64_t)m_param->sSpecificDecEnergies.e_SAOY;
+ else
+ mergeDecEnergy += (int64_t)m_param->sSpecificDecEnergies.e_SAOC;
+ }
mergeDist += (estDist << 8) / lambda[!!plane];
}
@@ -1342,6 +1347,8 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
int64_t mergeCost = mergeDist + estRate;
+
+ mergeCost += mergeDecEnergy;
if (mergeCost < bestCost)
{
SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
@@ -1432,7 +1439,7 @@ void SAO::saoStatsInitialOffset(int addr, int planes)
}
}
-inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda)
+inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda, int64_t decEnergy)
{
#if X265_DEPTH < 10
X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
@@ -1443,17 +1450,18 @@ inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t la
"calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
distortion, bits, lambda);
#endif
- return distortion + ((bits * lambda + 128) >> 8);
+ return distortion + ((bits * lambda + 128) >> 8) + ((decEnergy * lambda + 128) >> 8);
+
}
-void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses)
+void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses, int64_t decEnergy)
{
int bestOffset = 0;
distClasses = 0;
// Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit.
// entropy coder can be used to measure the exact rate here.
- int64_t bestCost = calcSaoRdoCost(0, 1, lambda);
+ int64_t bestCost = calcSaoRdoCost(0, 1, lambda, 0);
while (offset != 0)
{
// Calculate the bits required for signalling the offset
@@ -1463,7 +1471,7 @@ void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offs
// Do the dequntization before distorion calculation
int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg);
- int64_t cost = calcSaoRdoCost(dist, rate, lambda);
+ int64_t cost = calcSaoRdoCost(dist, rate, lambda, decEnergy);
if (cost < bestCost)
{
bestCost = cost;
@@ -1476,7 +1484,7 @@ void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offs
costClasses = bestCost;
offset = bestOffset;
}
-void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
+void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost, int64_t &decEnergy)
{
Slice* slice = m_frame->m_encData->m_slice;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
@@ -1491,7 +1499,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.resetBits();
m_entropyCoder.codeSaoType(0);
- int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+ int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0], 0);
int maxSaoType;
if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
(slice->m_sliceType == B_SLICE)))
@@ -1512,7 +1520,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
int32_t& count = m_count[0][typeIdx][classIdx];
int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
- estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+ estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOY);
//Calculate distortion
estDist += distClasses[classIdx];
@@ -1522,13 +1530,14 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
m_entropyCoder.resetBits();
m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);
- int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+ int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0], (int64_t) m_param->sSpecificDecEnergies.e_SAOY);
if (cost < costPartBest)
{
costPartBest = cost;
bestDist = estDist;
bestTypeIdx = typeIdx;
+ decEnergy = (int64_t)m_param->sSpecificDecEnergies.e_SAOY;
}
}
@@ -1549,7 +1558,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
int32_t& offsetOrg = m_offsetOrg[0][SAO_BO][classIdx];
int32_t& offsetOut = m_offset[0][SAO_BO][classIdx];
- estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+ estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOY);
}
// Estimate Best Position
@@ -1580,7 +1589,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
m_entropyCoder.resetBits();
m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);
- int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+ int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0], (int64_t)m_param->sSpecificDecEnergies.e_SAOY);
if (cost < costPartBest)
{
@@ -1592,6 +1601,8 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
lclCtuParam->bandPos = bestClassBO;
for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO];
+
+ decEnergy = (int64_t)m_param->sSpecificDecEnergies.e_SAOY;
}
rateDist = (bestDist << 8) / lambda[0];
@@ -1604,7 +1615,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits();
}
}
-void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
+void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost, int64_t &decEnergy)
{
Slice* slice = m_frame->m_encData->m_slice;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
@@ -1621,7 +1632,9 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
m_entropyCoder.codeSaoType(0);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
- int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]);
+ uint32_t decEnergyC = 0;
+ int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1], 0);
+
int maxSaoType;
if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
(slice->m_sliceType == B_SLICE)))
@@ -1645,7 +1658,7 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
int32_t& offsetOrg = m_offsetOrg[compIdx][typeIdx][classIdx];
int32_t& offsetOut = m_offset[compIdx][typeIdx][classIdx];
- estIterOffset(typeIdx, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+ estIterOffset(typeIdx, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOC);
estDist[compIdx - 1] += distClasses[classIdx];
}
@@ -1658,13 +1671,14 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
m_entropyCoder.codeSaoOffsetEO(m_offset[compIdx + 1][typeIdx] + 1, typeIdx, compIdx + 1);
uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
- int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);
+ int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1], (int64_t)m_param->sSpecificDecEnergies.e_SAOC*2);
if (cost < costPartBest)
{
costPartBest = cost;
bestDist = (estDist[0] + estDist[1]);
bestTypeIdx = typeIdx;
+ decEnergyC = (int64_t)m_param->sSpecificDecEnergies.e_SAOC*2;
}
}
@@ -1694,7 +1708,7 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
int32_t& offsetOrg = m_offsetOrg[compIdx][SAO_BO][classIdx];
int32_t& offsetOut = m_offset[compIdx][SAO_BO][classIdx];
- estIterOffset(SAO_BO, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+ estIterOffset(SAO_BO, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOC);
}
for (int i = 0; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
@@ -1722,7 +1736,8 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
m_entropyCoder.codeSaoOffsetBO(m_offset[compIdx + 1][SAO_BO] + bestClassBO[compIdx], bestClassBO[compIdx], compIdx + 1);
uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
- int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);
+ int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1], (int64_t)m_param->sSpecificDecEnergies.e_SAOC * 2);
+
if (cost < costPartBest)
{
@@ -1737,6 +1752,7 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][SAO_BO][classIdx + bestClassBO[compIdx]];
}
+ decEnergyC = (int64_t)m_param->sSpecificDecEnergies.e_SAOC * 2;
}
rateDist += (bestDist << 8) / lambda[1];
@@ -1749,7 +1765,9 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
m_entropyCoder.store(m_rdContexts.temp);
uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
- bestCost = rateDist + rate;
+ decEnergy = decEnergy + decEnergyC;
+ bestCost = rateDist + rate + decEnergyC;
+
}
else
{
diff --git a/source/encoder/sao.h b/source/encoder/sao.h
index c797ca7cc..2ef60e93b 100644
--- a/source/encoder/sao.h
+++ b/source/encoder/sao.h
@@ -123,13 +123,15 @@ public:
void calcSaoStatsCTU(int addr, int plane);
void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
- void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
- void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
+ void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost, int64_t &decEnergy);
+ void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost, int64_t &decEnergy);
- void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses);
+
+ void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses, int64_t decEnergy);
void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
- int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda);
+ int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda, int64_t energy=0);
+
void saoStatsInitialOffset(int addr, int planes);
friend class FrameFilter;
};
--
2.20.1.windows.1
-------------- next part --------------
From 3aa9debdd05184a12d816f84cd5b8a338d0042c7 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 14:44:18 +0200
Subject: [PATCH 14/14] DERDO: Adding explanations on eedecode tuning and
derdo-flag to cli.rst and presets.rst.
---
doc/reST/cli.rst | 14 +++++++++++++-
doc/reST/presets.rst | 21 ++++++++++++++++++++-
2 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index 1a1de9f50..c3d39ffc0 100755
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -392,7 +392,7 @@ Performance Options
be applied after :option:`--preset` but before all other parameters. Default none.
See :ref:`tunings <tunings>` for more detail.
- **Values:** psnr, ssim, grain, zero-latency, fast-decode, animation.
+ **Values:** psnr, ssim, grain, zero-latency, fast-decode, animation, eedecode.
.. option:: --slices <integer>
@@ -1197,6 +1197,18 @@ as the residual quad-tree (RQT).
gain in terms of objective quality metrics SSIM and PSNR. It only has effect
on presets which use RDO-based mode decisions (:option:`--rd` 3 and above).
+.. option:: --derdo
+
+ Enable decoding-energy-rate-distortion optimization (DERDO). In RD
+ calculations, the energy costs for decoding the bit stream are
+ additionally taken into account. To estimate the decoding energy costs,
+ specific energy coefficients are multiplied by the number of occurrences of certain
+ coding modes. In order to reduce the decoding energy, inter and intra
+ prediction, transforms, coefficient coding, and in-loop filters are taken
+ into account.
+
+ Default: disabled
+
Temporal / motion search options
================================
diff --git a/doc/reST/presets.rst b/doc/reST/presets.rst
index 7cabe8af2..0cd1c872e 100644
--- a/doc/reST/presets.rst
+++ b/doc/reST/presets.rst
@@ -130,6 +130,8 @@ after the preset.
+--------------+-----------------------------------------------------+
| animation | improves encode quality for animated content |
+--------------+-----------------------------------------------------+
+| eedecode | reduces the energy demand for a software decoder |
++--------------+-----------------------------------------------------+
@@ -215,4 +217,21 @@ quality for animation content without impacting the encode speed. This is done b
* :option:`--psy-rd` 0.4
* :option:`--aq-strength` 0.4
* :option:`--deblock` 1:1
- * :option:`--bframes` Increase by 2
\ No newline at end of file
+ * :option:`--bframes` Increase by 2
+
+
+Energy-Efficient Decoding
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:option:`--tune` *eedecode* adjusts encoder settings and performs decoding-energy-rate-distortion
+optimization (DERDO) with the goal of reducing the software decoding energy at a constant quality. The
+decoding energy is reduced by:
+
+ * :option:`--no-deblock`
+ * :option:`--no-weightp`
+ * :option:`--no-weightb`
+ * :option:`--no-b-intra`
+ * :option:`--derdo`
+ * :option:`--aq-strength` 0.0
+ * :option:`--psy-rd` 0.0
+ * :option:`--psy-rdoq` 0.0
--
2.20.1.windows.1
-------------- next part --------------
From 70e0485d974bbf8fddcf4d603a316846cb4c68b6 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:16:29 +0200
Subject: [PATCH 01/14] DERDO: Introduce derdo-flag to x265.h
---
source/x265.h | 45 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 42 insertions(+), 3 deletions(-)
diff --git a/source/x265.h b/source/x265.h
index f44040ba7..cfba3dd51 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -1308,7 +1308,12 @@ typedef struct x265_param
* value must be between 0 and 50, 1.0 is typical. Default 0 */
double psyRdoq;
- /* Perform quantisation parameter based RD refinement. RD cost is calculated
+ /* Enables decoding-energy-rate-distortion optimization (DERDO). Tuning option for
+ * energy-saving software decoding. See C. Herglotz, M. Bader, A. Kaup,
+ * "Decoding Energy Optimal Video Encoding for x265", MMSP 2020. Default: false */
+ bool derdo;
+
+ /* Perform quantisation parameter based RD refinement. RD cost is calculated
* on the best CU partitions, chosen after the CU analysis, for a range of QPs
* to find the optimal rounding effect. Only effective at rd-levels 5 and 6.
* Default disabled */
@@ -1380,7 +1385,7 @@ typedef struct x265_param
/* Ratefactor constant: targets a certain constant "quality".
* Acceptable values between 0 and 51. Default value: 28 */
double rfConstant;
-
+
/* Max QP difference between frames. Default: 4 */
int qpStep;
@@ -1792,6 +1797,40 @@ typedef struct x265_param
/* File containing base64 encoded SEI messages in POC order */
const char* naluFile;
+ /* Structure of specific decoding energy coefficients for DERDO (extension of RDO to consider the software decoding energy)*/
+ struct{
+ // Intra-Prediction modes
+ int e_intra32x32;
+ int e_intra16x16;
+ int e_intra8x8;
+ int e_intra4x4;
+ int e_trans32x32;
+ int e_trans16x16;
+ int e_trans8x8;
+ int e_trans4x4;
+ int e_IntraCUs;
+ // Coefficient coding
+ int e_coeff;
+ int e_val;
+ int e_PBslice;
+ int e_skip64x64; //Number of skipped CUs per depth
+ int e_skip8x82x32;
+ int e_skip16x16;
+ int e_skip8x8;
+ // number of inter coded CUs per depth
+ int e_inter64x64;
+ int e_inter32x32;
+ int e_inter32x326x16;
+ int e_inter8x8;
+ // Number of fracpel-predicted pixels [hor/ver][depth]
+ int e_fracpel;
+ int e_bi; // number of bipredicted 4x4-blocks
+ // SAO
+ int e_SAOY;
+ int e_SAOC;
+ int e_Bs;
+} sSpecificDecEnergies;
+
/* Generate bitstreams confirming to the specified dolby vision profile,
* note that 0x7C01 makes RPU appear to be an unspecified NAL type in
* HEVC stream. if BL is backward compatible, Dolby Vision single
@@ -2012,7 +2051,7 @@ static const char * const x265_preset_names[] = { "ultrafast", "superfast", "ver
* 100 times faster than placebo!
*
* Currently available tunings are: */
-static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "animation", 0 };
+static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "animation", "eedecode", 0 };
/* returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
--
2.20.1.windows.1
-------------- next part --------------
From 3a9e987e2314e2f7faa35db9bf404526bd00277e Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:18:13 +0200
Subject: [PATCH 02/14] DERDO: Add flags and descriptions to cli-files.
---
source/x265cli.cpp | 5 +++--
source/x265cli.h | 3 ++-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/source/x265cli.cpp b/source/x265cli.cpp
index c28dd7f8c..9a807e42c 100755
--- a/source/x265cli.cpp
+++ b/source/x265cli.cpp
@@ -110,7 +110,7 @@ namespace X265_NS {
H0("-p/--preset <string> Trade off performance for compression efficiency. Default medium\n");
H0(" ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
H0("-t/--tune <string> Tune the settings for a particular type of source or situation:\n");
- H0(" psnr, ssim, grain, zerolatency, fastdecode\n");
+ H0(" psnr, ssim, grain, zerolatency, fastdecode, eedecode\n");
H0("\nQuad-Tree size and depth:\n");
H0("-s/--ctu <64|32|16> Maximum CU size (WxH). Default %d\n", param->maxCUSize);
H0(" --min-cu-size <64|32|16|8> Minimum CU size (WxH). Default %d\n", param->minCUSize);
@@ -129,7 +129,8 @@ namespace X265_NS {
H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
H0(" --rskip <mode> Set mode for early exit from recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU edge density.\n"
" Mode 0: disabled. Default %d\n", param->recursionSkipMode);
- H1(" --rskip-edge-threshold Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
+ H0(" --derdo Enable decoding-energy-rate-distortion optimization (DERDO). Default %s\n", OPT(param->derdo));
+ H1(" --rskip-edge-threshold Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
H1(" --[no-]splitrd-skip Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
H1(" --nr-intra <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
diff --git a/source/x265cli.h b/source/x265cli.h
index a24d25435..bba7710ec 100644
--- a/source/x265cli.h
+++ b/source/x265cli.h
@@ -212,7 +212,8 @@ static const struct option long_options[] =
{ "no-psy-rdoq", no_argument, NULL, 0 },
{ "rd-refine", no_argument, NULL, 0 },
{ "no-rd-refine", no_argument, NULL, 0 },
- { "scaling-list", required_argument, NULL, 0 },
+ { "derdo", no_argument, NULL, 0 },
+ { "scaling-list", required_argument, NULL, 0 },
{ "lossless", no_argument, NULL, 0 },
{ "no-lossless", no_argument, NULL, 0 },
{ "no-signhide", no_argument, NULL, 0 },
--
2.20.1.windows.1
-------------- next part --------------
From 4d5b12de0e67884975edbdf5a2d62e92778cca11 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:19:38 +0200
Subject: [PATCH 03/14] DERDO: Add new copy-if-lower-than-macro for five
parameters (COPY5_IF_LT)
---
source/common/common.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/source/common/common.h b/source/common/common.h
index 8c06cd79e..945e34213 100644
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -202,6 +202,16 @@ typedef int16_t coeff_t; // transform coefficient
(c) = (d); \
(e) = (f); \
}
+#define COPY5_IF_LT(x, y, a, b, c, d, e, f, g, h) \
+ if ((y) < (x)) \
+ { \
+ (x) = (y); \
+ (a) = (b); \
+ (c) = (d); \
+ (e) = (f); \
+ (g) = (h); \
+ }
+
#define X265_MIN3(a, b, c) X265_MIN((a), X265_MIN((b), (c)))
#define X265_MAX3(a, b, c) X265_MAX((a), X265_MAX((b), (c)))
#define X265_MIN4(a, b, c, d) X265_MIN((a), X265_MIN3((b), (c), (d)))
--
2.20.1.windows.1
More information about the x265-devel
mailing list