[x265] [PATCH][MASTER] Support for Decoding-Energy-Rate-Distortion Optimization (DERDO)

Thu Oct 22 15:07:55 CEST 2020

-- 
Dr.-Ing. Christian Herglotz

Chair of Multimedia Communications and Signal Processing
Friedrich-Alexander University Erlangen-Nürnberg
Cauerstr. 7, D-91058 Erlangen, Germany
Tel. +49 9131 85-27117

-------------- next part --------------
From 969268805cbd2571856a0da6ee286b623143b4da Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:21:09 +0200
Subject: [PATCH 04/14] DERDO: Add new variable for decoding energy to cudata
 and define function to return the motion vector.

---
 source/common/cudata.cpp | 2 ++
 source/common/cudata.h   | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp
index 19281dee2..29d961f01 100644
--- a/source/common/cudata.cpp
+++ b/source/common/cudata.cpp
@@ -273,6 +273,8 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, const x26
         m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
         m_trCoeff[1] = m_trCoeff[0] + sizeL;
         m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+        m_dpartDecEnergy = 0;
+
         for (int i = 0; i < 3; i++)
             m_fAc_den[i] = m_fDc_den[i] = 0;
     }
diff --git a/source/common/cudata.h b/source/common/cudata.h
index 8397f0568..5f2a2c5a6 100644
--- a/source/common/cudata.h
+++ b/source/common/cudata.h
@@ -96,6 +96,7 @@ struct MVField
 {
     MV  mv;
     int refIdx;
+    MV  getMv () const 	{return mv;}
 };
 
 // Structure that keeps the neighbour's MV information.
@@ -229,6 +230,8 @@ public:
     uint64_t*       m_collectCURd;
     uint32_t*       m_collectCUVariance;
     uint32_t*       m_collectCUCount;
+    uint32_t		m_dpartDecEnergy;
+
 
     CUData();
 
-- 
2.20.1.windows.1

-------------- next part --------------
From b51e801b5479cf329f20a98fc963f74db2e7a473 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:24:04 +0200
Subject: [PATCH 05/14] DERDO: Counter for coefficient features in quant_c
 function

---
 source/common/dct.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/source/common/dct.cpp b/source/common/dct.cpp
index b102b6e31..2777c5057 100644
--- a/source/common/dct.cpp
+++ b/source/common/dct.cpp
@@ -661,7 +661,7 @@ static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCo
     }
 }
 
-static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff, bool countVal, uint32_t& ldVal)
 {
     X265_CHECK(qBits >= 8, "qBits less than 8\n");
     X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
@@ -676,8 +676,19 @@ static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t*
         int tmplevel = abs(level) * quantCoeff[blockpos];
         level = ((tmplevel + add) >> qBits);
         deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
-        if (level)
-            ++numSig;
+		if (level)
+        {
+            if (countVal)
+    		{
+       			int absCoeff = level;
+				for (; absCoeff > 1; absCoeff = absCoeff >> 1) {
+					if (absCoeff < 4 && (absCoeff & 1)) //consider second last bit
+						ldVal = ldVal + 1; //middle between two logs for more accuracy, using ~ double the value and divide by two later
+					ldVal = ldVal + 2;
+				}
+   			}
+   			++numSig;
+		}
         level *= sign;
         qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
     }
-- 
2.20.1.windows.1

-------------- next part --------------
From f38783ee6805b182e5fcdf58e136f453b9508b29 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:25:58 +0200
Subject: [PATCH 06/14] DERDO: Add functions to return absolute values of
 motion vectors.

---
 source/common/mv.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/common/mv.h b/source/common/mv.h
index 191090cb9..6bcf940cb 100644
--- a/source/common/mv.h
+++ b/source/common/mv.h
@@ -47,6 +47,8 @@ public:
     MV()                                       {}
     MV(int64_t w) : word(w)                    {}
     MV(int32_t _x, int32_t _y) : x(_x), y(_y)  {}
+    int   getAbsHor () const { return abs( x );   }
+    int   getAbsVer () const { return abs( y );   }
 
     MV& operator =(uint64_t w)                 { word = w; return *this; }
 
-- 
2.20.1.windows.1

-------------- next part --------------
From 251e1cd38f87773d8012923b2447b3a69f21afe3 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:30:20 +0200
Subject: [PATCH 07/14] DERDO: Adding flags and tuning definitions in param.cpp

---
 source/common/param.cpp | 51 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/source/common/param.cpp b/source/common/param.cpp
index 47a7a7c47..c653a5706 100755
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -258,6 +258,9 @@ void x265_param_default(x265_param* param)
     param->minVbvFullness = 50;
     param->maxVbvFullness = 80;
     param->rc.rfConstant = 28;
+
+    param->derdo = false;
+
     param->rc.bitrate = 0;
     param->rc.qCompress = 0.6;
     param->rc.ipFactor = 1.4f;
@@ -323,6 +326,38 @@ void x265_param_default(x265_param* param)
     param->confWinRightOffset = 0;
     param->confWinBottomOffset = 0;
 
+	//Intra Prediction decoding energies
+	param->sSpecificDecEnergies.e_intra32x32 = 416>>3;
+	param->sSpecificDecEnergies.e_intra16x16 = 134 >> 3;
+	param->sSpecificDecEnergies.e_intra8x8 = 57 >> 3;
+	param->sSpecificDecEnergies.e_intra4x4 = 20 >> 3;
+	// Transform coding decoding energies
+	param->sSpecificDecEnergies.e_trans32x32 = 1049 >> 3;
+	param->sSpecificDecEnergies.e_trans16x16 = 140 >> 3;
+	param->sSpecificDecEnergies.e_trans8x8 = 27 >> 3;
+	param->sSpecificDecEnergies.e_trans4x4 = 16 >> 3;
+	param->sSpecificDecEnergies.e_IntraCUs = 25 >> 3; 
+	// Coefficient coding decoding energies
+	param->sSpecificDecEnergies.e_coeff = 1;
+	param->sSpecificDecEnergies.e_val = 1;
+	// Skipped CU decoding energies
+	param->sSpecificDecEnergies.e_PBslice = 187 >> 3;
+	param->sSpecificDecEnergies.e_skip64x64 = 1650 >> 3;	
+	param->sSpecificDecEnergies.e_skip8x82x32 = 427 >> 3;
+	param->sSpecificDecEnergies.e_skip16x16 = 127 >> 3;
+	param->sSpecificDecEnergies.e_skip8x8 = 40 >> 3;
+	// Inter coded CU  decoding energies
+	param->sSpecificDecEnergies.e_inter64x64 = 1880 >> 3;
+	param->sSpecificDecEnergies.e_inter32x32 = 471 >> 3;
+	param->sSpecificDecEnergies.e_inter32x326x16 = 136 >> 3;
+	param->sSpecificDecEnergies.e_inter8x8 = 49 >> 3;
+	param->sSpecificDecEnergies.e_fracpel = 1;
+	param->sSpecificDecEnergies.e_bi = 1;		// number of bipredicted SCUs
+	// In-loop filter decoding energies
+	param->sSpecificDecEnergies.e_Bs = 1;
+	param->sSpecificDecEnergies.e_SAOY = 1175 >> 3;
+	param->sSpecificDecEnergies.e_SAOC = 369 >> 3;
+
     param->bEmitVUITimingInfo   = 1;
     param->bEmitVUIHRDInfo      = 1;
     param->bOptQpPPS            = 0;
@@ -621,6 +656,17 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
         else if (!strcmp(tune, "vmaf"))  /*Adding vmaf for x265 + SVT-HEVC integration support*/
         {
             /*vmaf is under development, currently x265 won't support vmaf*/
+        }
+		else if (!strcmp(tune, "eedecode"))  /*Adding energy efficient decoding tuning by LMS, Erlangen*/
+		{
+			param->bEnableLoopFilter = 0;
+			param->bEnableWeightedPred = 0;
+			param->bEnableWeightedBiPred = 0;
+			param->bIntraInBFrames = 0;
+			param->derdo = true;
+			param->rc.aqStrength = 0.0;
+			param->psyRd = 0.0;
+			param->psyRdoq = 0.0;
         }
         else
             return -1;
@@ -1014,6 +1060,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value)
         else
             p->psyRdoq = 0.0;
     }
+	OPT("derdo") p->derdo = atobool(value);
     OPT("rd-refine") p->bEnableRdRefine = atobool(value);
     OPT("signhide") p->bEnableSignHiding = atobool(value);
     OPT("b-intra") p->bIntraInBFrames = atobool(value);
@@ -1192,7 +1239,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value)
     OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
     OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
-    OPT("uhd-bd") p->uhdBluray = atobool(value);
+    //OPT("uhd-bd") p->uhdBluray = atobool(value);
     else
         bExtraParams = true;
 
@@ -2127,6 +2174,7 @@ char *x265_param2string(x265_param* p, int padx, int pady)
     s += sprintf(s, " rdpenalty=%d", p->rdPenalty);
     s += sprintf(s, " psy-rd=%.2f", p->psyRd);
     s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq);
+	BOOL(p->derdo, "derdo");
     BOOL(p->bEnableRdRefine, "rd-refine");
     BOOL(p->bLossless, "lossless");
     s += sprintf(s, " cbqpoffs=%d", p->cbQpOffset);
@@ -2510,6 +2558,7 @@ void x265_copy_params(x265_param* dst, x265_param* src)
     dst->rc.bEnableConstVbv = src->rc.bEnableConstVbv;
     dst->rc.hevcAq = src->rc.hevcAq;
     dst->rc.qpAdaptationRange = src->rc.qpAdaptationRange;
+	dst->derdo = src->derdo;
 
     dst->vui.aspectRatioIdc = src->vui.aspectRatioIdc;
     dst->vui.sarWidth = src->vui.sarWidth;
-- 
2.20.1.windows.1

-------------- next part --------------
From 7f8620298ed650eddc3044fdd332f82eba00183c Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:31:37 +0200
Subject: [PATCH 08/14] DERDO: Primitive declaration for coefficient counting
 in primitives.h

---
 source/common/primitives.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/common/primitives.h b/source/common/primitives.h
index 0b52f84de..34a2774cb 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -156,7 +156,7 @@ typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t*
 
 typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff, bool countLdVal, uint32_t& ldVal);
 typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
 typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
-- 
2.20.1.windows.1

-------------- next part --------------
From f242757353b05456a4d96e578f2b5def4f5e1b7a Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:33:35 +0200
Subject: [PATCH 09/14] DERDO: Definitions and corrected declarations for
 coefficient counting in quant.h and quant.cpp

---
 source/common/quant.cpp | 21 +++++++++++++++------
 source/common/quant.h   |  8 ++++----
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/source/common/quant.cpp b/source/common/quant.cpp
index 93462f51a..3197df414 100644
--- a/source/common/quant.cpp
+++ b/source/common/quant.cpp
@@ -395,7 +395,7 @@ uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSi
 }
 
 uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
-                             coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
+                             coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip, bool countLdVal, uint32_t& ldVal)
 {
     const uint32_t sizeIdx = log2TrSize - 2;
 
@@ -452,7 +452,8 @@ uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencS
     }
 
     if (m_rdoqLevel)
-        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
+		return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy, countLdVal, ldVal);
+
     else
     {
         int deltaU[32 * 32];
@@ -466,7 +467,7 @@ uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencS
         int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
         int numCoeff = 1 << (log2TrSize * 2);
 
-        uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff);
+        uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff, countLdVal, ldVal);
 
         if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled)
         {
@@ -607,7 +608,7 @@ void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiSt
 /* Rate distortion optimized quantization for entropy coding engines using
  * probability models like CABAC */
 template<uint32_t log2TrSize>
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy, bool countLdVal, uint32_t& ldVal)
 {
     const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
@@ -1261,7 +1262,15 @@ uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, ui
         int blkPos = codeParams.scan[pos];
         int level  = dstCoeff[blkPos];
         numSig += (level != 0);
-
+		if (level && countLdVal)
+		{
+			int absCoeff = abs(level);
+			for (; absCoeff > 1; absCoeff = absCoeff >> 1) {
+				if (absCoeff<4 && (absCoeff & 1)) //consider second last bit
+					ldVal = ldVal + 1; //middle between two logs for more accuracy, using ~ double the value and divide by two later
+				ldVal = ldVal + 2;
+			}
+		}
         uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
         dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
     }
@@ -1418,7 +1427,7 @@ uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, ui
             lastCG = 0;
         }
     }
-
+	ldVal >>= 1;
     return numSig;
 }
 
diff --git a/source/common/quant.h b/source/common/quant.h
index 21ec217db..4bcfd2b1c 100644
--- a/source/common/quant.h
+++ b/source/common/quant.h
@@ -106,8 +106,8 @@ public:
     /* CU setup */
     void setQPforQuant(const CUData& ctu, int qp);
 
-    uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
-                          uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
+	uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
+		uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip, bool countLdVal, uint32_t& ldVal);
 
     void invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
@@ -155,10 +155,10 @@ protected:
     uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
 
     template<uint32_t log2TrSize>
-    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+	uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy, bool countLdVal, uint32_t& ldVal);
 
 public:
-    typedef uint32_t (Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+	typedef uint32_t(Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy, bool countLdVal, uint32_t& ldVal);
 
 private:
     static rdoQuant_t rdoQuant_func[NUM_CU_DEPTH];
-- 
2.20.1.windows.1

-------------- next part --------------
From 6ece16d0c5e0ec519ef6218035071b28e8a3e874 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:35:48 +0200
Subject: [PATCH 10/14] DERDO: Energy estimation and cost calculation functions
 for analysis.cpp

---
 source/encoder/analysis.cpp | 111 ++++++++++++++++++++++++++++++++++--
 1 file changed, 106 insertions(+), 5 deletions(-)

diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp
index aabf386ca..0f49ad29d 100644
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -1382,12 +1382,18 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom&
             }
             nextContext->store(splitPred->contexts);
 
+            if(m_rdCost.m_decEnergyRD)
+                splitPred->sa8dBits += splitPred->decEnergy;
+            
             if (mightNotSplit)
                 addSplitFlagCost(*splitPred, cuGeom.depth);
             else if (m_param->rdLevel > 1)
                 updateModeCost(*splitPred);
             else
                 splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
+
+            if(m_rdCost.m_decEnergyRD)
+                splitPred->sa8dBits -= splitPred->decEnergy;
         }
         /* If analysis mode is simple do not Evaluate other modes */
         if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
@@ -2828,7 +2834,32 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
         }
-        tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
+
+        if(m_rdCost.m_decEnergyRD)
+        {
+            bool biPred = false;
+            if (candDir[i] == 3) // consider biprediction
+            {
+                tempPred->decEnergy += this->m_param->sSpecificDecEnergies.e_bi * ((pu.width*pu.height)>>4);
+                biPred = true;
+            }
+
+            // Fractional pel calculation
+            int decEnergyL0 = this->calcFracpelDecodingEnergySearch(candMvField[i][0].mv, pu.width, pu.height);
+            int decEnergyL1 = this->calcFracpelDecodingEnergySearch(candMvField[i][1].mv, pu.width, pu.height);
+
+            if (biPred)    
+                tempPred->decEnergy += decEnergyL0 + decEnergyL1;
+            else if (candDir[i] & 1)
+                tempPred->decEnergy += decEnergyL0;
+            else
+                tempPred->decEnergy += decEnergyL1;
+
+            tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits+tempPred->decEnergy);
+        }
+        else
+            tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
+
 
         if (tempPred->sa8dCost < bestPred->sa8dCost)
         {
@@ -2972,6 +3003,25 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
         {
             /* if the best prediction has CBF (not a skip) then try merge with residual */
 
+            /* Code for DERDO*/
+            bool biPred = false;
+            if (candDir[i] == 3) // consider biprediction
+            {
+                tempPred->decEnergy += this->m_param->sSpecificDecEnergies.e_bi * ((pu.width*pu.height)>>4);
+                biPred = true;
+            }
+
+            // Fractional pel calculation
+            int decEnergyL0 = this->calcFracpelDecodingEnergySearch(candMvField[i][0].mv, pu.width, pu.height);
+            int decEnergyL1 = this->calcFracpelDecodingEnergySearch(candMvField[i][1].mv, pu.width, pu.height);
+
+            if (biPred)
+                tempPred->decEnergy += decEnergyL0 + decEnergyL1;
+            else if (candDir[i] & 1)
+                tempPred->decEnergy += decEnergyL0;
+            else
+                tempPred->decEnergy += decEnergyL1;
+            /* END code for DERDO*/
             encodeResAndCalcRdInterCU(*tempPred, cuGeom);
             hasCbf = tempPred->cu.getQtRootCbf(0);
             foundCbf0Merge = !hasCbf;
@@ -2998,6 +3048,33 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
                 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
             }
 
+				/* Code DERDO*/
+                bool biPred = false;
+                if (candDir[i] == 3) // both reference lists used: charge the bi-prediction energy per 4x4 unit
+                {
+                    tempPred->decEnergy += this->m_param->sSpecificDecEnergies.e_bi * ((pu.width*pu.height)>>4);
+                    biPred = true;
+                }
+                // Fractional-pel energy, estimated separately for the list-0 and list-1 motion vectors
+                int decEnergyL0 = this->calcFracpelDecodingEnergySearch(candMvField[i][0].mv, pu.width, pu.height);
+                int decEnergyL1 = this->calcFracpelDecodingEnergySearch(candMvField[i][1].mv, pu.width, pu.height);
+                if (biPred)
+                    tempPred->decEnergy += decEnergyL0 + decEnergyL1;
+                else if (candDir[i] & 1)
+                    tempPred->decEnergy += decEnergyL0;
+                else
+                    tempPred->decEnergy += decEnergyL1;
+                if (m_param->bEnableLoopFilter)
+                {
+                    int nBorders = 0;
+                    if (tempPred->cu.m_cuPelX != 0 && tempPred->cu.m_cuPelX % 8 == 0)
+                        nBorders = 1;
+                    if (tempPred->cu.m_cuPelY != 0 && tempPred->cu.m_cuPelY % 8 == 0)
+                        nBorders = nBorders+1;
+                    tempPred->decEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3-nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+                }
+                /* END code DERDO */
+
             encodeResAndCalcRdSkipCU(*tempPred);
 
             if (tempPred->rdCost < bestPred->rdCost)
@@ -3058,6 +3135,8 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
     }
     predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);
 
+    interMode.predDecEnergy = interMode.decEnergy; 
+
     /* predInterSearch sets interMode.sa8dBits */
     const Yuv& fencYuv = *interMode.fencYuv;
     Yuv& predYuv = interMode.predYuv;
@@ -3068,7 +3147,11 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
     }
-    interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
+
+        if(m_rdCost.m_decEnergyRD) 
+            interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits+interMode.decEnergy);
+        else
+            interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
 
     if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
     {
@@ -3123,6 +3206,7 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
     }
 
     predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
+    interMode.predDecEnergy = interMode.decEnergy;
 
     /* predInterSearch sets interMode.sa8dBits, but this is ignored */
     encodeResAndCalcRdInterCU(interMode, cuGeom);
@@ -3177,6 +3261,8 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom&
     cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
     cu.m_mergeFlag[0] = 0;
 
+    bidir2Nx2N.decEnergy = inter2Nx2N.predDecEnergy;
+
     /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
     cu.setPUMv(0, bestME[0].mv, 0, 0);
     cu.m_mvd[0][0] = bestME[0].mv - mvp0;
@@ -3186,6 +3272,9 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom&
 
     PredictionUnit pu(cu, cuGeom, 0);
     motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
+    bidir2Nx2N.decEnergy += ((pu.width*pu.height)>>4)*m_param->sSpecificDecEnergies.e_bi; 
+
+
 
     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
     if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
@@ -3241,13 +3330,20 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom&
         uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
         uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
         uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
-
+        int zDecEnergy = ((pu.width*pu.height)>>4)*m_param->sSpecificDecEnergies.e_bi;
+        zDecEnergy += this->calcFracpelDecodingEnergySearch(bestME[0].mv, pu.width, pu.height );
+		zDecEnergy += this->calcFracpelDecodingEnergySearch(bestME[1].mv, pu.width, pu.height );
         /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
         mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
         mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);
 
         uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
-        zcost = zsa8d + m_rdCost.getCost(zbits);
+
+        if(m_rdCost.m_decEnergyRD)
+            zcost = zsa8d + m_rdCost.getCost(zbits+ zDecEnergy);
+        else
+            zcost = zsa8d + m_rdCost.getCost(zbits);
+
 
         if (zcost < bidir2Nx2N.sa8dCost)
         {
@@ -3416,7 +3512,12 @@ void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
     else if (m_param->rdLevel <= 1)
     {
         mode.sa8dBits++;
-        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
+        if(m_rdCost.m_decEnergyRD)
+            mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits+mode.decEnergy);
+        else
+            mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
     }
     else
     {
-- 
2.20.1.windows.1

-------------- next part --------------
From 9e29018311f85f9825ed788fcea327f19e810b80 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:36:48 +0200
Subject: [PATCH 11/14] DERDO: Introducing DERDO-cost calculation flag and
 modified lambda calculation.

---
 source/encoder/rdcost.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/source/encoder/rdcost.h b/source/encoder/rdcost.h
index 1bd4dd696..9d1aaffee 100644
--- a/source/encoder/rdcost.h
+++ b/source/encoder/rdcost.h
@@ -44,6 +44,9 @@ public:
     uint32_t  m_ssimRd;
     int       m_qp; /* QP used to configure lambda, may be higher than QP_MAX_SPEC but <= QP_MAX_MAX */
 
+    uint32_t  		m_decEnergyRD;
+
+
     void setPsyRdScale(double scale)                { m_psyRdBase = (uint32_t)floor(65536.0 * scale * 0.33); }
     void setSsimRd(int ssimRd) { m_ssimRd = ssimRd; };
 
@@ -51,6 +54,13 @@ public:
     {
         x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
         m_qp = qp;
+
+        if(slice.m_param->derdo)
+        	m_decEnergyRD = true;
+        else 
+            m_decEnergyRD = false; 
+
+
         setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
 
         /* Scale PSY RD factor by a slice type factor */
@@ -92,8 +102,13 @@ public:
 
     void setLambda(double lambda2, double lambda)
     {
-        m_lambda2 = (uint64_t)floor(256.0 * lambda2);
-        m_lambda = (uint64_t)floor(256.0 * lambda);
+    	if(m_decEnergyRD){
+            m_lambda2 = (uint64_t)floor(224.0 * lambda2);
+            m_lambda = (uint64_t)floor(224.0 * lambda);
+    	}else{
+            m_lambda2 = (uint64_t)floor(256.0 * lambda2);
+            m_lambda = (uint64_t)floor(256.0 * lambda);
+    	}
     }
 
     inline uint64_t calcRdCost(sse_t distortion, uint32_t bits) const
-- 
2.20.1.windows.1

-------------- next part --------------
From 5f9cb95413deb6adbd95f121bdb4989f86b829d3 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:44:21 +0200
Subject: [PATCH 12/14] DERDO: Cost calculations and function definitions for
 counting coefficients + fractional pels in search.cpp and search.h.

---
 source/encoder/search.cpp | 809 ++++++++++++++++++++++++++++++++++----
 source/encoder/search.h   |  30 +-
 2 files changed, 746 insertions(+), 93 deletions(-)

diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index dab11fc79..ebc48d792 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -313,6 +313,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
     bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
     bool bEnableRDOQ  = !!m_param->rdoqLevel;
 
+    int dSingleDecEnergyY = 0; 
+    uint32_t TrSize = 64 >> fullDepth;
+
     /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
     if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
     {
@@ -356,9 +359,36 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
         primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
 
-        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+		uint32_t ldVal = 0;
+		uint32_t numSig;
+        if(m_rdCost.m_decEnergyRD){
+            if (m_param->bEnableLoopFilter)
+            {
+                int nBorders = 0;
+                if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+                    nBorders = 1;
+                if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+                    nBorders = nBorders + 1;
+                fullCost.decEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+            }
+            switch (fullDepth) {
+            case 1: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra32x32;
+            break;
+            case 2: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra16x16;
+            break;
+            case 3: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra8x8;
+            break;
+            case 4: fullCost.decEnergy += m_param->sSpecificDecEnergies.e_intra4x4;
+            break;
+            }
+        }
+		numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
+
         if (numSig)
         {
+            dSingleDecEnergyY = calcCoeffDecodingEnergy(numSig, ldVal, TrSize);
+            fullCost.decEnergy += dSingleDecEnergyY;
+
             m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
             bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
             bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
@@ -415,7 +445,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
 
         if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
             fullCost.bits *= 4;
-
+        if(m_rdCost.m_decEnergyRD){
+            fullCost.bits += fullCost.decEnergy;
+        }
         if (m_rdCost.m_psyRd)
         {
             fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
@@ -431,6 +463,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
     }
     else
         fullCost.rdcost = MAX_INT64;
+    if(m_rdCost.m_decEnergyRD){
+        fullCost.bits -= fullCost.decEnergy;
+    }
 
     if (mightSplit)
     {
@@ -466,13 +501,18 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
             m_entropyCoder.resetBits();
             m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
             splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
-
+            if(m_rdCost.m_decEnergyRD){
+                splitCost.bits += splitCost.decEnergy;
+            }
             if (m_rdCost.m_psyRd)
                 splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
             else if(m_rdCost.m_ssimRd)
                 splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
             else
                 splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
+            if(m_rdCost.m_decEnergyRD){
+                splitCost.bits -= splitCost.decEnergy;
+            }
         }
 
         if (splitCost.rdcost < fullCost.rdcost)
@@ -481,7 +521,8 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
             outCost.distortion += splitCost.distortion;
             outCost.bits       += splitCost.bits;
             outCost.energy     += splitCost.energy;
-            return;
+			outCost.decEnergy  += splitCost.decEnergy;
+			return;
         }
         else
         {
@@ -500,7 +541,7 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth,
     pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
     intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
-
+    outCost.decEnergy    += fullCost.decEnergy;
     outCost.rdcost     += fullCost.rdcost;
     outCost.distortion += fullCost.distortion;
     outCost.bits       += fullCost.bits;
@@ -560,6 +601,8 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
         uint64_t tmpCost;
         uint32_t tmpEnergy = 0;
 
+        double tmpDecEnergy = 0; 
+
         coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
         pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
         bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
@@ -567,7 +610,31 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
 
         primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
 
-        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
+        uint32_t ldVal = 0; // handed to transformNxN below
+        uint32_t numSig;
+        if (m_rdCost.m_decEnergyRD) { // DERDO: charge this pass via tmpDecEnergy, the value that enters the RD cost and is kept for the winning pass
+            if (m_param->bEnableLoopFilter)
+            {
+                int nBorders = 0; // number of CU edges (left, top) counted for deblocking
+                if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+                    nBorders = 1;
+                if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+                    nBorders = nBorders + 1;
+                tmpDecEnergy += nBorders ? ((1 << cuGeom.log2CUSize) >> (3 - nBorders)) * m_param->sSpecificDecEnergies.e_Bs : 0; // one e_Bs per 4-sample segment of each counted edge; CU size comes from log2CUSize so CTU sizes other than 64 are handled
+            }
+            switch (log2TrSize) { // keyed on the real TU size; a depth-based lookup is only right for 64x64 CTUs
+            case 5: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra32x32;
+            break;
+            case 4: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra16x16;
+            break;
+            case 3: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra8x8;
+            break;
+            case 2: tmpDecEnergy += m_param->sSpecificDecEnergies.e_intra4x4;
+            break;
+            }
+        }
+        numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip, m_rdCost.m_decEnergyRD, ldVal);
+
         if (numSig)
         {
             m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
@@ -635,6 +702,10 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
         if (!useTSkip)
             m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
 
+        if(m_rdCost.m_decEnergyRD){
+            tmpBits += tmpDecEnergy;
+        }
+
         if (m_rdCost.m_psyRd)
         {
             tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
@@ -648,6 +719,10 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
         else
             tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
 
+        if(m_rdCost.m_decEnergyRD){
+            tmpBits -= tmpDecEnergy;
+        }
+
         if (tmpCost < fullCost.rdcost)
         {
             bTSkip = useTSkip;
@@ -656,7 +731,8 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
             fullCost.distortion = tmpDist;
             fullCost.bits = tmpBits;
             fullCost.energy = tmpEnergy;
-        }
+			fullCost.decEnergy = tmpDecEnergy;
+		}
     }
 
     if (bTSkip)
@@ -681,6 +757,7 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDep
     outCost.distortion += fullCost.distortion;
     outCost.bits += fullCost.bits;
     outCost.energy += fullCost.energy;
+	outCost.decEnergy += fullCost.decEnergy;
 }
 
 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
@@ -727,7 +804,9 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3
         pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
         intptr_t picStride = reconPic->m_stride;
 
-        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+		uint32_t ldVal = 0;
+		uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
+
         if (numSig)
         {
             m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
@@ -821,6 +900,9 @@ void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDept
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     bool bEnableRDOQ = !!m_param->rdoqLevel;
+    int dSingleDecEnergyU = 0; 
+    int dSingleDecEnergyV = 0;
+    uint32_t ldVal = 0;
 
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
@@ -840,6 +922,8 @@ void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDept
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
     uint32_t tuDepthC = tuDepth;
+    uint32_t TrSizeC = log2TrSizeC < 2 ? 4 : 1 << log2TrSizeC;
+
     if (log2TrSizeC < 2)
     {
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
@@ -906,9 +990,21 @@ void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDept
 
             primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
 
-            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+			ldVal = 0;
+			uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+
             if (numSig)
             {
+                if (chromaId ==1) 
+                {
+                    dSingleDecEnergyU = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+                    outCost.decEnergy += dSingleDecEnergyU;
+                }
+                else if(chromaId == 2)
+                {
+                    dSingleDecEnergyV = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+                    outCost.decEnergy += dSingleDecEnergyV;
+                }
                 m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                 bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                 bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
@@ -952,6 +1048,11 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
     const uint32_t log2TrSizeC = 2;
     uint32_t qtLayer = log2TrSize - 2;
 
+    int dSingleDecEnergyU = 0; // coefficient decoding energy of the current Cb TU
+    int dSingleDecEnergyV = 0; // coefficient decoding energy of the current Cr TU
+    uint32_t TrSizeC = 1 << log2TrSizeC; // chroma TU width: must match the log2TrSizeC given to transformNxN, not the luma log2TrSize
+    uint32_t ldVal = 0; // handed to transformNxN, then passed on to calcCoeffDecodingEnergy
+
     /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
      * so the entropy coder is not very accurate. The best we can do is return it in the same
      * condition as it arrived, and to do all bit estimates from the same state. */
@@ -999,6 +1100,7 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
             sse_t bDist = 0;
             uint32_t bCbf = 0;
             uint32_t bEnergy = 0;
+            double bDecEnergy = 0; 
             int      bTSkip = 0;
 
             int checkTransformSkip = 1;
@@ -1010,9 +1112,19 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
 
                 primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
 
-                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
+				ldVal = 0;
+				uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip, m_rdCost.m_decEnergyRD, ldVal);
+                
                 if (numSig)
                 {
+                    if (chromaId ==1) 
+                    {
+                        dSingleDecEnergyU = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+                    }
+                    else if(chromaId == 2)
+                    {
+                        dSingleDecEnergyV = calcCoeffDecodingEnergy(numSig, ldVal, TrSizeC);
+                    }
                     m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                     bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
                     bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
@@ -1037,8 +1149,18 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
                 uint32_t tmpBits = 0, tmpEnergy = 0;
+                double tmpDecEnergy = 0;
+
                 if (numSig)
                 {
+                    if (chromaId ==1) 
+                    {
+                        tmpDecEnergy += dSingleDecEnergyU;
+                    }
+                    else if(chromaId == 2)
+                    {
+                        tmpDecEnergy += dSingleDecEnergyV;
+                    }
                     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
                     m_entropyCoder.resetBits();
                     m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
@@ -1046,6 +1168,9 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
                 }
 
                 uint64_t tmpCost;
+                if(m_rdCost.m_decEnergyRD){
+                    tmpBits += tmpDecEnergy;
+                }
                 if (m_rdCost.m_psyRd)
                 {
                     tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
@@ -1059,6 +1184,10 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
                 else
                     tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
 
+                if(m_rdCost.m_decEnergyRD){
+                    tmpBits -= tmpDecEnergy;
+                }
+
                 if (tmpCost < bCost)
                 {
                     bCost = tmpCost;
@@ -1066,7 +1195,8 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
                     bTSkip = useTSkip;
                     bCbf = !!numSig;
                     bEnergy = tmpEnergy;
-                }
+					bDecEnergy = tmpDecEnergy;
+				}
             }
 
             if (bTSkip)
@@ -1085,6 +1215,7 @@ void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuD
 
             outCost.distortion += bDist;
             outCost.energy += bEnergy;
+            outCost.decEnergy += bDecEnergy;
         }
     }
     while (tuIterator.isNextSection());
@@ -1134,6 +1265,8 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t ab
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
 
+    uint32_t ldVal=0;
+
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
@@ -1205,7 +1338,9 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t ab
 
             primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
 
-            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
+			ldVal = 0;
+			uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+
             if (numSig)
             {
                 m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
@@ -1244,6 +1379,8 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize
     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
     intraMode.initCosts();
+    intraMode.decEnergy += m_param->sSpecificDecEnergies.e_IntraCUs;
+
     intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
     if (m_csp != X265_CSP_I400)
     {
@@ -1298,6 +1435,10 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
     cu.setPartSizeSubParts(SIZE_2Nx2N);
     cu.setPredModeSubParts(MODE_INTRA);
 
+
+    double intraDecEnergy = 0;
+    double bintraDecEnergy = 0;
+
     const uint32_t initTuDepth = 0;
     uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
     uint32_t tuSize = 1 << log2TrSize;
@@ -1359,7 +1500,23 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
     bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
     bmode = mode = DC_IDX;
     bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
-    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+    if(m_rdCost.m_decEnergyRD){ // DERDO: add the intra prediction energy of this block size to the SAD cost
+        switch(scaleTuSize){ // keyed on the predicted block size in samples, like the size-keyed lookup in estIntraPredQT
+        case 32:                bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+        break;
+        case 16:                bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+        break;
+        case 8:                 bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+        break;
+        case 4:                 bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+        break;
+        }
+        bcost = m_rdCost.calcRdSADCost(bsad, bbits+bintraDecEnergy);
+    }
+    else
+        bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
 
     // PLANAR
     pixel* planar = intraNeighbourBuf[0];
@@ -1370,8 +1527,25 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
     sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
     mode = PLANAR_IDX;
     bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
-    cost = m_rdCost.calcRdSADCost(sad, bits);
-    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+    if(m_rdCost.m_decEnergyRD){ // same size-keyed energy as for DC, so all modes are charged alike
+        switch(scaleTuSize){
+        case 32:                intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+        break;
+        case 16:                intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+        break;
+        case 8:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+        break;
+        case 4:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+        break;
+        }
+        cost = m_rdCost.calcRdSADCost(sad, bits+intraDecEnergy);
+        COPY5_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits, bintraDecEnergy, intraDecEnergy);
+    }else{
+        cost = m_rdCost.calcRdSADCost(sad, bits);
+        COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+    }
+
 
     bool allangs = true;
     if (primitives.cu[sizeIdx].intra_pred_allangs)
@@ -1383,32 +1557,60 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
         allangs = false;
 
 #define TRY_ANGLE(angle) \
-    if (allangs) { \
-        if (angle < 18) \
+        if (allangs) { \
+            if (angle < 18) \
             sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
-        else \
+            else \
             sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
-        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
-        cost = m_rdCost.calcRdSADCost(sad, bits); \
-    } else { \
-        int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
-        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
-        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
-        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
-        cost = m_rdCost.calcRdSADCost(sad, bits); \
-    }
-
+            bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
+            if(m_rdCost.m_decEnergyRD){ \
+                switch(scaleTuSize){ /* energy keyed on the predicted block size in samples */ \
+                case 32: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32; \
+                break; \
+                case 16: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16; \
+                break; \
+                case 8: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8; \
+                break; \
+                case 4: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4; \
+                break; } \
+                cost = m_rdCost.calcRdSADCost(sad, bits+ intraDecEnergy); \
+        }\
+            else \
+                cost = m_rdCost.calcRdSADCost(sad, bits); \
+        } else { \
+            int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
+            primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
+            sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
+            bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
+            if(m_rdCost.m_decEnergyRD){ \
+               switch(scaleTuSize){ /* energy keyed on the predicted block size in samples */ \
+                case 32: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32; \
+                break; \
+                case 16: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16; \
+                break; \
+                case 8: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8; \
+                break; \
+                case 4: intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4; \
+                break;} \
+                cost = m_rdCost.calcRdSADCost(sad, bits+ intraDecEnergy); \
+            } \
+            else \
+                cost = m_rdCost.calcRdSADCost(sad, bits); \
+        }
     if (m_param->bEnableFastIntra)
     {
         int asad = 0;
         uint32_t lowmode, highmode, amode = 5, abits = 0;
+
+        double aintraDecEnergy = 0;
         uint64_t acost = MAX_INT64;
 
         /* pick the best angle, sampling at distance of 5 */
         for (mode = 5; mode < 35; mode += 5)
         {
             TRY_ANGLE(mode);
-            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+            COPY5_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
         }
 
         /* refine best angle at distance 2, then distance 1 */
@@ -1419,27 +1621,32 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
 
             X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
             TRY_ANGLE(lowmode);
-            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+            COPY5_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
 
             X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
             TRY_ANGLE(highmode);
-            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+            COPY5_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
         }
 
         if (amode == 33)
         {
             TRY_ANGLE(34);
-            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+            COPY5_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits, aintraDecEnergy, intraDecEnergy);
+
         }
+        COPY5_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits, bintraDecEnergy, aintraDecEnergy);
+
 
-        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
     }
     else // calculate and search all intra prediction angles for lowest cost
     {
         for (mode = 2; mode < 35; mode++)
         {
             TRY_ANGLE(mode);
-            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+            COPY5_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits, bintraDecEnergy, intraDecEnergy);
+
         }
     }
 
@@ -1449,6 +1656,8 @@ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
     intraMode.distortion = bsad;
     intraMode.sa8dCost = bcost;
     intraMode.sa8dBits = bbits;
+
+    intraMode.decEnergy = bintraDecEnergy;
 }
 
 void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
@@ -1469,7 +1678,7 @@ void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
     Cost icosts;
     codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
     extractIntraResultQT(cu, *reconYuv, 0, 0);
-
+    intraMode.decEnergy += icosts.decEnergy;
     intraMode.lumaDistortion = icosts.distortion;
     if (m_csp != X265_CSP_I400)
     {
@@ -1530,6 +1739,9 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
     {
         uint32_t bmode = 0;
 
+        double intraDecEnergy = 0;
+        double bintraDecEnergy = 0;
+
         if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
             bmode = intraMode.cu.m_lumaIntraDir[puIdx];
         else
@@ -1572,7 +1784,24 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
                 primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                 uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                 uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
-                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
+                
+
+                if(m_rdCost.m_decEnergyRD){
+	                switch(tuSize){ 
+                    case 32:                 bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+                    break;
+                    case 16:                 bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+                    break;
+                    case 8:                 bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+                   break;
+                   case 4:                 bintraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+                   break;
+                   }
+                    modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits+bintraDecEnergy);
+				}
+                else
+                    modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
+
 
                 // PLANAR
                 pixel* planar = intraNeighbourBuf[0];
@@ -1582,8 +1811,24 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
                 primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                 bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                 sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
-                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
-                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
+
+                if(m_rdCost.m_decEnergyRD){
+                  switch(tuSize){
+                  case 32:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+                  break;
+                  case 16:                    intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+                  break;
+                  case 8:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+                  break;
+                  case 4:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+                  break;
+                  }
+                    modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits+ intraDecEnergy);
+                    COPY2_IF_LT(bcost, modeCosts[PLANAR_IDX], bintraDecEnergy, intraDecEnergy);
+                }else{
+                    modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
+                    COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
+                }
 
                 // angular predictions
                 if (primitives.cu[sizeIdx].intra_pred_allangs)
@@ -1597,8 +1842,25 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
                             sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                         else
                             sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
-                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
-                        COPY1_IF_LT(bcost, modeCosts[mode]);
+
+                        if(m_rdCost.m_decEnergyRD) {
+                            switch(tuSize){
+                            case 32:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+                            break;
+                            case 16:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+                            break;
+                            case 8:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+                            break;
+                            case 4:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+                            break;
+                            }
+                            modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits + intraDecEnergy);
+                            COPY2_IF_LT(bcost, modeCosts[mode], bintraDecEnergy, intraDecEnergy);
+                        }else{
+                            modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+                            COPY1_IF_LT(bcost, modeCosts[mode]);
+                        }
+
                     }
                 }
                 else
@@ -1609,8 +1871,25 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
                         int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                         primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                         sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
-                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
-                        COPY1_IF_LT(bcost, modeCosts[mode]);
+
+                        if(m_rdCost.m_decEnergyRD) {
+                            switch(tuSize){
+                            case 32:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra32x32;
+                            break;
+                            case 16:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra16x16;
+                            break;
+                            case 8:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra8x8;
+                            break;
+                            case 4:                 intraDecEnergy = m_param->sSpecificDecEnergies.e_intra4x4;
+                            break;
+                            }
+                            modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits + intraDecEnergy );
+                            COPY2_IF_LT(bcost, modeCosts[mode], bintraDecEnergy, intraDecEnergy);
+                        }else{
+                            modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+                            COPY1_IF_LT(bcost, modeCosts[mode]);
+                        }
+
                     }
                 }
 
@@ -1661,6 +1940,7 @@ sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32
         else
             codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
         totalDistortion += icosts.distortion;
+        intraMode.decEnergy += icosts.decEnergy;
 
         extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
 
@@ -1825,8 +2105,17 @@ sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
             codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
             codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
             uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
+
+
+            if(m_rdCost.m_decEnergyRD){
+                bits += outCost.decEnergy;
+            }
             uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
-                                             : m_rdCost.calcRdCost(outCost.distortion, bits);
+                    : m_rdCost.calcRdCost(outCost.distortion, bits);
+            if(m_rdCost.m_decEnergyRD){
+                bits -= outCost.decEnergy;
+            }
+
 
             if (cost < bestCost)
             {
@@ -1888,7 +2177,7 @@ sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
 }
 
 /* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
-uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
+uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m, int& decEnergy)
 {
     X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
 
@@ -1943,12 +2232,22 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const Predict
 
         motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
 
+        int candDecEnergy = 0; /* decoding energy of this merge candidate only; must not carry over to the next one */
+        if (m_rdCost.m_decEnergyRD) {
+            if (cu.m_interDir[0] == 1 || cu.m_interDir[0] == 3)
+                candDecEnergy += calcFracpelDecodingEnergySearch(cu.m_mv[0][pu.puAbsPartIdx], pu.width, pu.height);
+            if (cu.m_interDir[0] == 2 || cu.m_interDir[0] == 3)
+                candDecEnergy += calcFracpelDecodingEnergySearch(cu.m_mv[1][pu.puAbsPartIdx], pu.width, pu.height);
+            if (cu.m_interDir[0] == 3)
+                candDecEnergy += ((pu.width * pu.height) >> 4) * m_param->sSpecificDecEnergies.e_bi;
+        }
         uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
         if (m_me.bChromaSATD)
             costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
 
         uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
+        costCand = costCand + m_rdCost.getCost(bitsCand + candDecEnergy); /* energy is charged as extra bits; it is 0 when DERDO is off */
         if (costCand < outCost)
         {
+            decEnergy = candDecEnergy; /* report the energy of the winning candidate, not a sum over all candidates */
             outCost = costCand;
@@ -2199,8 +2498,17 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
     MergeData merge;
     memset(&merge, 0, sizeof(merge));
     bool useAsMVP = false;
+
+    int dCuDecEnergy = 0; /* decoding energy summed over all PUs of this CU */
+
     for (int puIdx = 0; puIdx < numPart; puIdx++)
     {
+        int decEnergy[2] = { 0, 0 }; /* energy of the best uni-directional candidate, per list */
+        /* Zero-initialised: list 1 reads entries that the list 0 search may never have written */
+        int decEnergyTempL0[MAX_NUM_REF] = { 0 };
+        int decEnergyBi = 0;
+        int dinterPUenergy = 0;
+
         MotionData* bestME = interMode.bestME[puIdx];
         PredictionUnit pu(cu, cuGeom, puIdx);
         m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
@@ -2218,7 +2526,10 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                 useAsMVP = true;
         }
         /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
-        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
+        int dMergeDecEnergy = 0; 
+        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge, dMergeDecEnergy); 
+
+
         bestME[0].cost = MAX_UINT;
         bestME[1].cost = MAX_UINT;
 
@@ -2232,6 +2543,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
         {
             for (int list = 0; list < numPredDir; list++)
             {
+                int partDecEnergy = 0;
+
 
                 int ref = -1;
                 if (useAsMVP)
@@ -2268,6 +2581,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                 int satdCost;
                 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
                     mvpIn = bestME[list].mv;
+                if(list == 1) 
+                    partDecEnergy=decEnergyTempL0[ref];
                 if (useAsMVP && m_param->mvRefine > 1)
                 {
                     MV bestmv, mvpSel[3];
@@ -2309,7 +2624,16 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                 /* Get total cost of partition, but only include MV bit cost once */
                 bits += m_me.bitcost(outmv);
                 uint32_t mvCost = m_me.mvcost(outmv);
-                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+
+                uint32_t cost;
+                if(m_rdCost.m_decEnergyRD){
+                    partDecEnergy += calcFracpelDecodingEnergySearch( outmv,  pu.width, pu.height);
+                    cost = (satdCost - mvCost) + m_rdCost.getCost(bits+partDecEnergy);
+                }
+                else
+                    cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+
+
                 /* Refine MVP selection, updates: mvpIdx, bits, cost */
                 if (!(m_param->analysisMultiPassRefine || useAsMVP))
                     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
@@ -2323,11 +2647,19 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                         mvpIdx = !mvpIdx;
                         uint32_t origOutBits = bits;
                         bits = origOutBits + diffBits;
-                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
+                        if(m_rdCost.m_decEnergyRD)
+                            cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits + partDecEnergy);
+                        else
+                            cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
+
                     }
                     mvp = amvp[mvpIdx];
                 }
 
+                if(list == 0)
+                    decEnergyTempL0[ref] = partDecEnergy;
+
+
                 if (cost < bestME[list].cost)
                 {
                     bestME[list].mv = outmv;
@@ -2393,7 +2725,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                 for (int ref = 0; ref < numRefIdx[list]; ref++)
                 {
                     ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
-
+                    int partDecEnergy= 0;
                     if (!(refMask & (1 << ref)))
                     {
                         ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
@@ -2446,7 +2778,13 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                     /* Get total cost of partition, but only include MV bit cost once */
                     bits += m_me.bitcost(outmv);
                     uint32_t mvCost = m_me.mvcost(outmv);
-                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+                    uint32_t cost;
+                    if(m_rdCost.m_decEnergyRD){
+                        partDecEnergy += calcFracpelDecodingEnergySearch( outmv,  pu.width, pu.height);
+                        cost = (satdCost - mvCost) + m_rdCost.getCost(bits+ partDecEnergy);
+                    }
+                    else
+                        cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                     /* Update LowresMVP to best AMVP cand*/
                     if (bLowresMVP)
                         updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
@@ -2454,6 +2792,9 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                     /* Refine MVP selection, updates: mvpIdx, bits, cost */
                     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
 
+                    if(list == 0)
+                        decEnergyTempL0[ref] = partDecEnergy;
+
                     if (cost < bestME[list].cost)
                     {
                         bestME[list].mv      = outmv;
@@ -2463,6 +2804,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                         bestME[list].cost    = cost;
                         bestME[list].bits    = bits;
                         bestME[list].mvCost  = mvCost;
+                        decEnergy[list] = partDecEnergy; 
+
                     }
                 }
                 /* the second list ref bits start at bit 16 */
@@ -2479,6 +2822,12 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
             cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
             bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
         {
+
+            int partDecEnergy = 0;
+            int dBiEnergyTemp = ((pu.width*pu.height)>>4)*m_param->sSpecificDecEnergies.e_bi;
+
+
+
             bidir[0] = bestME[0];
             bidir[1] = bestME[1];
 
@@ -2512,6 +2861,18 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
             bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
             bidirCost = satdCost + m_rdCost.getCost(bidirBits);
 
+            if (m_rdCost.m_decEnergyRD) {
+                int RefPOCa = cu.m_slice->m_refPOCList[0][(int8_t)bidir[0].ref];
+                int RefPOCb = cu.m_slice->m_refPOCList[1][(int8_t)bidir[1].ref];
+                if (!(RefPOCa == RefPOCb && bidir[0].mv == bidir[1].mv)) /* identical motion (same picture, same MV) adds no energy */
+                {
+                    partDecEnergy += dBiEnergyTemp;
+                    partDecEnergy += calcFracpelDecodingEnergySearch(bidir[0].mv, pu.width, pu.height);
+                    partDecEnergy += calcFracpelDecodingEnergySearch(bidir[1].mv, pu.width, pu.height);
+                }
+                bidirCost += m_rdCost.getCost(partDecEnergy);
+                decEnergyBi = partDecEnergy; /* keep it: this candidate may be the bidir mode finally selected */
+            }
             bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
             if (bTryZero)
             {
@@ -2563,6 +2924,20 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                 mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
                 mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
 
+                /* Start from zero: partDecEnergy may still hold the energy of the searched bidir candidate */
+                partDecEnergy = 0;
+                if (m_rdCost.m_decEnergyRD) {
+                    int RefPOCa = cu.m_slice->m_refPOCList[0][(int8_t)bidir[0].ref];
+                    int RefPOCb = cu.m_slice->m_refPOCList[1][(int8_t)bidir[1].ref];
+                    /* Both MVs are zero here, so the motion is identical exactly when both lists use the same picture */
+                    if (RefPOCa != RefPOCb)
+                    {
+                        partDecEnergy += dBiEnergyTemp;
+                        partDecEnergy += calcFracpelDecodingEnergySearch(mvzero, pu.width, pu.height); /* the candidate MV, not the ref index */
+                        partDecEnergy += calcFracpelDecodingEnergySearch(mvzero, pu.width, pu.height);
+                    }
+                    cost += m_rdCost.getCost(partDecEnergy);
+                }
                 if (cost < bidirCost)
                 {
                     bidir[0].mv = mvzero;
@@ -2573,6 +2948,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
                     bidir[1].mvpIdx = mvpIdx1;
                     bidirCost = cost;
                     bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+                    decEnergyBi = partDecEnergy; 
                 }
             }
         }
@@ -2589,6 +2965,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
             cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
 
             totalmebits += merge.bits;
+            dCuDecEnergy += dMergeDecEnergy; 
         }
         else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
         {
@@ -2607,6 +2984,11 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
             cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
 
             totalmebits += bidirBits;
+
+            dCuDecEnergy += decEnergyBi+ dinterPUenergy;
+
+
+
         }
         else if (bestME[0].cost <= bestME[1].cost)
         {
@@ -2623,6 +3005,8 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
             cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
 
             totalmebits += bestME[0].bits;
+            dCuDecEnergy += decEnergy[0]+dinterPUenergy;
+
         }
         else
         {
@@ -2639,10 +3023,13 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
             cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
 
             totalmebits += bestME[1].bits;
+            dCuDecEnergy += decEnergy[1]+ dinterPUenergy;
+
         }
 
         motionCompensation(cu, pu, *predYuv, true, bChromaMC);
     }
+    interMode.decEnergy += dCuDecEnergy;
     interMode.sa8dBits += totalmebits;
 }
 
@@ -2807,7 +3194,13 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
     interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
     interMode.coeffBits = 0;
     interMode.totalBits = interMode.mvBits + skipFlagBits;
-    if (m_rdCost.m_psyRd)
+	switch (depth) {
+	case 0: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip64x64; break;
+	case 1: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip8x82x32; break;
+	case 2: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip16x16; break;
+	case 3: interMode.decEnergy += this->m_param->sSpecificDecEnergies.e_skip8x8; break;
+	}
+	if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
     else if(m_rdCost.m_ssimRd)
         interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
@@ -2848,6 +3241,33 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
         memset(&m_cacheTU, 0, sizeof(TUInfoCache));
 
     Cost costs;
+    int32_t inter0Energy = 0;
+    int32_t skipEnergy = 0;
+    if (m_rdCost.m_decEnergyRD) {
+       switch (depth) {
+       case 0:     inter0Energy = this->m_param->sSpecificDecEnergies.e_inter64x64;
+           skipEnergy = this->m_param->sSpecificDecEnergies.e_skip64x64;break;
+       case 1:     inter0Energy = this->m_param->sSpecificDecEnergies.e_inter32x32;
+           skipEnergy = this->m_param->sSpecificDecEnergies.e_skip8x82x32;break;
+       case 2:     inter0Energy = this->m_param->sSpecificDecEnergies.e_inter32x326x16;
+           skipEnergy = this->m_param->sSpecificDecEnergies.e_skip16x16;break;
+       case 3:     inter0Energy = this->m_param->sSpecificDecEnergies.e_inter8x8;
+           skipEnergy = this->m_param->sSpecificDecEnergies.e_skip8x8;break;
+       }
+   
+       costs.decEnergy += inter0Energy; 
+       if (m_param->bEnableLoopFilter)
+       {
+           int nBorders = 0;
+           if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+               nBorders = 1;
+           if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+               nBorders = nBorders + 1;
+           int addecEnergy = nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+           skipEnergy += addecEnergy;
+           inter0Energy += addecEnergy;
+       }
+    }
     if (m_limitTU & X265_TU_LIMIT_NEIGH)
     {
         /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
@@ -2881,7 +3301,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
         m_entropyCoder.resetBits();
         m_entropyCoder.codeQtRootCbfZero();
         uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
-
+        cbf0Bits += (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) ? skipEnergy : inter0Energy;
         uint32_t cbf0Energy; uint64_t cbf0Cost;
         if (m_rdCost.m_psyRd)
         {
@@ -2900,6 +3320,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
         {
             cu.clearCbf();
             cu.setTUDepthSubParts(0, 0, depth);
+            costs.decEnergy = (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) ? skipEnergy : inter0Energy;
         }
     }
 
@@ -2969,6 +3390,10 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
     interMode.lumaDistortion = bestLumaDist;
     interMode.coeffBits = coeffBits;
     interMode.mvBits = mvBits;
+    interMode.decEnergy += costs.decEnergy;
+
+
+
     cu.m_distortion[0] = interMode.distortion;
     updateModeCost(interMode);
     checkDQP(interMode, cuGeom);
@@ -3017,7 +3442,9 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
         uint32_t strideResiY = resiYuv.m_size;
 
         const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
-        uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+        uint32_t ldVal=0;
+        uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal); 
+
 
         if (numSigY)
         {
@@ -3051,7 +3478,9 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
 
                 int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
                 const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
-                uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
+                ldVal = 0;
+                uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+                
                 if (numSigU)
                 {
                     m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
@@ -3065,7 +3494,9 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
 
                 int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
                 const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
-                uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
+                ldVal=0;
+                uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
+               
                 if (numSigV)
                 {
                     m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
@@ -3110,11 +3541,12 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3
         }
     }
 }
-
-uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
+uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId, double decEnergy)
 {
     uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
-
+    if(m_rdCost.m_decEnergyRD){
+        nullBits += decEnergy;
+    }
     if (m_rdCost.m_psyRd)
         return m_rdCost.calcPsyRdCost(dist, nullBits, energy);
     else if(m_rdCost.m_ssimRd)
@@ -3165,6 +3597,10 @@ bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint
     uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
     splitCost.bits += splitCbfBits;
 
+    if(m_rdCost.m_decEnergyRD){
+        splitCost.bits += splitCost.decEnergy;
+    }
+
     if (m_rdCost.m_psyRd)
         splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
     else if(m_rdCost.m_ssimRd)
@@ -3172,6 +3608,9 @@ bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint
     else
         splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
         
+    if(m_rdCost.m_decEnergyRD)
+        splitCost.bits -= splitCost.decEnergy;
+
     return ycbf || ucbf || vcbf;
 }
 
@@ -3244,7 +3683,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
     uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
-
+    double dSingleDecEnergyY = 0; 
+    double dSingleDecEnergyU = 0;
+    double dSingleDecEnergyV = 0;
+    double dSingleDecEnergy = 0;
+    uint32_t TrSize = 1 << (log2TrSize);
+    uint32_t TrSizeC = 1 << log2TrSizeC;
+    uint32_t ldVal = 0;
     m_entropyCoder.store(m_rqt[depth].rqtRoot);
 
     uint32_t trSize = 1 << log2TrSize;
@@ -3274,9 +3719,15 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
 
         const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
         int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
-        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
+        ldVal=0;
+        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false, m_rdCost.m_decEnergyRD, ldVal);
         cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
 
+        if (numSig[TEXT_LUMA][0] && m_rdCost.m_decEnergyRD)
+        {
+            dSingleDecEnergyY = calcCoeffDecodingEnergy(numSig[TEXT_LUMA][0], ldVal, TrSize);
+        }
+
         m_entropyCoder.resetBits();
 
         if (bSplitPresentFlag && log2TrSize > depthRange[0])
@@ -3316,6 +3767,11 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
             const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
             uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
             uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
+
+            if(m_rdCost.m_decEnergyRD){
+                nzCbfBitsY += dSingleDecEnergyY;
+            }
+
             if (m_rdCost.m_psyRd)
             {
                 nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
@@ -3329,6 +3785,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
             else
                 singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
 
+            if(m_rdCost.m_decEnergyRD){
+                nzCbfBitsY -= dSingleDecEnergyY;
+            }
+
             if (cu.m_tqBypass[0])
             {
                 singleDist[TEXT_LUMA][0] = nonZeroDistY;
@@ -3354,6 +3814,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                         minCost[TEXT_LUMA][0] = nullCostY;
                     singleDist[TEXT_LUMA][0] = zeroDistY;
                     singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
+                    dSingleDecEnergyY = 0;
                 }
                 else
                 {
@@ -3366,8 +3827,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         }
         else
         {
-            if (checkTransformSkipY)
-                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
+            if (checkTransformSkipY){
+                if(m_rdCost.m_decEnergyRD)
+                    minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA, dSingleDecEnergyY);
+                else
+                    minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
+            }
+
             primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
             singleDist[TEXT_LUMA][0] = zeroDistY;
             singleBits[TEXT_LUMA][0] = 0;
@@ -3399,9 +3865,22 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
 
                     fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
                     resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+                    ldVal = 0;
+                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false, m_rdCost.m_decEnergyRD, ldVal);
                     cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
 
+                    if (m_rdCost.m_decEnergyRD && numSig[chromaId][tuIterator.section] > 0)
+                    {
+                        if (chromaId ==1)
+                        {
+                            dSingleDecEnergyU = calcCoeffDecodingEnergy(numSig[chromaId][tuIterator.section], ldVal, TrSizeC);
+                        }
+                        else if(chromaId == 2)
+                        {
+                            dSingleDecEnergyV = calcCoeffDecodingEnergy(numSig[chromaId][tuIterator.section], ldVal, TrSizeC);
+                        }
+                    }
+
                     uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
                     if (cbfFlag[chromaId][tuIterator.section])
                         m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
@@ -3445,7 +3924,26 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                             singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
                         }
                         else
-                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+                        {
+                            /* Plain RD only: the TU's decoding energy is charged as extra bits and removed again afterwards */
+                            if (chromaId == 1)
+                            {
+                                nzCbfBitsC += dSingleDecEnergyU;
+                            }
+                            else if (chromaId == 2)
+                            {
+                                nzCbfBitsC += dSingleDecEnergyV;
+                            }
+                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+                            if (chromaId == 1)
+                            {
+                                nzCbfBitsC -= dSingleDecEnergyU;
+                            }
+                            else if (chromaId == 2)
+                            {
+                                nzCbfBitsC -= dSingleDecEnergyV;
+                            }
+                        }
 
                         if (cu.m_tqBypass[0])
                         {
@@ -3477,13 +3975,25 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                                     minCost[chromaId][tuIterator.section] = singleCostC;
                                 singleDist[chromaId][tuIterator.section] = nonZeroDistC;
                                 singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
+                                if (chromaId ==1)
+                                    dSingleDecEnergy += dSingleDecEnergyU;
+                                if (chromaId ==2)
+                                    dSingleDecEnergy += dSingleDecEnergyV;
                             }
                         }
                     }
                     else
                     {
-                        if (checkTransformSkipC)
-                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
+                        if (checkTransformSkipC){
+                            if(m_rdCost.m_decEnergyRD){
+                                if(chromaId ==1)
+                                    minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId, dSingleDecEnergyU);
+                                else
+                                    minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId, dSingleDecEnergyV);
+                            }else{
+                                minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
+                            }
+                        }
                         primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
                         singleBits[chromaId][tuIterator.section] = 0;
                         singleDist[chromaId][tuIterator.section] = zeroDistC;
@@ -3524,10 +4034,14 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
 
             fenc = fencYuv->getLumaAddr(absPartIdx);
             resi = resiYuv.getLumaAddr(absPartIdx);
-            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
+            ldVal = 0;
+            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true, m_rdCost.m_decEnergyRD, ldVal);
 
             if (numSigTSkipY)
             {
+
+                dSingleDecEnergyY = calcCoeffDecodingEnergy(numSigTSkipY, ldVal, TrSize);//( cuGeom.depth , TrSize, TrSize, TEXT_LUMA, m_tsCoeff, MODE_INTER );
+                dSingleDecEnergy += dSingleDecEnergyY;
                 m_entropyCoder.resetBits();
                 m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
                 m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
@@ -3540,18 +4054,33 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                 primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
                 nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
 
-                if (m_rdCost.m_psyRd)
-                {
-                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
-                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
-                }
-                else if(m_rdCost.m_ssimRd)
-                {
-                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
-                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+                if(m_rdCost.m_decEnergyRD){
+                    if (m_rdCost.m_psyRd)
+                    {
+                        nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
+                        singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY+dSingleDecEnergyY, nonZeroEnergyY);
+                    }
+                    else if(m_rdCost.m_ssimRd)
+                    {
+                        nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
+                        singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY+dSingleDecEnergyY, nonZeroEnergyY);
+                    }
+                    else
+                        singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY+dSingleDecEnergyY);
+                }else{
+                    if (m_rdCost.m_psyRd)
+                    {
+                        nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
+                        singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+                    }
+                    else if(m_rdCost.m_ssimRd)
+                    {
+                        nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
+                        singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+                    }
+                    else
+                        singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
                 }
-                else
-                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
             }
 
             if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
@@ -3601,13 +4130,24 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
 
                     fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
                     resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
+                    ldVal = 0;
+                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true, m_rdCost.m_decEnergyRD, ldVal);
 
                     m_entropyCoder.resetBits();
                     singleBits[chromaId][tuIterator.section] = 0;
 
                     if (numSigTSkipC)
                     {
+                        if (m_rdCost.m_decEnergyRD && chromaId ==1)
+                        {
+                            dSingleDecEnergyU = calcCoeffDecodingEnergy(numSigTSkipC, ldVal, TrSizeC);
+                            dSingleDecEnergy += dSingleDecEnergyU;
+                        }
+                        else if(m_rdCost.m_decEnergyRD && chromaId == 2)
+                        {
+                            dSingleDecEnergyV = calcCoeffDecodingEnergy(numSigTSkipC, ldVal, TrSizeC);
+                            dSingleDecEnergy += dSingleDecEnergyV;
+                        }
                         m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
                         m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                         singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
@@ -3618,6 +4158,18 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                         bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
                         primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
                         nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
+
+                        if(m_rdCost.m_decEnergyRD){
+                            if (chromaId ==1)
+                            {
+                                singleBits[chromaId][tuIterator.section]+= dSingleDecEnergyU;
+                            }
+                            else if(chromaId == 2)
+                            {
+                                singleBits[chromaId][tuIterator.section]+= dSingleDecEnergyV;
+                            }
+                        }
+
                         if (m_rdCost.m_psyRd)
                         {
                             nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
@@ -3630,6 +4182,17 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                         }
                         else
                             singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
+
+                        if(m_rdCost.m_decEnergyRD){
+                            if (chromaId ==1)
+                            {
+                                singleBits[chromaId][tuIterator.section]-= dSingleDecEnergyU;
+                            }
+                            else if(chromaId == 2)
+                            {
+                                singleBits[chromaId][tuIterator.section]-= dSingleDecEnergyV;
+                            }
+                        }
                     }
 
                     if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
@@ -3695,6 +4258,21 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         // For that reason, I am collecting individual coefficient bits only.
         fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
 
+        if(m_rdCost.m_decEnergyRD){
+            fullCost.decEnergy+=dSingleDecEnergy;
+            if (m_param->bEnableLoopFilter)
+            {
+                int nBorders = 0;
+                if (cu.m_cuPelX != 0 && cu.m_cuPelX % 8 == 0)
+                    nBorders = 1;
+                if (cu.m_cuPelY != 0 && cu.m_cuPelY % 8 == 0)
+                    nBorders = nBorders + 1;
+                fullCost.decEnergy += nBorders ? ((64 >> cuGeom.depth) >> (3 - nBorders))*m_param->sSpecificDecEnergies.e_Bs : 0; // Considering DBF boundaries top and left (each boundary has 4 pixels, two boundaries because of left and top)
+            }
+
+            fullCost.bits+= fullCost.decEnergy+ outCosts.decEnergy;
+        }
+
         fullCost.distortion += singleDist[TEXT_LUMA][0];
         fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
@@ -3710,6 +4288,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
         else
             fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
 
+        if(m_rdCost.m_decEnergyRD){
+            fullCost.bits-= fullCost.decEnergy + outCosts.decEnergy;
+        }
+
         if (m_param->limitTU && bCheckSplit)
         {
             // Stop recursion if the TU's energy level is minimal
@@ -3773,6 +4355,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
             splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
         }
 
+        splitCost.decEnergy = outCosts.decEnergy;
         bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
         if (yCbCrCbf || !bCheckFull)
         {
@@ -3785,6 +4368,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                     if (nextSplit)
                     {
                         m_entropyCoder.load(m_rqt[depth].rqtRoot);
+                        splitCost.decEnergy = outCosts.decEnergy;
                         splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
                         if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
                         {
@@ -3797,9 +4381,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
                     }
                 }
                 outCosts.distortion += splitCost.distortion;
-                outCosts.rdcost     += splitCost.rdcost;
-                outCosts.bits       += splitCost.bits;
-                outCosts.energy     += splitCost.energy;
+                outCosts.rdcost  += splitCost.rdcost;
+                outCosts.bits      += splitCost.bits;
+                outCosts.energy  += splitCost.energy;
+                outCosts.decEnergy = splitCost.decEnergy; 
                 return;
             }
             else
@@ -3851,9 +4436,10 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa
     }
 
     outCosts.distortion += fullCost.distortion;
-    outCosts.rdcost     += fullCost.rdcost;
-    outCosts.bits       += fullCost.bits;
-    outCosts.energy     += fullCost.energy;
+    outCosts.rdcost  += fullCost.rdcost;
+    outCosts.bits      += fullCost.bits;
+    outCosts.energy  += fullCost.energy;
+    outCosts.decEnergy    += fullCost.decEnergy;
 }
 
 void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
@@ -3989,7 +4575,11 @@ void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
             else if (m_param->rdLevel <= 1)
             {
                 mode.sa8dBits++;
-                mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+                if(m_rdCost.m_decEnergyRD)
+                    mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits + mode.decEnergy );
+                else
+                    mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
             }
             else
             {
@@ -4032,7 +4622,11 @@ void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
             else if (m_param->rdLevel <= 1)
             {
                 mode.sa8dBits++;
-                mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+                if(m_rdCost.m_decEnergyRD)
+                    mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits + mode.decEnergy);
+                else
+                    mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
+
             }
             else
             {
@@ -4048,3 +4642,44 @@ void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
             cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
     }
 }
+
+int Search::calcFracpelDecodingEnergySearch( MV cMv, uint32_t width, uint32_t height) // DERDO: decoding-energy estimate for sub-pel interpolation of a width x height block with motion vector cMv
+{
+    int decEnergy = 0;
+    bool bHor = false; // set once the horizontal MV component turns out to be fractional
+    int partSize = (width*height) >> 4; // width*height divided by 16 (rounded down)
+	// e_fracpel is charged per 16-sample unit (partSize) for each direction that needs filtering
+    if (cMv.getAbsHor()%4 != 0) // Horizontal filtering: component is not a multiple of 4 (presumably quarter-pel units, i.e. a sub-pel position -- confirm against MV)
+    {
+        decEnergy += partSize * this->m_param->sSpecificDecEnergies.e_fracpel;
+        bHor = true;
+    }
+    if (cMv.getAbsVer()%4 != 0) // Vertical filtering: same test on the vertical component
+    {
+        decEnergy += partSize * this->m_param->sSpecificDecEnergies.e_fracpel;
+        if (bHor)// Both components fractional: additional horizontal filtering, charged as (6*height)/16 units
+        {
+            decEnergy += ((6*height)>>4) * this->m_param->sSpecificDecEnergies.e_fracpel;
+        }
+    }
+    return decEnergy; // stays 0 when both components are multiples of 4
+}
+
+int Search::calcCoeffDecodingEnergy(uint32_t numSig, uint32_t ldVal, int trWidth) // DERDO: decoding-energy estimate for the coefficients of one transform block of width trWidth
+{
+    // Block energies
+    int dCoeffEnergy = 0;
+
+    dCoeffEnergy += m_param->sSpecificDecEnergies.e_coeff * numSig; // e_coeff charged once per counted coefficient (numSig)
+    dCoeffEnergy += m_param->sSpecificDecEnergies.e_val * ldVal /16; // value-dependent part; NOTE(review): ldVal is whatever transformNxN accumulated (name suggests log2 magnitudes) -- confirm there why it is scaled by 1/16
+
+    switch(trWidth){ // adds the e_transNxN constant matching trWidth; any other width adds nothing
+    case 32:    dCoeffEnergy += m_param->sSpecificDecEnergies.e_trans32x32;break;
+    case 16:    dCoeffEnergy += m_param->sSpecificDecEnergies.e_trans16x16;break;
+    case 8:     dCoeffEnergy += m_param->sSpecificDecEnergies.e_trans8x8;break;
+    case 4:     dCoeffEnergy += m_param->sSpecificDecEnergies.e_trans4x4;break;
+    }
+
+    return dCoeffEnergy; // sum of the three contributions above
+}
+
diff --git a/source/encoder/search.h b/source/encoder/search.h
index 02bd6e647..1cda9aa0f 100644
--- a/source/encoder/search.h
+++ b/source/encoder/search.h
@@ -127,6 +127,9 @@ struct Mode
     uint32_t    mvBits;     // Mv bits + Ref + block type (or intra mode)
     uint32_t    coeffBits;  // Texture bits (DCT Coeffs)
 
+    uint32_t	decEnergy; //DERDO
+    uint32_t    predDecEnergy;
+
     void initCosts()
     {
         rdCost = 0;
@@ -141,6 +144,9 @@ struct Mode
         totalBits = 0;
         mvBits = 0;
         coeffBits = 0;
+
+        decEnergy = 0; //DERDO
+        predDecEnergy = 0;
     }
 
     void addSubCosts(const Mode& subMode)
@@ -157,6 +163,8 @@ struct Mode
         totalBits += subMode.totalBits;
         mvBits += subMode.mvBits;
         coeffBits += subMode.coeffBits;
+
+        decEnergy += subMode.decEnergy;
     }
 };
 
@@ -382,7 +390,8 @@ protected:
         uint32_t bits;
         sse_t distortion;
         uint32_t energy;
-        Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
+        uint32_t decEnergy;
+        Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; decEnergy = 0;}
     };
 
     struct TUInfoCache
@@ -393,7 +402,7 @@ protected:
         Entropy rqtStore[NUM_SUBPART];
     } m_cacheTU;
 
-    uint64_t estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId);
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId, double decEnergy=0);
     bool     splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], int32_t splitMore = -1);
 
@@ -423,10 +432,13 @@ protected:
     int       selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref);
     const MV& checkBestMVP(const MV amvpCand[2], const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const;
     void     setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
-    uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
+    uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m, int& decEnergy);
     static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
     void      updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP);
 
+	int 	calcFracpelDecodingEnergySearch(MV cMv, uint32_t width, uint32_t height);
+	int     calcCoeffDecodingEnergy(uint32_t numSig, uint32_t ldVal, int trWidth); 
+
     /* intra helper functions */
     enum { MAX_RD_INTRA_MODES = 16 };
     static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
@@ -434,9 +446,15 @@ protected:
     // get most probable luma modes for CU part, and bit cost of all non mpm modes
     uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
 
-    void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy)
-                                                : (m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(m.distortion, m.totalBits, m.ssimEnergy) 
-                                                : m_rdCost.calcRdCost(m.distortion, m.totalBits)); }
+    void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_decEnergyRD ? // recomputes m.rdCost; DERDO branch first: m.decEnergy is added to the bits argument of each cost function
+    		(m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits+m.decEnergy, m.psyEnergy)
+            : (m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(m.distortion, m.totalBits+m.decEnergy, m.ssimEnergy)
+            : m_rdCost.calcRdCost(m.distortion, m.totalBits+m.decEnergy))): // psy-rd is checked before ssim-rd, plain RD cost is the fallback
+			  (m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) // DERDO disabled: same psy / ssim / plain selection on m.totalBits alone
+            : (m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(m.distortion, m.totalBits, m.ssimEnergy)
+            : m_rdCost.calcRdCost(m.distortion, m.totalBits))); }
+
+
 };
 }
 
-- 
2.20.1.windows.1

-------------- next part --------------
From 1b6be642cb62b06505361ab8e2b2ff6dd70fc536 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:46:29 +0200
Subject: [PATCH 13/14] DERDO: Cost calculations and function modifications for
 considering SAO energy.

---
 source/encoder/sao.cpp | 62 +++++++++++++++++++++++++++---------------
 source/encoder/sao.h   | 10 ++++---
 2 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp
index 0c46ece53..9498ea6fb 100644
--- a/source/encoder/sao.cpp
+++ b/source/encoder/sao.cpp
@@ -1272,6 +1272,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
     memset(m_offset, 0, sizeof(m_offset));
     int64_t bestCost = 0;
     int64_t rateDist = 0;
+	int64_t decEnergy = 0; 
 
     bool bAboveLeftAvail = true;
     for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
@@ -1292,7 +1293,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
         {
             calcSaoStatsCTU(addr, 0);
             saoStatsInitialOffset(addr, 0);
-            saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+            saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost, decEnergy);
         }
     }
 
@@ -1304,7 +1305,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
             calcSaoStatsCTU(addr, 1);
             calcSaoStatsCTU(addr, 2);
             saoStatsInitialOffset(addr, 1);
-            saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+            saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost, decEnergy);
         }
     }
     if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
@@ -1314,7 +1315,7 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
         {
             if (!allowMerge[mergeIdx])
                 continue;
-
+			int64_t mergeDecEnergy = 0; 
             int64_t mergeDist = 0; 
             for (int plane = 0; plane < planes; plane++)
             {
@@ -1329,7 +1330,11 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
                         int mergeOffset = mergeSrcParam->offset[classIdx];
                         estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos]);
                     }
-                }
+                    if(plane == 0)
+						mergeDecEnergy += (int64_t)m_param->sSpecificDecEnergies.e_SAOY; 
+                    else
+						mergeDecEnergy += (int64_t)m_param->sSpecificDecEnergies.e_SAOC;
+				}
                 mergeDist += (estDist << 8) / lambda[!!plane];
             }
 
@@ -1342,6 +1347,8 @@ void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
 
             uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
             int64_t mergeCost = mergeDist + estRate;
+
+			mergeCost += mergeDecEnergy; 
             if (mergeCost < bestCost)
             {
                 SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
@@ -1432,7 +1439,7 @@ void SAO::saoStatsInitialOffset(int addr, int planes)
     }
 }
 
-inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda)
+inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda, int64_t decEnergy) 
 {
 #if X265_DEPTH < 10
         X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
@@ -1443,17 +1450,18 @@ inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t la
                    "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
                    distortion, bits, lambda);
 #endif
-        return distortion + ((bits * lambda + 128) >> 8);
+		return distortion + ((bits * lambda + 128) >> 8) + ((decEnergy * lambda + 128) >> 8);
+
 }
 
-void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses)
+void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses, int64_t decEnergy)
 {
     int bestOffset = 0;
     distClasses    = 0;
 
     // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit.
     // entropy coder can be used to measure the exact rate here.
-    int64_t bestCost = calcSaoRdoCost(0, 1, lambda);
+    int64_t bestCost = calcSaoRdoCost(0, 1, lambda, 0);
     while (offset != 0)
     {
         // Calculate the bits required for signalling the offset
@@ -1463,7 +1471,7 @@ void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offs
 
         // Do the dequntization before distorion calculation
         int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg);
-        int64_t cost  = calcSaoRdoCost(dist, rate, lambda);
+        int64_t cost  = calcSaoRdoCost(dist, rate, lambda, decEnergy);
         if (cost < bestCost)
         {
             bestCost = cost;
@@ -1476,7 +1484,7 @@ void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offs
     costClasses = bestCost;
     offset = bestOffset;
 }
-void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
+void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost, int64_t &decEnergy)
 {
     Slice* slice = m_frame->m_encData->m_slice;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
@@ -1491,7 +1499,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
     m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.resetBits();
     m_entropyCoder.codeSaoType(0);
-    int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+    int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0], 0);
     int maxSaoType;
     if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
         (slice->m_sliceType == B_SLICE)))
@@ -1512,7 +1520,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
             int32_t&  count    = m_count[0][typeIdx][classIdx];
             int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
             int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
-            estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+            estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOY);
 
             //Calculate distortion
             estDist += distClasses[classIdx];
@@ -1522,13 +1530,14 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
         m_entropyCoder.resetBits();
         m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);
 
-        int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+        int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0], (int64_t) m_param->sSpecificDecEnergies.e_SAOY);
 
         if (cost < costPartBest)
         {
             costPartBest = cost;
             bestDist = estDist;
             bestTypeIdx = typeIdx;
+			decEnergy = (int64_t)m_param->sSpecificDecEnergies.e_SAOY;
         }
     }
 
@@ -1549,7 +1558,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
         int32_t& offsetOrg = m_offsetOrg[0][SAO_BO][classIdx];
         int32_t& offsetOut = m_offset[0][SAO_BO][classIdx];
 
-        estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+        estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOY);
     }
 
     // Estimate Best Position
@@ -1580,7 +1589,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
     m_entropyCoder.resetBits();
     m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);
 
-    int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+    int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0], (int64_t)m_param->sSpecificDecEnergies.e_SAOY);
 
     if (cost < costPartBest)
     {
@@ -1592,6 +1601,8 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
         lclCtuParam->bandPos = bestClassBO;
         for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
             lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO];
+
+		decEnergy = (int64_t)m_param->sSpecificDecEnergies.e_SAOY;
     }
 
     rateDist = (bestDist << 8) / lambda[0];
@@ -1604,7 +1615,7 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r
         bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits();
     }
 }
-void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
+void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost, int64_t &decEnergy)
 {
     Slice* slice = m_frame->m_encData->m_slice;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
@@ -1621,7 +1632,9 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
     m_entropyCoder.codeSaoType(0);
 
     uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
-    int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]);
+	uint32_t decEnergyC = 0;
+	int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1], 0);
+
     int maxSaoType;
     if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
         (slice->m_sliceType == B_SLICE)))
@@ -1645,7 +1658,7 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
                 int32_t& offsetOrg = m_offsetOrg[compIdx][typeIdx][classIdx];
                 int32_t& offsetOut = m_offset[compIdx][typeIdx][classIdx];
 
-                estIterOffset(typeIdx, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+                estIterOffset(typeIdx, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOC);
 
                 estDist[compIdx - 1] += distClasses[classIdx];
             }
@@ -1658,13 +1671,14 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
             m_entropyCoder.codeSaoOffsetEO(m_offset[compIdx + 1][typeIdx] + 1, typeIdx, compIdx + 1);
 
         uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
-        int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);
+        int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1], (int64_t)m_param->sSpecificDecEnergies.e_SAOC*2);
 
         if (cost < costPartBest)
         {
             costPartBest = cost;
             bestDist = (estDist[0] + estDist[1]);
             bestTypeIdx = typeIdx;
+			decEnergyC = (int64_t)m_param->sSpecificDecEnergies.e_SAOC*2;
         }
     }
 
@@ -1694,7 +1708,7 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
             int32_t& offsetOrg = m_offsetOrg[compIdx][SAO_BO][classIdx];
             int32_t& offsetOut = m_offset[compIdx][SAO_BO][classIdx];
 
-            estIterOffset(SAO_BO, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+			estIterOffset(SAO_BO, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx], (int64_t)m_param->sSpecificDecEnergies.e_SAOC);
         }
 
         for (int i = 0; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
@@ -1722,7 +1736,8 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
         m_entropyCoder.codeSaoOffsetBO(m_offset[compIdx + 1][SAO_BO] + bestClassBO[compIdx], bestClassBO[compIdx], compIdx + 1);
 
     uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
-    int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);
+	int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1], (int64_t)m_param->sSpecificDecEnergies.e_SAOC * 2);
+
 
     if (cost < costPartBest)
     {
@@ -1737,6 +1752,7 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
             for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                 lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][SAO_BO][classIdx + bestClassBO[compIdx]];
         }
+		decEnergyC = (int64_t)m_param->sSpecificDecEnergies.e_SAOC * 2;
     }
 
     rateDist += (bestDist << 8) / lambda[1];
@@ -1749,7 +1765,9 @@ void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t&
         m_entropyCoder.store(m_rdContexts.temp);
 
         uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
-        bestCost = rateDist + rate;
+		decEnergy = decEnergy + decEnergyC;
+		bestCost = rateDist + rate + decEnergyC;
+
     }
     else
     {
diff --git a/source/encoder/sao.h b/source/encoder/sao.h
index c797ca7cc..2ef60e93b 100644
--- a/source/encoder/sao.h
+++ b/source/encoder/sao.h
@@ -123,13 +123,15 @@ public:
     void calcSaoStatsCTU(int addr, int plane);
     void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
 
-    void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
-    void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
+	void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost, int64_t &decEnergy);
+	void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost, int64_t &decEnergy);
 
-    void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses);
+
+    void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses, int64_t decEnergy);
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
     void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
-    int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda);
+	int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda, int64_t energy=0);
+
     void saoStatsInitialOffset(int addr, int planes);
     friend class FrameFilter;
 };
-- 
2.20.1.windows.1

-------------- next part --------------
From 3aa9debdd05184a12d816f84cd5b8a338d0042c7 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 14:44:18 +0200
Subject: [PATCH 14/14] DERDO: Adding explanations on eedecode tuning and
 derdo-flag to cli.rst and presets.rst.

---
 doc/reST/cli.rst     | 14 +++++++++++++-
 doc/reST/presets.rst | 21 ++++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index 1a1de9f50..c3d39ffc0 100755
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -392,7 +392,7 @@ Performance Options
 	be applied after :option:`--preset` but before all other parameters. Default none.
 	See :ref:`tunings <tunings>` for more detail.
 
-	**Values:** psnr, ssim, grain, zero-latency, fast-decode, animation.
+	**Values:** psnr, ssim, grain, zero-latency, fast-decode, animation, eedecode.
 
 .. option:: --slices <integer>
 
@@ -1197,6 +1197,18 @@ as the residual quad-tree (RQT).
 	gain in terms of objective quality metrics SSIM and PSNR. It only has effect
 	on presets which use RDO-based mode decisions (:option:`--rd` 3 and above).
 
+.. option:: --derdo
+
+	Enable decoding-energy-rate-distortion optimization (DERDO). In RD
+	calculations, the energy costs for decoding the bit stream are
+	additionally taken into account. To estimate the decoding energy costs,
+	specific energy coefficients are multiplied by the number of occurrences
+	of certain coding modes. In order to reduce the decoding energy, inter and
+	intra prediction, transforms, coefficient coding, and in-loop filters are
+	taken into account.
+	
+	Default: disabled
+
 Temporal / motion search options
 ================================
 
diff --git a/doc/reST/presets.rst b/doc/reST/presets.rst
index 7cabe8af2..0cd1c872e 100644
--- a/doc/reST/presets.rst
+++ b/doc/reST/presets.rst
@@ -130,6 +130,8 @@ after the preset.
 +--------------+-----------------------------------------------------+
 | animation    | improves encode quality for animated content        |
 +--------------+-----------------------------------------------------+
+| eedecode     | reduces the energy demand for a software decoder    |
++--------------+-----------------------------------------------------+
 
 
 
@@ -215,4 +217,21 @@ quality for animation content without impacting the encode speed. This is done b
     * :option:`--psy-rd` 0.4
     * :option:`--aq-strength` 0.4
     * :option:`--deblock` 1:1
-    * :option:`--bframes` Increase by 2
\ No newline at end of file
+    * :option:`--bframes` Increase by 2
+	
+	
+Energy-Efficient Decoding
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:option:`--tune` *eedecode* adjusts encoder settings and performs decoding-energy-rate-distortion 
+optimization (derdo) with the goal of reducing the software decoding energy at a constant quality. The
+decoding energy is reduced by:
+
+    * :option:`--no-deblock`
+    * :option:`--no-weightp`
+    * :option:`--no-weightb`
+    * :option:`--no-b-intra`
+    * :option:`--derdo`
+    * :option:`--aq-strength` 0.0
+    * :option:`--psy-rd` 0.0
+    * :option:`--psy-rdoq` 0.0
-- 
2.20.1.windows.1

-------------- next part --------------
From 70e0485d974bbf8fddcf4d603a316846cb4c68b6 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:16:29 +0200
Subject: [PATCH 01/14] DERDO: Introduce derdo-flag to x265.h

---
 source/x265.h | 45 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/source/x265.h b/source/x265.h
index f44040ba7..cfba3dd51 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -1308,7 +1308,12 @@ typedef struct x265_param
      * value must be between 0 and 50, 1.0 is typical. Default 0 */
     double    psyRdoq;
 
-    /* Perform quantisation parameter based RD refinement. RD cost is calculated
+	/* Enables decoding-energy-rate-distortion optimization. Tuning option for
+	 * energy-saving software decoding. See C. Herglotz, M. Bader, A. Kaup,
+	 * "Decoding Energy Optimal Video Encoding for x265", MMSP 2020. Default: false */
+	bool    derdo;
+	
+	/* Perform quantisation parameter based RD refinement. RD cost is calculated
      * on the best CU partitions, chosen after the CU analysis, for a range of QPs
      * to find the optimal rounding effect. Only effective at rd-levels 5 and 6.
      * Default disabled */
@@ -1380,7 +1385,7 @@ typedef struct x265_param
         /* Ratefactor constant: targets a certain constant "quality".
          * Acceptable values between 0 and 51. Default value: 28 */
         double    rfConstant;
-
+	
         /* Max QP difference between frames. Default: 4 */
         int       qpStep;
 
@@ -1792,6 +1797,40 @@ typedef struct x265_param
     /* File containing base64 encoded SEI messages in POC order */
     const char*    naluFile;
 
+	/* Structure of specific decoding energy coefficients for DERDO (extension of RDO to consider the software decoding energy)*/
+    struct{
+      //	  Intra-Prediction modes
+	  int e_intra32x32;
+	  int e_intra16x16;
+	  int e_intra8x8;
+	  int e_intra4x4;
+	  int e_trans32x32;
+	  int e_trans16x16;
+	  int e_trans8x8;
+	  int e_trans4x4;
+	  int e_IntraCUs;
+	  	  // Coefficient coding
+	  int e_coeff;
+	  int e_val;
+	  int e_PBslice;
+	  int e_skip64x64;			//Number of skipped CUs per depth
+	  int e_skip8x82x32;	// NOTE(review): name looks garbled (e_skip32x32 intended?) - confirm against all users before renaming
+	  int e_skip16x16;
+	  int e_skip8x8;
+		//	  number of inter coded CUs per depth
+	  int e_inter64x64;
+	  int e_inter32x32;
+	  int e_inter32x326x16;	// NOTE(review): name looks garbled (e_inter16x16 intended?) - confirm against all users before renaming
+	  int e_inter8x8;
+		//	  Number of fracpel-predicted pixels [hor/ver][depth]
+	  int e_fracpel;
+	  int e_bi;		// number of bipredicted 4x4-blocks
+	   // SAO
+	  int e_SAOY;	// presumably the luma counterpart of e_SAOC - TODO confirm
+	  int e_SAOC;	// passed as the energy term to the chroma SAO cost estimation in sao.cpp
+	  int e_Bs;
+} sSpecificDecEnergies; 
+    
     /* Generate bitstreams confirming to the specified dolby vision profile,
      * note that 0x7C01 makes RPU appear to be an unspecified NAL type in
      * HEVC stream. if BL is backward compatible, Dolby Vision single
@@ -2012,7 +2051,7 @@ static const char * const x265_preset_names[] = { "ultrafast", "superfast", "ver
  *      100 times faster than placebo!
  *
  *      Currently available tunings are: */
-static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "animation", 0 };
+static const char * const x265_tune_names[] = { "psnr", "ssim", "grain", "zerolatency", "fastdecode", "animation", "eedecode", 0 };
 
 /*      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
 int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
-- 
2.20.1.windows.1

-------------- next part --------------
From 3a9e987e2314e2f7faa35db9bf404526bd00277e Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:18:13 +0200
Subject: [PATCH 02/14] DERDO: Add flags and descriptions to cli-files.

---
 source/x265cli.cpp | 5 +++--
 source/x265cli.h   | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/x265cli.cpp b/source/x265cli.cpp
index c28dd7f8c..9a807e42c 100755
--- a/source/x265cli.cpp
+++ b/source/x265cli.cpp
@@ -110,7 +110,7 @@ namespace X265_NS {
         H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
         H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
         H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
-        H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
+        H0("                                 psnr, ssim, grain, zerolatency, fastdecode, eedecode\n");
         H0("\nQuad-Tree size and depth:\n");
         H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
         H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
@@ -129,7 +129,8 @@ namespace X265_NS {
         H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
         H0("   --rskip <mode>                Set mode for early exit from recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU edge density.\n"
             "                                 Mode 0: disabled. Default %d\n", param->recursionSkipMode);
-        H1("   --rskip-edge-threshold        Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
+		H0("   --derdo                       Enable decoding-energy-rate-distortion optimization (DERDO). Default %s\n", OPT(param->derdo));
+		H1("   --rskip-edge-threshold        Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
         H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
         H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
         H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
diff --git a/source/x265cli.h b/source/x265cli.h
index a24d25435..bba7710ec 100644
--- a/source/x265cli.h
+++ b/source/x265cli.h
@@ -212,7 +212,8 @@ static const struct option long_options[] =
     { "no-psy-rdoq",          no_argument, NULL, 0 },
     { "rd-refine",            no_argument, NULL, 0 },
     { "no-rd-refine",         no_argument, NULL, 0 },
-    { "scaling-list",   required_argument, NULL, 0 },
+	{ "derdo",                no_argument, NULL, 0 },
+	{ "scaling-list",   required_argument, NULL, 0 },
     { "lossless",             no_argument, NULL, 0 },
     { "no-lossless",          no_argument, NULL, 0 },
     { "no-signhide",          no_argument, NULL, 0 },
-- 
2.20.1.windows.1

-------------- next part --------------
From 4d5b12de0e67884975edbdf5a2d62e92778cca11 Mon Sep 17 00:00:00 2001
From: Christian Herglotz <christian.herglotz at fau.de>
Date: Thu, 22 Oct 2020 11:19:38 +0200
Subject: [PATCH 03/14] DERDO: Add new copy-if-lower-than-macro for five
 parameters (COPY5_IF_LT)

---
 source/common/common.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/source/common/common.h b/source/common/common.h
index 8c06cd79e..945e34213 100644
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -202,6 +202,16 @@ typedef int16_t  coeff_t;      // transform coefficient
         (c) = (d); \
         (e) = (f); \
     }
+#define COPY5_IF_LT(x, y, a, b, c, d, e, f, g, h) /* if y < x: x = y, and copy b, d, f, h into a, c, e, g */ \
+    if ((y) < (x)) \
+    { \
+        (x) = (y); \
+        (a) = (b); \
+        (c) = (d); \
+        (e) = (f); \
+        (g) = (h); \
+    }
+
 #define X265_MIN3(a, b, c) X265_MIN((a), X265_MIN((b), (c)))
 #define X265_MAX3(a, b, c) X265_MAX((a), X265_MAX((b), (c)))
 #define X265_MIN4(a, b, c, d) X265_MIN((a), X265_MIN3((b), (c), (d)))
-- 
2.20.1.windows.1