[x265] [PATCH] SSIM based RDO for mode selection

Wed Dec 28 16:11:02 CET 2016

# HG changeset patch
# User Ashok Kumar Mishra <ashok at multicorewareinc.com>
# Date 1482932522 -19800
#      Wed Dec 28 19:12:02 2016 +0530
# Node ID 146036b4049c7d5abae3bae83f77d573b67f167e
# Parent  af10eaeb36cd22c7ad20ed2dafeac6f8e388ed9d
SSIM based RDO for mode selection

diff -r af10eaeb36cd -r 146036b4049c source/common/cudata.cpp

--- a/source/common/cudata.cpp	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/common/cudata.cpp	Wed Dec 28 19:12:02 2016 +0530
@@ -224,6 +224,7 @@
         m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize);
         m_trCoeff[1] = m_trCoeff[2] = 0;
         m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0;
+        m_fAc_den[0] = m_fDc_den[0] = 0;
     }
     else
     {
@@ -267,6 +268,8 @@
         m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
         m_trCoeff[1] = m_trCoeff[0] + sizeL;
         m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+        for (int i = 0; i < 3; i++)
+            m_fAc_den[i] = m_fDc_den[i] = 0;
     }
 }
 
@@ -327,6 +330,11 @@
     m_bFirstRowInSlice = ctu.m_bFirstRowInSlice;
     m_bLastRowInSlice = ctu.m_bLastRowInSlice;
     m_bLastCuInSlice = ctu.m_bLastCuInSlice;
+    for (int i = 0; i < 3; i++)
+    {
+        m_fAc_den[i] = ctu.m_fAc_den[i];
+        m_fDc_den[i] = ctu.m_fDc_den[i];
+    }
 
     X265_CHECK(m_numPartitions == cuGeom.numPartitions, "initSubCU() size mismatch\n");
 
diff -r af10eaeb36cd -r 146036b4049c source/common/cudata.h
--- a/source/common/cudata.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/common/cudata.h	Wed Dec 28 19:12:02 2016 +0530
@@ -218,6 +218,8 @@
     const CUData* m_cuAbove;          // pointer to above neighbor CTU
     const CUData* m_cuLeft;           // pointer to left neighbor CTU
     double m_meanQP;
+    uint64_t      m_fAc_den[3];
+    uint64_t      m_fDc_den[3];
 
     CUData();
 
diff -r af10eaeb36cd -r 146036b4049c source/common/framedata.h
--- a/source/common/framedata.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/common/framedata.h	Wed Dec 28 19:12:02 2016 +0530
@@ -55,6 +55,7 @@
     double      avgLumaDistortion;
     double      avgChromaDistortion;
     double      avgPsyEnergy;
+    double      avgSsimEnergy;
     double      avgResEnergy;
     double      percentIntraNxN;
     double      percentSkipCu[NUM_CU_DEPTH];
@@ -68,6 +69,7 @@
     uint64_t    lumaDistortion;
     uint64_t    chromaDistortion;
     uint64_t    psyEnergy;
+    int64_t     ssimEnergy;
     uint64_t    resEnergy;
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
diff -r af10eaeb36cd -r 146036b4049c source/common/param.cpp
--- a/source/common/param.cpp	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/common/param.cpp	Wed Dec 28 19:12:02 2016 +0530
@@ -202,6 +202,7 @@
     param->bEnableTemporalSubLayers = 0;
     param->bEnableRdRefine = 0;
     param->bMultiPassOptRPS = 0;
+    param->bSsimRd = 0;
 
     /* Rate control options */
     param->rc.vbvMaxBitrate = 0;
@@ -926,6 +927,16 @@
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
         OPT("multi-pass-opt-analysis") p->analysisMultiPassRefine = atobool(value);
         OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
+        OPT("ssim-rd")
+        {
+            int bval = atobool(value);
+            if (bError || bval)
+            {
+                bError = false;
+                p->psyRd = 0.0;
+                p->bSsimRd = atobool(value);
+            }
+        }
         else
             return X265_PARAM_BAD_NAME;
     }
diff -r af10eaeb36cd -r 146036b4049c source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/common/quant.cpp	Wed Dec 28 19:12:02 2016 +0530
@@ -479,6 +479,82 @@
     }
 }
 
+uint64_t Quant::ssimDistortion(const CUData& cu, const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx)
+{
+    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
+    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
+
+    int trSize = 1 << log2TrSize;
+    uint64_t ssDc = 0, ssBlock = 0, ssAc = 0;
+
+    // Calculation of (X(0) - Y(0)) * (X(0) - Y(0)), DC
+    ssDc = 0;
+    for (int y = 0; y < trSize; y += 4)
+    {
+        for (int x = 0; x < trSize; x += 4)
+        {
+            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff
+            ssDc += temp * temp;
+        }
+    }
+
+    // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
+    ssBlock = 0;
+    for (int y = 0; y < trSize; y++)
+    {
+        for (int x = 0; x < trSize; x++)
+        {
+            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff
+            ssBlock += temp * temp;
+        }
+    }
+
+    ssAc = ssBlock - ssDc;
+
+    // 1. Calculation of fdc'
+    // Calculate numerator of dc normalization factor
+    uint64_t fDc_num = 0;
+
+    // 2. Calculate dc component
+    uint64_t dc_k = 0;
+    for (int block_yy = 0; block_yy < trSize; block_yy += 4)
+    {
+        for (int block_xx = 0; block_xx < trSize; block_xx += 4)
+        {
+            uint32_t temp = fenc[block_yy * fStride + block_xx];
+            dc_k += temp * temp;
+        }
+    }
+
+    fDc_num = (2 * dc_k)  + (trSize * trSize * ssim_c1); // 16 pixels -> for each 4x4 block
+    fDc_num /= ((trSize >> 2) * (trSize >> 2));
+
+    // 1. Calculation of fac'
+    // Calculate numerator of ac normalization factor
+    uint64_t fAc_num = 0;
+
+    // 2. Calculate ac component
+    uint64_t ac_k = 0;
+    for (int block_yy = 0; block_yy < trSize; block_yy += 1)
+    {
+        for (int block_xx = 0; block_xx < trSize; block_xx += 1)
+        {
+            uint32_t temp = fenc[block_yy * fStride + block_xx];
+            ac_k += temp * temp;
+        }
+    }
+    ac_k -= dc_k;
+
+    double s = 1 + 0.005 * cu.m_qp[absPartIdx];
+
+    fAc_num = ac_k + uint64_t(s * ac_k) + ssim_c2;
+    fAc_num /= ((trSize >> 2) * (trSize >> 2));
+
+    // Calculate dc and ac normalization factor
+    uint64_t ssim_distortion = ((ssDc * cu.m_fDc_den[ttype]) / fDc_num) + ((ssAc * cu.m_fAc_den[ttype]) / fAc_num);
+    return ssim_distortion;
+}
+
 void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 {
diff -r af10eaeb36cd -r 146036b4049c source/common/quant.h
--- a/source/common/quant.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/common/quant.h	Wed Dec 28 19:12:02 2016 +0530
@@ -111,6 +111,8 @@
 
     void invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
+    uint64_t ssimDistortion(const CUData& cu, const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride,
+                            uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
 
     /* Pattern decision for context derivation process of significant_coeff_flag */
     static uint32_t calcPatternSigCtx(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t cgBlkPos, uint32_t trSizeCG)
diff -r af10eaeb36cd -r 146036b4049c source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/encoder/analysis.cpp	Wed Dec 28 19:12:02 2016 +0530
@@ -76,6 +76,7 @@
     m_reuseRef = NULL;
     m_bHD = false;
 }
+
 bool Analysis::create(ThreadLocalData *tld)
 {
     m_tld = tld;
@@ -145,6 +146,9 @@
     ctu.m_meanQP = initialContext.m_meanQP;
     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
 
+    if (m_param->bSsimRd)
+        calculateNormFactor(ctu, qp);
+
     uint32_t numPartition = ctu.m_numPartitions;
     if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead)
     {
@@ -2910,3 +2914,65 @@
 
     return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
 }
+
+void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
+{
+    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
+    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
+
+    double s = 1 + 0.005 * qp;
+
+    // Calculate denominator of normalization factor
+    uint64_t fDc_den = 0, fAc_den = 0;
+
+    // 1. Calculate dc component
+    uint64_t z_o = 0;
+    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
+    {
+        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
+        {
+            uint32_t temp = src[block_yy * blockSize + block_xx];
+            z_o += temp * temp; // 2 * (Z(0)) pow(2)
+        }
+    }
+    fDc_den = (2 * z_o)  + (blockSize * blockSize * ssim_c1); // 2 * (Z(0)) pow(2) + N * C1
+    fDc_den /= ((blockSize >> 2) * (blockSize >> 2));
+
+    // 2. Calculate ac component
+    uint64_t z_k = 0;
+    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+    {
+        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+        {
+            uint32_t temp = src[block_yy * blockSize + block_xx];
+            z_k += temp * temp;
+        }
+    }
+
+    // Remove the DC part
+    z_k -= z_o;
+
+    fAc_den = z_k + int(s * z_k) + ssim_c2;
+    fAc_den /= ((blockSize >> 2) * (blockSize >> 2));
+
+    ctu.m_fAc_den[ttype] = fAc_den;
+    ctu.m_fDc_den[ttype] = fDc_den;
+}
+
+void Analysis::calculateNormFactor(CUData& ctu, int qp)
+{
+    const pixel* srcY = m_modeDepth[0].fencYuv.m_buf[0];
+    uint32_t blockSize = m_modeDepth[0].fencYuv.m_size;
+
+    normFactor(srcY, blockSize, ctu, qp, TEXT_LUMA);
+
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+    {
+        const pixel* srcU = m_modeDepth[0].fencYuv.m_buf[1];
+        const pixel* srcV = m_modeDepth[0].fencYuv.m_buf[2];
+        uint32_t blockSizeC = m_modeDepth[0].fencYuv.m_csize;
+
+        normFactor(srcU, blockSizeC, ctu, qp, TEXT_CHROMA_U);
+        normFactor(srcV, blockSizeC, ctu, qp, TEXT_CHROMA_V);
+    }
+}
diff -r af10eaeb36cd -r 146036b4049c source/encoder/analysis.h
--- a/source/encoder/analysis.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/encoder/analysis.h	Wed Dec 28 19:12:02 2016 +0530
@@ -176,6 +176,8 @@
 
     int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQP = -1);
 
+    void calculateNormFactor(CUData& ctu, int qp);
+    void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype);
     /* check whether current mode is the new best */
     inline void checkBestMode(Mode& mode, uint32_t depth)
     {
diff -r af10eaeb36cd -r 146036b4049c source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/encoder/frameencoder.cpp	Wed Dec 28 19:12:02 2016 +0530
@@ -827,6 +827,7 @@
         m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
         m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
         m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
+        m_frame->m_encData->m_frameStats.ssimEnergy       += m_rows[i].rowStats.ssimEnergy;
         m_frame->m_encData->m_frameStats.resEnergy        += m_rows[i].rowStats.resEnergy;
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
         {
@@ -841,6 +842,7 @@
     m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgSsimEnergy       = (double)(m_frame->m_encData->m_frameStats.ssimEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgResEnergy        = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
@@ -1419,6 +1421,7 @@
         curRow.rowStats.lumaDistortion   += best.lumaDistortion;
         curRow.rowStats.chromaDistortion += best.chromaDistortion;
         curRow.rowStats.psyEnergy        += best.psyEnergy;
+        curRow.rowStats.ssimEnergy       += best.ssimEnergy;
         curRow.rowStats.resEnergy        += best.resEnergy;
         curRow.rowStats.cntIntraNxN      += frameLog.cntIntraNxN;
         curRow.rowStats.totalCu          += frameLog.totalCu;
diff -r af10eaeb36cd -r 146036b4049c source/encoder/rdcost.h
--- a/source/encoder/rdcost.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/encoder/rdcost.h	Wed Dec 28 19:12:02 2016 +0530
@@ -41,9 +41,11 @@
     uint32_t  m_chromaDistWeight[2];
     uint32_t  m_psyRdBase;
     uint32_t  m_psyRd;
+    uint32_t  m_ssimRd;
     int       m_qp; /* QP used to configure lambda, may be higher than QP_MAX_SPEC but <= QP_MAX_MAX */
 
     void setPsyRdScale(double scale)                { m_psyRdBase = (uint32_t)floor(65536.0 * scale * 0.33); }
+    void setSsimRd(int ssimRd) { m_ssimRd = ssimRd; };
 
     void setQP(const Slice& slice, int qp)
     {
@@ -129,6 +131,20 @@
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
     }
 
+    inline uint64_t calcSsimRdCost(uint64_t distortion, uint32_t bits, uint32_t ssimCost) const
+    {
+#if X265_DEPTH < 10
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (ssimCost <= UINT64_MAX / m_lambda),
+                   "calcPsyRdCost wrap detected dist: %u, bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
+                   distortion, bits, m_lambda, m_lambda2);
+#else
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (ssimCost <= UINT64_MAX / m_lambda),
+                   "calcPsyRdCost wrap detected dist: " X265_LL ", bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
+                   distortion, bits, m_lambda, m_lambda2);
+#endif
+        return distortion + ((m_lambda * ssimCost) >> 14) + ((bits * m_lambda2) >> 8);
+    }
+
     inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
     {
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
diff -r af10eaeb36cd -r 146036b4049c source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/encoder/search.cpp	Wed Dec 28 19:12:02 2016 +0530
@@ -78,6 +78,7 @@
     m_numLayers = g_log2Size[param.maxCUSize] - 2;
 
     m_rdCost.setPsyRdScale(param.psyRd);
+    m_rdCost.setSsimRd(param.bSsimRd);
     m_me.init(param.internalCsp);
 
     bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
@@ -417,6 +418,11 @@
             fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
             fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
         }
+        else if(m_rdCost.m_ssimRd)
+        {
+            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
+            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
+        }
         else
             fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
     }
@@ -460,6 +466,8 @@
 
             if (m_rdCost.m_psyRd)
                 splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
+            else if(m_rdCost.m_ssimRd)
+                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
             else
                 splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
         }
@@ -625,6 +633,11 @@
             tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
             tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
         }
+        else if(m_rdCost.m_ssimRd)
+        {
+            tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
+            tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
+        }
         else
             tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
 
@@ -899,6 +912,8 @@
 
             if (m_rdCost.m_psyRd)
                 outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
+            else if(m_rdCost.m_ssimRd)
+                outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
 
             primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
         }
@@ -1016,6 +1031,11 @@
                     tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
                     tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
                 }
+                else if(m_rdCost.m_ssimRd)
+                {
+                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
+                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
+                }
                 else
                     tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
 
@@ -1229,11 +1249,12 @@
     m_entropyCoder.store(intraMode.contexts);
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
+    const Yuv* fencYuv = intraMode.fencYuv;
     if (m_rdCost.m_psyRd)
-    {
-        const Yuv* fencYuv = intraMode.fencYuv;
         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
-    }
+    else if(m_rdCost.m_ssimRd)
+        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
+
     intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
 
     updateModeCost(intraMode);
@@ -1448,12 +1469,13 @@
 
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
+    const Yuv* fencYuv = intraMode.fencYuv;
     if (m_rdCost.m_psyRd)
-    {
-        const Yuv* fencYuv = intraMode.fencYuv;
         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
-    }
-    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
+    else if(m_rdCost.m_ssimRd)
+        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
+
+    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
     m_entropyCoder.store(intraMode.contexts);
     updateModeCost(intraMode);
     checkDQP(intraMode, cuGeom);
@@ -1778,7 +1800,7 @@
             codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
             codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
             uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
-            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy)
+            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
                                              : m_rdCost.calcRdCost(outCost.distortion, bits);
 
             if (cost < bestCost)
@@ -2637,6 +2659,9 @@
     interMode.totalBits = interMode.mvBits + skipFlagBits;
     if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    else if(m_rdCost.m_ssimRd)
+        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
+
     interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
     updateModeCost(interMode);
     m_entropyCoder.store(interMode.contexts);
@@ -2707,13 +2732,17 @@
         m_entropyCoder.codeQtRootCbfZero();
         uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
 
-        uint64_t cbf0Cost;
-        uint32_t cbf0Energy;
+        uint32_t cbf0Energy; uint64_t cbf0Cost;
         if (m_rdCost.m_psyRd)
         {
             cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
             cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
         }
+        else if(m_rdCost.m_ssimRd)
+        {
+            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
+            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
+        }
         else
             cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
 
@@ -2782,6 +2811,9 @@
     }
     if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    else if(m_rdCost.m_ssimRd)
+        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
+
     interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
     interMode.totalBits = bits;
     interMode.lumaDistortion = bestLumaDist;
@@ -2929,12 +2961,14 @@
     }
 }
 
-uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId)
+uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
 {
     uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
 
     if (m_rdCost.m_psyRd)
-        return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
+        return m_rdCost.calcPsyRdCost(dist, nullBits, energy);
+    else if(m_rdCost.m_ssimRd)
+        return m_rdCost.calcSsimRdCost(dist, nullBits, energy);
     else
         return m_rdCost.calcRdCost(dist, nullBits);
 }
@@ -2983,6 +3017,8 @@
 
     if (m_rdCost.m_psyRd)
         splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
+    else if(m_rdCost.m_ssimRd)
+        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
     else
         splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
         
@@ -3055,7 +3091,7 @@
     uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
     uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
 
@@ -3104,9 +3140,11 @@
 
         //Assuming zero residual 
         sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
-        uint32_t zeroPsyEnergyY = 0;
+        uint32_t zeroEnergyY = 0;
         if (m_rdCost.m_psyRd)
-            zeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
+            zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
+        else if(m_rdCost.m_ssimRd)
+            zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
 
         int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
         uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
@@ -3123,11 +3161,16 @@
 
             const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
             uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
-            uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
+            uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
             if (m_rdCost.m_psyRd)
             {
-                nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
-                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+                nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
+                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
+            }
+            else if(m_rdCost.m_ssimRd)
+            {
+                nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
+                singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
             }
             else
                 singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
@@ -3135,14 +3178,14 @@
             if (cu.m_tqBypass[0])
             {
                 singleDist[TEXT_LUMA][0] = nonZeroDistY;
-                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
             }
             else
             {
                 // zero-cost calculation for luma. This is an approximation
                 // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
                 // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
-                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroPsyEnergyY, tuDepth, TEXT_LUMA);
+                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
 
                 if (nullCostY < singleCostY)
                 {
@@ -3156,25 +3199,25 @@
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = nullCostY;
                     singleDist[TEXT_LUMA][0] = zeroDistY;
-                    singlePsyEnergy[TEXT_LUMA][0] = zeroPsyEnergyY;
+                    singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
                 }
                 else
                 {
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = singleCostY;
                     singleDist[TEXT_LUMA][0] = nonZeroDistY;
-                    singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+                    singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
                 }
             }
         }
         else
         {
             if (checkTransformSkipY)
-                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroPsyEnergyY, tuDepth, TEXT_LUMA);
+                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
             primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
             singleDist[TEXT_LUMA][0] = zeroDistY;
             singleBits[TEXT_LUMA][0] = 0;
-            singlePsyEnergy[TEXT_LUMA][0] = zeroPsyEnergyY;
+            singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
         }
 
         cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
@@ -3186,7 +3229,7 @@
             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
             {
                 sse_t zeroDistC = 0;
-                uint32_t zeroPsyEnergyC = 0;
+                uint32_t zeroEnergyC = 0;
                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
                 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
 
@@ -3214,9 +3257,11 @@
                     int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
                     zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
 
+                    // Assuming zero residual 
                     if (m_rdCost.m_psyRd)
-                    //Assuming zero residual 
-                        zeroPsyEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
+                        zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
+                    else if(m_rdCost.m_ssimRd)
+                        zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
 
                     if (cbfFlag[chromaId][tuIterator.section])
                     {
@@ -3230,11 +3275,16 @@
                         primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
                         sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
                         uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
-                        uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
+                        uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
                         if (m_rdCost.m_psyRd)
                         {
-                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
-                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
+                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
+                        }
+                        else if(m_rdCost.m_ssimRd)
+                        {
+                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
+                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
                         }
                         else
                             singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
@@ -3242,12 +3292,12 @@
                         if (cu.m_tqBypass[0])
                         {
                             singleDist[chromaId][tuIterator.section] = nonZeroDistC;
-                            singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+                            singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
                         }
                         else
                         {
                             //zero-cost calculation for chroma. This is an approximation
-                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroPsyEnergyC, tuDepth, (TextType)chromaId);
+                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
 
                             if (nullCostC < singleCostC)
                             {
@@ -3261,25 +3311,25 @@
                                 if (checkTransformSkipC)
                                     minCost[chromaId][tuIterator.section] = nullCostC;
                                 singleDist[chromaId][tuIterator.section] = zeroDistC;
-                                singlePsyEnergy[chromaId][tuIterator.section] = zeroPsyEnergyC;
+                                singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
                             }
                             else
                             {
                                 if (checkTransformSkipC)
                                     minCost[chromaId][tuIterator.section] = singleCostC;
                                 singleDist[chromaId][tuIterator.section] = nonZeroDistC;
-                                singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+                                singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
                             }
                         }
                     }
                     else
                     {
                         if (checkTransformSkipC)
-                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroPsyEnergyC, tuDepthC, (TextType)chromaId);
+                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
                         primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
                         singleBits[chromaId][tuIterator.section] = 0;
                         singleDist[chromaId][tuIterator.section] = zeroDistC;
-                        singlePsyEnergy[chromaId][tuIterator.section] = zeroPsyEnergyC;
+                        singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
                     }
 
                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
@@ -3304,7 +3354,7 @@
         if (checkTransformSkipY)
         {
             sse_t nonZeroDistY = 0;
-            uint32_t nonZeroPsyEnergyY = 0;
+            uint32_t nonZeroEnergyY = 0;
             uint64_t singleCostY = MAX_INT64;
 
             m_entropyCoder.load(m_rqt[depth].rqtRoot);
@@ -3332,8 +3382,13 @@
 
                 if (m_rdCost.m_psyRd)
                 {
-                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
-                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
+                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
+                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
+                }
+                else if(m_rdCost.m_ssimRd)
+                {
+                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
+                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
                 }
                 else
                     singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
@@ -3344,7 +3399,7 @@
             else
             {
                 singleDist[TEXT_LUMA][0] = nonZeroDistY;
-                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
                 if (m_param->limitTU)
@@ -3360,7 +3415,7 @@
         if (codeChroma && checkTransformSkipC)
         {
             sse_t nonZeroDistC = 0;
-            uint32_t nonZeroPsyEnergyC = 0;
+            uint32_t nonZeroEnergyC = 0;
             uint64_t singleCostC = MAX_INT64;
             uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
@@ -3403,9 +3458,13 @@
                         nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
                         if (m_rdCost.m_psyRd)
                         {
-
-                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
-                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
+                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
+                        }
+                        else if(m_rdCost.m_ssimRd)
+                        {
+                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
+                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
                         }
                         else
                             singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
@@ -3416,7 +3475,7 @@
                     else
                     {
                         singleDist[chromaId][tuIterator.section] = nonZeroDistC;
-                        singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+                        singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
                         cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
                         bestTransformMode[chromaId][tuIterator.section] = 1;
                         uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
@@ -3475,7 +3534,7 @@
         fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
 
         fullCost.distortion += singleDist[TEXT_LUMA][0];
-        fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
+        fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
         {
             fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
@@ -3484,6 +3543,8 @@
 
         if (m_rdCost.m_psyRd)
             fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
+        else if(m_rdCost.m_ssimRd)
+            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
         else
             fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
 
diff -r af10eaeb36cd -r 146036b4049c source/encoder/search.h
--- a/source/encoder/search.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/encoder/search.h	Wed Dec 28 19:12:02 2016 +0530
@@ -118,6 +118,7 @@
     uint64_t    sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
     uint32_t    sa8dBits;   // signal bits used in sa8dCost calculation
     uint32_t    psyEnergy;  // sum of partition psycho-visual energy difference
+    uint32_t    ssimEnergy;
     sse_t   resEnergy;  // sum of partition residual energy after motion prediction
     sse_t   lumaDistortion;
     sse_t   chromaDistortion;
@@ -132,6 +133,7 @@
         sa8dCost = 0;
         sa8dBits = 0;
         psyEnergy = 0;
+        ssimEnergy = 0;
         resEnergy = 0;
         lumaDistortion = 0;
         chromaDistortion = 0;
@@ -147,6 +149,7 @@
         sa8dCost += subMode.sa8dCost;
         sa8dBits += subMode.sa8dBits;
         psyEnergy += subMode.psyEnergy;
+        ssimEnergy += subMode.ssimEnergy;
         resEnergy += subMode.resEnergy;
         lumaDistortion += subMode.lumaDistortion;
         chromaDistortion += subMode.chromaDistortion;
@@ -390,7 +393,7 @@
         Entropy rqtStore[NUM_SUBPART];
     } m_cacheTU;
 
-    uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId);
     bool     splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], int32_t splitMore = -1);
 
@@ -430,7 +433,9 @@
     // get most probable luma modes for CU part, and bit cost of all non mpm modes
     uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
 
-    void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
+    void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy)
+                                                : (m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(m.distortion, m.totalBits, m.ssimEnergy) 
+                                                : m_rdCost.calcRdCost(m.distortion, m.totalBits)); }
 };
 }
 
diff -r af10eaeb36cd -r 146036b4049c source/x265.h
--- a/source/x265.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/x265.h	Wed Dec 28 19:12:02 2016 +0530
@@ -1058,6 +1058,11 @@
      * the encoder must perform. Default X265_ANALYSIS_OFF */
     int       analysisMode;
 
+    /* SSIM based RDO, based on residual divisive normalization scheme. Used for mode
+    * selection during analysis of CTUs, can achieve significant gain in terms of 
+    * objective quality metrics SSIM and PSNR */
+    int       bSsimRd;
+
     /* Filename for analysisMode save/load. Default name is "x265_analysis.dat" */
     const char* analysisFileName;
 
diff -r af10eaeb36cd -r 146036b4049c source/x265cli.h
--- a/source/x265cli.h	Wed Dec 28 10:17:08 2016 +0530
+++ b/source/x265cli.h	Wed Dec 28 19:12:02 2016 +0530
@@ -256,6 +256,8 @@
     { "analyze-src-pics", no_argument, NULL, 0 },
     { "no-analyze-src-pics", no_argument, NULL, 0 },
     { "slices",         required_argument, NULL, 0 },
+    { "ssim-rd",      no_argument, NULL, 0 },
+    { "no-ssim-rd",   no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -340,6 +342,7 @@
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
+    H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %.1f\n", OPT(param->bSsimRd));
     H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
     H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));