[x265] [PATCH RFC] psyrd: use psyrdcost for PU/TU decision for inter and intra

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Tue Jun 10 14:17:27 CEST 2014


# HG changeset patch
# User Sumalatha Polureddy<sumalatha at multicorewareinc.com>
# Date 1402402635 -19800
# Node ID dbe573edb3459f57ab058318e4227f6b19045c2d
# Parent  1f475750880710e49b5519fcad9366d10ea57c1a
psyrd: use psyrdcost for PU/TU decision for inter and intra

diff -r 1f4757508807 -r dbe573edb345 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jun 10 16:15:58 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jun 10 17:47:15 2014 +0530
@@ -625,6 +625,7 @@
 
     uint64_t singleCost   = MAX_INT64;
     uint32_t singleDistY  = 0;
+    uint32_t singlePsyEnergyY = 0;
     uint32_t singleCbfY   = 0;
     int      bestModeId   = 0;
     bool     bestTQbypass = 0;
@@ -663,14 +664,16 @@
             m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_QT_TRAFO_ROOT]);
 
             uint32_t  singleDistYTmp = 0;
-            uint32_t  singleCbfYTmp  = 0;
-            uint64_t  singleCostTmp  = 0;
+            uint32_t  singlePsyEnergyYTmp = 0;
+            uint32_t  singleCbfYTmp = 0;
+            uint64_t  singleCostTmp = 0;
             bool      singleTQbypass = 0;
             const int firstCheckId   = 0;
 
             for (int modeId = firstCheckId; modeId < 2; modeId++)
             {
                 singleDistYTmp = 0;
+                singlePsyEnergyYTmp = 0;
                 cu->setTransformSkipSubParts(checkTransformSkip ? modeId : 0, TEXT_LUMA, absPartIdx, fullDepth);
 
                 bool bIsLossLess = modeId != firstCheckId;
@@ -681,7 +684,14 @@
 
                 //----- code luma block with given intra prediction mode and store Cbf-----
                 xIntraCodingLumaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistYTmp);
-                singleCbfYTmp  = cu->getCbf(absPartIdx, TEXT_LUMA, trDepth);
+                if (m_rdCost->psyRdEnabled())
+                {
+                    int size = g_convertToBit[cu->getCUSize(0) >> trDepth];
+                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                    singlePsyEnergyYTmp = m_rdCost->psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
+                        cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getStride());
+                }
+                singleCbfYTmp = cu->getCbf(absPartIdx, TEXT_LUMA, trDepth);
                 singleTQbypass = cu->getCUTransquantBypass(absPartIdx);
 
                 if ((modeId == 1) && (singleCbfYTmp == 0) && checkTransformSkip)
@@ -692,13 +702,17 @@
                 else
                 {
                     uint32_t singleBits = xGetIntraBitsQT(cu, trDepth, absPartIdx, 0, true, false);
-                    singleCostTmp = m_rdCost->calcRdCost(singleDistYTmp, singleBits);
+                    if (m_rdCost->psyRdEnabled())
+                        singleCostTmp = m_rdCost->calcPsyRdCost(singleDistYTmp, singleBits, singlePsyEnergyYTmp);
+                    else
+                        singleCostTmp = m_rdCost->calcRdCost(singleDistYTmp, singleBits);
                 }
 
                 if (singleCostTmp < singleCost)
                 {
                     singleCost   = singleCostTmp;
                     singleDistY  = singleDistYTmp;
+                    singlePsyEnergyY = singlePsyEnergyYTmp;
                     singleCbfY   = singleCbfYTmp;
                     bestTQbypass = singleTQbypass;
                     bestModeId   = modeId;
@@ -734,6 +748,13 @@
             //----- code luma block with given intra prediction mode and store Cbf-----
             cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
             xIntraCodingLumaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistY);
+            if (m_rdCost->psyRdEnabled())
+            {
+                int size = g_convertToBit[cu->getCUSize(0) >> trDepth];
+                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                singlePsyEnergyY = m_rdCost->psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
+                    cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getStride());
+            }
 
             if (bCheckSplit)
                 singleCbfY = cu->getCbf(absPartIdx, TEXT_LUMA, trDepth);
@@ -742,7 +763,10 @@
             if (m_cfg->m_param->rdPenalty && (trSizeLog2 == 5) && !isIntraSlice)
                 singleBits *= 4;
 
-            singleCost = m_rdCost->calcRdCost(singleDistY, singleBits);
+            if (m_rdCost->psyRdEnabled())
+                singleCost = m_rdCost->calcPsyRdCost(singleDistY, singleBits, singlePsyEnergyY);
+            else
+                singleCost = m_rdCost->calcRdCost(singleDistY, singleBits);
         }
     }
 
@@ -762,6 +786,7 @@
         //----- code splitted block -----
         uint64_t splitCost     = 0;
         uint32_t splitDistY    = 0;
+        uint32_t splitPsyEnergy = 0;
         uint32_t qPartsDiv     = cu->getPic()->getNumPartInCU() >> ((fullDepth + 1) << 1);
         uint32_t absPartIdxSub = absPartIdx;
 
@@ -769,8 +794,10 @@
 
         for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
         {
+            cu->m_psyEnergy = 0;
             xRecurIntraCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, splitDistY, bCheckFirst, splitCost);
 
+            splitPsyEnergy += cu->m_psyEnergy;
             splitCbfY |= cu->getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
         }
 
@@ -784,16 +811,22 @@
 
         //----- determine rate and r-d cost -----
         uint32_t splitBits = xGetIntraBitsQT(cu, trDepth, absPartIdx, 0, true, false);
-        splitCost = m_rdCost->calcRdCost(splitDistY, splitBits);
-
+
+        if (m_rdCost->psyRdEnabled())
+            splitCost = m_rdCost->calcPsyRdCost(splitDistY, splitBits, splitPsyEnergy);
+        else
+            splitCost = m_rdCost->calcRdCost(splitDistY, splitBits);
         //===== compare and set best =====
         if (splitCost < singleCost)
         {
             //--- update cost ---
             outDistY += splitDistY;
             rdCost   += splitCost;
+            cu->m_psyEnergy = splitPsyEnergy;
             return;
         }
+        else
+            cu->m_psyEnergy = singlePsyEnergyY;
 
         //----- set entropy coding status -----
         m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_QT_TRAFO_TEST]);
@@ -817,6 +850,7 @@
 
     outDistY += singleDistY;
     rdCost   += singleCost;
+    cu->m_psyEnergy = singlePsyEnergyY;
 }
 
 void TEncSearch::residualTransformQuantIntra(TComDataCU* cu,
@@ -1196,7 +1230,7 @@
                 checkTransformSkip &= (nbLumaSkip > 0);
             }
         }
-
+        uint32_t singlePsyEnergy = 0;
         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
             TComTURecurse tuIterator;
@@ -1222,7 +1256,7 @@
                 chromaPredMode = (chFmt == CHROMA_422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
                 //===== get prediction signal =====
                 predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, tuSize, chFmt);
-
+                uint32_t singlePsyEnergyTmp = 0;
                 if (checkTransformSkip)
                 {
                     // use RDO to decide whether Cr/Cb takes TS
@@ -1255,16 +1289,25 @@
                         else
                         {
                             uint32_t bitsTmp = xGetIntraBitsQTChroma(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
-                            singleCostTmp = m_rdCost->calcRdCost(singleDistCTmp, bitsTmp);
+                            if (m_rdCost->psyRdEnabled())
+                            {
+                                int chFmt = cu->getChromaFormat();
+                                int size = g_convertToBit[(cu->getCUSize(0)) >> (trDepth + m_hChromaShift - ((trSizeLog2 == 2) && !(chFmt == CHROMA_444)))];
+                                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                                singlePsyEnergyTmp = m_rdCost->psyCost(size, fencYuv->getChromaAddr(chromaId, absPartIdxC), fencYuv->getCStride(),
+                                    cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getCStride());
+                                singleCostTmp = m_rdCost->calcPsyRdCost(singleDistCTmp, bitsTmp, singlePsyEnergyTmp);
+                            }
+                            else
+                                singleCostTmp = m_rdCost->calcRdCost(singleDistCTmp, bitsTmp);
                         }
-
                         if (singleCostTmp < singleCost)
                         {
                             singleCost  = singleCostTmp;
                             singleDistC = singleDistCTmp;
                             bestModeId  = chromaModeId;
                             singleCbfC  = singleCbfCTmp;
-
+                            singlePsyEnergy = singlePsyEnergyTmp;
                             if (bestModeId == firstCheckId)
                             {
                                 xStoreIntraResultChromaQT(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs);
@@ -1298,7 +1341,16 @@
                 {
                     cu->setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
                     xIntraCodingChromaBlk(cu, trDepth, absPartIdxC, tuIterator.m_absPartIdxStep, fencYuv, predYuv, resiYuv, outDist, chromaId);
+                    if (m_rdCost->psyRdEnabled())
+                    {
+                        int chFmt = cu->getChromaFormat();
+                        int size = g_convertToBit[(cu->getCUSize(0)) >> (trDepth + m_hChromaShift - ((trSizeLog2 == 2) && !(chFmt == CHROMA_444)))];
+                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        singlePsyEnergyTmp = m_rdCost->psyCost(size, fencYuv->getChromaAddr(chromaId, absPartIdxC), fencYuv->getCStride(),
+                            cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getCStride());
+                    }
                 }
+                singlePsyEnergy += singlePsyEnergyTmp;
             }
             while (isNextSection(&tuIterator));
 
@@ -1307,20 +1359,24 @@
                 offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
             }
         }
+        cu->m_psyEnergy = singlePsyEnergy;
     }
     else
     {
         uint32_t splitCbfU     = 0;
         uint32_t splitCbfV     = 0;
+        uint32_t splitPsyEnergy = 0;
         uint32_t qPartsDiv     = cu->getPic()->getNumPartInCU() >> ((fullDepth + 1) << 1);
         uint32_t absPartIdxSub = absPartIdx;
         for (uint32_t part = 0; part < 4; part++, absPartIdxSub += qPartsDiv)
         {
             xRecurIntraChromaCodingQT(cu, trDepth + 1, absPartIdxSub, fencYuv, predYuv, resiYuv, outDist);
+            splitPsyEnergy += cu->m_psyEnergy;
             splitCbfU |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
             splitCbfV |= cu->getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
         }
 
+        cu->m_psyEnergy = splitPsyEnergy;
         for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
         {
             cu->getCbf(TEXT_CHROMA_U)[absPartIdx + offs] |= (splitCbfU << trDepth);
@@ -1945,8 +2001,11 @@
             }
 
             uint32_t bits = xGetIntraBitsQT(cu, initTrDepth, absPartIdxC, tuIterator.m_absPartIdxStep, false, true);
-            uint64_t cost = m_rdCost->calcRdCost(dist, bits);
-
+            uint64_t cost = 0; 
+            if (m_rdCost->psyRdEnabled())
+                cost = m_rdCost->calcPsyRdCost(dist, bits, cu->m_psyEnergy);
+            else
+                cost = m_rdCost->calcRdCost(dist, bits);
             //----- compare -----
             if (cost < bestCost)
             {
@@ -2676,9 +2735,7 @@
             cu->m_totalPsyCost = m_rdCost->calcPsyRdCost(cu->m_totalDistortion, cu->m_totalBits, cu->m_psyEnergy);
         }
         else
-        {
             cu->m_totalRDCost = m_rdCost->calcRdCost(cu->m_totalDistortion, cu->m_totalBits);
-        }
 
         m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
 
@@ -2715,7 +2772,7 @@
         distortion = 0;
 
         m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
-        xEstimateResidualQT(cu, 0, outResiYuv, depth, cost, bits, distortion, &zeroDistortion, curUseRDOQ);
+        xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, distortion, &zeroDistortion, curUseRDOQ);
 
         m_entropyCoder->resetBits();
         m_entropyCoder->encodeQtRootCbfZero(cu);
@@ -2789,9 +2846,7 @@
             cu->m_totalPsyCost = m_rdCost->calcPsyRdCost(bestDist, bestBits, cu->m_psyEnergy);
         }
         else
-        {
             cu->m_totalRDCost = m_rdCost->calcRdCost(bestDist, bestBits);
-        }
         cu->m_totalBits       = bestBits;
         cu->m_totalDistortion = bestDist;
 
@@ -3016,6 +3071,8 @@
 
 void TEncSearch::xEstimateResidualQT(TComDataCU*    cu,
                                      uint32_t       absPartIdx,
+                                     TComYuv*       fencYuv,
+                                     TComYuv*       predYuv,
                                      ShortYuv*      resiYuv,
                                      const uint32_t depth,
                                      uint64_t &     rdCost,
@@ -3055,8 +3112,10 @@
     uint64_t singleCost = MAX_INT64;
     uint32_t singleBits = 0;
     uint32_t singleDist = 0;
+    uint32_t singlePsyEnergy = 0;
     uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t absSum[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     int      lastPos[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
@@ -3173,6 +3232,14 @@
 
         int partSize = partitionFromSize(trSize);
         uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);
+        uint32_t psyEnergyY = 0;
+        if (m_rdCost->psyRdEnabled())
+        {
+            int size = g_convertToBit[trSize];
+            psyEnergyY = m_rdCost->psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
+               (pixel*)RDCost::zeroPel, cu->getPic()->getPicYuvRec()->getStride()); // TODO: verify that the psy energy of fenc against a zero block is the right counterpart of the zero-residual distortion (distY) computed above
+        }
+
         int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
         X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
         const uint32_t strideResiY = MAX_CU_SIZE;
@@ -3191,17 +3258,41 @@
             m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
 
             const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, curResiY, strideResiY);
+            if (m_rdCost->psyRdEnabled())
+            {
+                pixel*   pred = predYuv->getLumaAddr(absPartIdx);
+                uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                pixel*   reconIPred = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
+                uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
+                uint32_t stride = fencYuv->getStride();
+                //===== reconstruction =====
+                primitives.luma_add_ps[partSize](reconIPred, reconIPredStride, pred, curResiY, stride, strideResiY);
+            }
             if (cu->isLosslessCoded(0))
             {
                 distY = nonZeroDistY;
             }
             else
             {
-                const uint64_t singleCostY = m_rdCost->calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
+                uint64_t singleCostY = 0;
+                if (m_rdCost->psyRdEnabled())
+                {
+                    int size = g_convertToBit[trSize];
+                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                    psyEnergyY = m_rdCost->psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
+                        cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getStride());
+                    singleCostY = m_rdCost->calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], psyEnergyY);
+                }
+                else
+                    singleCostY = m_rdCost->calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
                 m_entropyCoder->resetBits();
                 m_entropyCoder->encodeQtCbfZero(cu, TEXT_LUMA, trMode);
                 const uint32_t nullBitsY = m_entropyCoder->getNumberOfWrittenBits();
-                const uint64_t nullCostY = m_rdCost->calcRdCost(distY, nullBitsY);
+                uint64_t nullCostY = 0;
+                if (m_rdCost->psyRdEnabled())
+                    nullCostY = m_rdCost->calcPsyRdCost(distY, nullBitsY, psyEnergyY);
+                else
+                    nullCostY = m_rdCost->calcRdCost(distY, nullBitsY);
                 if (nullCostY < singleCostY)
                 {
                     absSum[TEXT_LUMA][0] = 0;
@@ -3228,11 +3319,14 @@
             m_entropyCoder->resetBits();
             m_entropyCoder->encodeQtCbfZero(cu, TEXT_LUMA, trMode);
             const uint32_t nullBitsY = m_entropyCoder->getNumberOfWrittenBits();
-            minCost[TEXT_LUMA][0] = m_rdCost->calcRdCost(distY, nullBitsY);
+            if (m_rdCost->psyRdEnabled())
+                minCost[TEXT_LUMA][0] = m_rdCost->calcPsyRdCost(distY, nullBitsY, psyEnergyY);
+            else
+                minCost[TEXT_LUMA][0] = m_rdCost->calcRdCost(distY, nullBitsY);
         }
 
         singleDistComp[TEXT_LUMA][0] = distY;
-
+        singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
         if (!absSum[TEXT_LUMA][0])
         {
             primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
@@ -3241,6 +3335,8 @@
 
         uint32_t distU = 0;
         uint32_t distV = 0;
+        uint32_t psyEnergyU = 0;
+        uint32_t psyEnergyV = 0;
         if (bCodeChroma)
         {
             TComTURecurse tuIterator;
@@ -3274,18 +3370,41 @@
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
                                                                  curResiU, strideResiC);
                     const uint32_t nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
-
+                    if (m_rdCost->psyRdEnabled())
+                    {
+                        pixel*   pred = predYuv->getCbAddr(absPartIdxC);
+                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        pixel*   reconIPred = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
+                        uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
+                        uint32_t stride = fencYuv->getCStride();
+                        //===== reconstruction =====
+                        primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, curResiU, stride, strideResiC);
+                    }
                     if (cu->isLosslessCoded(0))
                     {
                         distU = nonZeroDistU;
                     }
                     else
                     {
-                        const uint64_t singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
+                        uint64_t singleCostU = 0;
+                        if (m_rdCost->psyRdEnabled())
+                        {
+                            int size = g_convertToBit[trSizeC];
+                            uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                            psyEnergyU = m_rdCost->psyCost(size, fencYuv->getCbAddr(absPartIdxC), fencYuv->getCStride(),
+                                cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getCStride());
+                            singleCostU = m_rdCost->calcPsyRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][0], psyEnergyU);
+                        }
+                        else
+                            singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][0]);
                         m_entropyCoder->resetBits();
                         m_entropyCoder->encodeQtCbfZero(cu, TEXT_CHROMA_U, trMode);
                         const uint32_t nullBitsU = m_entropyCoder->getNumberOfWrittenBits();
-                        const uint64_t nullCostU = m_rdCost->calcRdCost(distU, nullBitsU);
+                        uint64_t nullCostU = 0;
+                        if (m_rdCost->psyRdEnabled())
+                            nullCostU = m_rdCost->calcPsyRdCost(distU, nullBitsU, psyEnergyU);
+                        else
+                            nullCostU = m_rdCost->calcRdCost(distU, nullBitsU);
                         if (nullCostU < singleCostU)
                         {
                             absSum[TEXT_CHROMA_U][tuIterator.m_section] = 0;
@@ -3312,10 +3431,14 @@
                     m_entropyCoder->resetBits();
                     m_entropyCoder->encodeQtCbfZero(cu, TEXT_CHROMA_U, trModeC);
                     const uint32_t nullBitsU = m_entropyCoder->getNumberOfWrittenBits();
-                    minCost[TEXT_CHROMA_U][tuIterator.m_section] = m_rdCost->calcRdCost(distU, nullBitsU);
+                    if (m_rdCost->psyRdEnabled())
+                        minCost[TEXT_CHROMA_U][tuIterator.m_section] = m_rdCost->calcPsyRdCost(distU, nullBitsU, psyEnergyU);
+                    else
+                        minCost[TEXT_CHROMA_U][tuIterator.m_section] = m_rdCost->calcRdCost(distU, nullBitsU);
                 }
 
                 singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = distU;
+                singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = psyEnergyU;
 
                 if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])
                 {
@@ -3340,17 +3463,41 @@
                                                                  curResiV, strideResiC);
                     const uint32_t nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
 
+                    if (m_rdCost->psyRdEnabled())
+                    {
+                        pixel*   pred = predYuv->getCrAddr(absPartIdxC);
+                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        pixel*   reconIPred = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
+                        uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
+                        uint32_t stride = fencYuv->getCStride();
+                        //===== reconstruction =====
+                        primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, curResiV, stride, strideResiC);
+                    }
                     if (cu->isLosslessCoded(0))
                     {
                         distV = nonZeroDistV;
                     }
                     else
                     {
-                        const uint64_t singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
+                        uint64_t singleCostV = 0;
+                        if (m_rdCost->psyRdEnabled())
+                        {
+                            int size = g_convertToBit[trSizeC];
+                            uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                            psyEnergyV = m_rdCost->psyCost(size, fencYuv->getCrAddr(absPartIdxC), fencYuv->getCStride(),
+                                cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getCStride());
+                            singleCostV = m_rdCost->calcPsyRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section], psyEnergyV);
+                        }
+                        else
+                            singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
                         m_entropyCoder->resetBits();
                         m_entropyCoder->encodeQtCbfZero(cu, TEXT_CHROMA_V, trMode);
                         const uint32_t nullBitsV = m_entropyCoder->getNumberOfWrittenBits();
-                        const uint64_t nullCostV = m_rdCost->calcRdCost(distV, nullBitsV);
+                        uint64_t nullCostV = 0;
+                        if (m_rdCost->psyRdEnabled())
+                            nullCostV = m_rdCost->calcPsyRdCost(distV, nullBitsV, psyEnergyV);
+                        else
+                            nullCostV = m_rdCost->calcRdCost(distV, nullBitsV);
                         if (nullCostV < singleCostV)
                         {
                             absSum[TEXT_CHROMA_V][tuIterator.m_section] = 0;
@@ -3377,10 +3524,14 @@
                     m_entropyCoder->resetBits();
                     m_entropyCoder->encodeQtCbfZero(cu, TEXT_CHROMA_V, trModeC);
                     const uint32_t nullBitsV = m_entropyCoder->getNumberOfWrittenBits();
-                    minCost[TEXT_CHROMA_V][tuIterator.m_section] = m_rdCost->calcRdCost(distV, nullBitsV);
+                    if (m_rdCost->psyRdEnabled())
+                        minCost[TEXT_CHROMA_V][tuIterator.m_section] = m_rdCost->calcPsyRdCost(distV, nullBitsV, psyEnergyV);
+                    else
+                        minCost[TEXT_CHROMA_V][tuIterator.m_section] = m_rdCost->calcRdCost(distV, nullBitsV);
                 }
 
                 singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = distV;
+                singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = psyEnergyV;
 
                 if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])
                 {
@@ -3438,7 +3589,22 @@
                 nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
                                                            curResiY, strideResiY);
 
-                singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
+                if (m_rdCost->psyRdEnabled())
+                {
+                    pixel*   pred = predYuv->getLumaAddr(absPartIdx);
+                    uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
+                    pixel*   reconIPred = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
+                    uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
+                    uint32_t stride = fencYuv->getStride();
+                    //===== reconstruction =====
+                    primitives.luma_add_ps[partSize](reconIPred, reconIPredStride, pred, curResiY, stride, strideResiY);
+                    int size = g_convertToBit[trSize];
+                    psyEnergyY = m_rdCost->psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
+                        cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getStride());
+                    singleCostY = m_rdCost->calcPsyRdCost(nonZeroDistY, skipSingleBitsY, psyEnergyY);
+                }
+                else
+                    singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
             }
 
             if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)
@@ -3450,6 +3616,7 @@
             else
             {
                 singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
+                singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
                 absSum[TEXT_LUMA][0] = absSumTransformSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
             }
@@ -3527,7 +3694,22 @@
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
                                                                  curResiU, strideResiC);
                     nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
-                    singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
+                    if (m_rdCost->psyRdEnabled())
+                    {
+                        pixel*   pred = predYuv->getCbAddr(absPartIdxC);
+                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        pixel*   reconIPred = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
+                        uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
+                        uint32_t stride = fencYuv->getCStride();
+                        //===== reconstruction =====
+                        primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, curResiU, stride, strideResiC);
+                        int size = g_convertToBit[trSizeC];
+                        psyEnergyU = m_rdCost->psyCost(size, fencYuv->getCbAddr(absPartIdxC), fencYuv->getCStride(),
+                            cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getCStride());
+                        singleCostU = m_rdCost->calcPsyRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section], psyEnergyU);
+                    }
+                    else
+                        singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
                 }
 
                 if (!absSumTransformSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
@@ -3540,6 +3722,7 @@
                 else
                 {
                     singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroDistU;
+                    singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = psyEnergyU;
                     absSum[TEXT_CHROMA_U][tuIterator.m_section] = absSumTransformSkipU;
                     bestTransformMode[TEXT_CHROMA_U][tuIterator.m_section] = 1;
                 }
@@ -3561,8 +3744,23 @@
                                                                  curResiV, strideResiC);
                     nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
                     singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
+                    if (m_rdCost->psyRdEnabled())
+                    {
+                        pixel*   pred = predYuv->getCrAddr(absPartIdxC);
+                        uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
+                        pixel*   reconIPred = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
+                        uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
+                        uint32_t stride = fencYuv->getCStride();
+                        //===== reconstruction =====
+                        primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, curResiV, stride, strideResiC);
+                        int size = g_convertToBit[trSizeC];
+                        psyEnergyV = m_rdCost->psyCost(size, fencYuv->getCrAddr(absPartIdxC), fencYuv->getCStride(),
+                            cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder), cu->getPic()->getPicYuvRec()->getCStride());
+                        singleCostV = m_rdCost->calcPsyRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section], psyEnergyV);
+                    }
+                    else
+                        singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
                 }
-
                 if (!absSumTransformSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
                 {
                     cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -3573,6 +3771,7 @@
                 else
                 {
                     singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroDistV;
+                    singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = psyEnergyV;
                     absSum[TEXT_CHROMA_V][tuIterator.m_section] = absSumTransformSkipV;
                     bestTransformMode[TEXT_CHROMA_V][tuIterator.m_section] = 1;
                 }
@@ -3636,6 +3835,7 @@
         }
 
         singleDist += singleDistComp[TEXT_LUMA][0];
+        singlePsyEnergy += singlePsyEnergyComp[TEXT_LUMA][0]; // TODO: check whether the chroma psy energy needs to be added as well
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
         {
             singleDist += singleDistComp[TEXT_CHROMA_U][subTUIndex];
@@ -3643,7 +3843,11 @@
         }
 
         singleBits = m_entropyCoder->getNumberOfWrittenBits();
-        singleCost = m_rdCost->calcRdCost(singleDist, singleBits);
+
+        if (m_rdCost->psyRdEnabled())
+            singleCost = m_rdCost->calcPsyRdCost(singleDist, singleBits, singlePsyEnergy);
+        else
+            singleCost = m_rdCost->calcRdCost(singleDist, singleBits);
 
         bestCBF[TEXT_LUMA] = cu->getCbf(absPartIdx, TEXT_LUMA, trMode);
         if (bCodeChroma)
@@ -3661,6 +3865,7 @@
                 }
             }
         }
+        //cu->m_psyEnergy = singlePsyEnergy;
     }
 
     // code sub-blocks
@@ -3674,7 +3879,7 @@
         uint32_t subdivDist = 0;
         uint32_t subdivBits = 0;
         uint64_t subDivCost = 0;
-
+        uint32_t subDivPsyEnergy = 0;
         bestCBF[TEXT_LUMA] = cu->getCbf(absPartIdx, TEXT_LUMA, trMode);
         if (bCodeChroma)
         {
@@ -3695,7 +3900,9 @@
         const uint32_t qPartNumSubdiv = cu->getPic()->getNumPartInCU() >> ((depth + 1) << 1);
         for (uint32_t i = 0; i < 4; ++i)
         {
-            xEstimateResidualQT(cu, absPartIdx + i * qPartNumSubdiv, resiYuv, depth + 1, subDivCost, subdivBits, subdivDist, bCheckFull ? NULL : outZeroDist);
+            cu->m_psyEnergy = 0;
+            xEstimateResidualQT(cu, absPartIdx + i * qPartNumSubdiv, fencYuv, predYuv, resiYuv, depth + 1, subDivCost, subdivBits, subdivDist, bCheckFull ? NULL : outZeroDist);
+            subDivPsyEnergy += cu->m_psyEnergy;
         }
 
         uint32_t ycbf = 0;
@@ -3724,8 +3931,10 @@
         xEncodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V);
 
         subdivBits = m_entropyCoder->getNumberOfWrittenBits();
-        subDivCost  = m_rdCost->calcRdCost(subdivDist, subdivBits);
-
+        if (m_rdCost->psyRdEnabled())
+            subDivCost = m_rdCost->calcPsyRdCost(subdivDist, subdivBits, subDivPsyEnergy);
+        else
+            subDivCost = m_rdCost->calcRdCost(subdivDist, subdivBits);
         if (ycbf || ucbf || vcbf || !bCheckFull)
         {
             if (subDivCost < singleCost)
@@ -3733,8 +3942,11 @@
                 rdCost += subDivCost;
                 outBits += subdivBits;
                 outDist += subdivDist;
+                cu->m_psyEnergy = subDivPsyEnergy;
                 return;
             }
+            else
+                cu->m_psyEnergy = singlePsyEnergy;
         }
 
         cu->setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
@@ -3758,7 +3970,7 @@
     rdCost += singleCost;
     outBits += singleBits;
     outDist += singleDist;
-
+    cu->m_psyEnergy = singlePsyEnergy;
     cu->setTrIdxSubParts(trMode, absPartIdx, depth);
     cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
diff -r 1f4757508807 -r dbe573edb345 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h	Tue Jun 10 16:15:58 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Tue Jun 10 17:47:15 2014 +0530
@@ -179,7 +179,7 @@
 
     void generateCoeffRecon(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, bool skipRes);
 
-    void xEstimateResidualQT(TComDataCU* cu, uint32_t absPartIdx, ShortYuv* resiYuv, uint32_t depth,
+    void xEstimateResidualQT(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, uint32_t depth,
                              uint64_t &rdCost, uint32_t &outBits, uint32_t &outDist, uint32_t *puiZeroDist, bool curUseRDOQ = true);
     void xSetResidualQTData(TComDataCU* cu, uint32_t absPartIdx, ShortYuv* resiYuv, uint32_t depth, bool bSpatial);
 



More information about the x265-devel mailing list