[x265] [PATCH RFC] adapt psy-rd from x264

Fri May 9 03:07:37 CEST 2014

# HG changeset patch
# User Sumalatha Polureddy<sumalatha at multicorewareinc.com>
# Date 1399547936 -19800
#      Thu May 08 16:48:56 2014 +0530
# Node ID 51e8421442b54929dc6d40f1ae9745faefa1c57c
# Parent  4a36a281e77c479bb9ecff6a09d3b2da5b216668
adapt psy-rd from x264

In this initial implementation, we only use sa8d to estimate the energy of the
source and reconstructed blocks. I'd like feedback specifically on the new
psyCost() and calcPsyRdCost() methods of TComRdCost.

diff -r 4a36a281e77c -r 51e8421442b5 doc/reST/cli.rst

--- a/doc/reST/cli.rst	Thu May 08 17:39:10 2014 -0500
+++ b/doc/reST/cli.rst	Thu May 08 16:48:56 2014 +0530
@@ -617,6 +617,12 @@
 
 	**Range of values:** 0: least .. 6: full RDO analysis
 
+.. option:: --psy-rd <float>
+
+	Influence rate distortion optimizations to try to preserve the
+	energy of the source image in the encoded image, at the expense of
+	compression efficiency. Default 1.0
+
 .. option:: --signhide, --no-signhide
 
 	Hide sign bit of one coeff per TU (rdo). Default enabled
diff -r 4a36a281e77c -r 51e8421442b5 source/Lib/TLibCommon/TComRdCost.h
--- a/source/Lib/TLibCommon/TComRdCost.h	Thu May 08 17:39:10 2014 -0500
+++ b/source/Lib/TLibCommon/TComRdCost.h	Thu May 08 16:48:56 2014 +0530
@@ -54,16 +54,20 @@
 {
 private:
 
-    uint64_t  m_lambdaMotionSSE;  // m_lambda2 w/ 16 bits of fraction
+    uint64_t  m_lambdaMotionSSE;  // m_lambda2 w/ 8 bits of fraction
 
-    uint64_t  m_lambdaMotionSAD;  // m_lambda w/ 16 bits of fraction
+    uint64_t  m_lambdaMotionSAD;  // m_lambda w/ 8 bits of fraction
 
     uint64_t  m_cbDistortionWeight;
 
     uint64_t  m_crDistortionWeight;
 
+    uint64_t  m_psyRdScale;            // Psy RD strength w/ 8 bits of fraction
+
 public:
 
+    static const pixel zeroPel[MAX_CU_SIZE * MAX_CU_SIZE];
+
     void setLambda(double lambda2, double lambda)
     {
         m_lambdaMotionSSE = (uint64_t)floor(256.0 * lambda2);
@@ -80,6 +84,16 @@
         m_crDistortionWeight = (uint64_t)floor(256.0 * crDistortionWeight);
     }
 
+    void setPsyRdScale(double scale)
+    {
+        m_psyRdScale = (uint64_t)floor(256.0 * scale);
+    }
+
+    inline bool psyRdEnabled() const
+    {
+        return !!m_psyRdScale;
+    }
+
     inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits)
     {
         X265_CHECK(abs(distortion + ((bits * m_lambdaMotionSSE + 128) >> 8)) -
@@ -88,6 +102,23 @@
         return distortion + ((bits * m_lambdaMotionSSE + 128) >> 8);
     }
 
+    /* return the difference in energy between the source block and the recon block */
+    inline uint32_t psyCost(int size, pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
+    {
+        return abs(primitives.sa8d[size](source, sstride, (pixel*)zeroPel, MAX_CU_SIZE) -
+                   primitives.sa8d[size](recon,  rstride, (pixel*)zeroPel, MAX_CU_SIZE));
+    }
+
+    /* return the RD cost of this prediction, including the effect of psy-rd */
+    inline uint64_t calcPsyRdCost(uint32_t distortion, uint32_t bits, uint32_t psycost)
+    {
+        uint64_t tot = bits + (((psycost * m_psyRdScale) + 128) >> 8);
+        X265_CHECK(abs(distortion + ((tot * m_lambdaMotionSSE + 128) >> 8)) -
+                      (distortion + (float)tot * m_lambdaMotionSSE / 256.0) < 2,
+                   "calcPsyRdCost wrap detected dist: %d, tot %d, lambda: %d\n", distortion, tot, (int)m_lambdaMotionSSE);
+        return distortion + ((tot * m_lambdaMotionSSE + 128) >> 8);
+    }
+
     inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits)
     {
         X265_CHECK(abs(sadCost + ((bits * m_lambdaMotionSAD + 128) >> 8)) -
diff -r 4a36a281e77c -r 51e8421442b5 source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Thu May 08 17:39:10 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Thu May 08 16:48:56 2014 +0530
@@ -1394,9 +1394,20 @@
     m_entropyCoder->encodeCoeff(outTempCU, 0, depth, outTempCU->getCUSize(0), outTempCU->getCUSize(0), bCodeDQP);
 
     m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
+    outTempCU->m_totalBits = m_entropyCoder->getNumberOfWrittenBits();
 
-    outTempCU->m_totalBits = m_entropyCoder->getNumberOfWrittenBits();
-    outTempCU->m_totalCost = m_rdCost->calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
+    if (m_rdCost->psyRdEnabled())
+    {
+        int part = g_convertToBit[outTempCU->getCUSize(0)];
+        TComPicYuv *recon = outTempCU->getPic()->getPicYuvRec();
+        uint32_t psyRdCost = m_rdCost->psyCost(part, m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
+                                                     recon->getLumaAddr(outTempCU->getAddr()), recon->getStride());
+        outTempCU->m_totalCost = m_rdCost->calcPsyRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits, psyRdCost);
+    }
+    else
+    {
+        outTempCU->m_totalCost = m_rdCost->calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
+    }
 
     xCheckDQP(outTempCU);
     xCheckBestMode(outBestCU, outTempCU, depth);
@@ -1431,12 +1442,21 @@
     // Encode Coefficients
     bool bCodeDQP = getdQPFlag();
     m_entropyCoder->encodeCoeff(outTempCU, 0, depth, outTempCU->getCUSize(0), outTempCU->getCUSize(0), bCodeDQP);
+    m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
+    outTempCU->m_totalBits = m_entropyCoder->getNumberOfWrittenBits();
 
-    m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
-
-    outTempCU->m_totalBits = m_entropyCoder->getNumberOfWrittenBits();
-    outTempCU->m_totalCost = m_rdCost->calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
-
+    if (m_rdCost->psyRdEnabled())
+    {
+        int part = g_convertToBit[outTempCU->getCUSize(0)];
+        TComPicYuv *recon = outTempCU->getPic()->getPicYuvRec();
+        uint32_t psyRdCost = m_rdCost->psyCost(part, m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
+            recon->getLumaAddr(outTempCU->getAddr()), recon->getStride());
+        outTempCU->m_totalCost = m_rdCost->calcPsyRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits, psyRdCost);
+    }
+    else
+    {
+        outTempCU->m_totalCost = m_rdCost->calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
+    }
     xCheckDQP(outTempCU);
     xCheckBestMode(outBestCU, outTempCU, depth);
 }
diff -r 4a36a281e77c -r 51e8421442b5 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu May 08 17:39:10 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu May 08 16:48:56 2014 +0530
@@ -47,8 +47,7 @@
 
 using namespace x265;
 
-//! \ingroup TLibEncoder
-//! \{
+ALIGN_VAR_32(const pixel, TComRdCost::zeroPel[MAX_CU_SIZE * MAX_CU_SIZE]) = { 0 };
 
 TEncSearch::TEncSearch()
 {
@@ -2695,7 +2694,17 @@
 
         cu->m_totalBits       = bits;
         cu->m_totalDistortion = distortion;
-        cu->m_totalCost       = m_rdCost->calcRdCost(distortion, bits);
+        if (m_rdCost->psyRdEnabled())
+        {
+            int part = g_convertToBit[cu->getCUSize(0)];
+            uint32_t psyRdCost = m_rdCost->psyCost(part, fencYuv->getLumaAddr(), fencYuv->getStride(),
+                                                         outReconYuv->getLumaAddr(), outReconYuv->getStride());
+            cu->m_totalCost = m_rdCost->calcPsyRdCost(cu->m_totalDistortion, cu->m_totalBits, psyRdCost);
+        }
+        else
+        {
+            cu->m_totalCost = m_rdCost->calcRdCost(cu->m_totalDistortion, cu->m_totalBits);
+        }
 
         m_rdGoOnSbacCoder->store(m_rdSbacCoders[cu->getDepth(0)][CI_TEMP_BEST]);
 
@@ -3170,8 +3179,6 @@
 
         const uint32_t numSamplesLuma = 1 << (trSizeLog2 << 1);
 
-        ALIGN_VAR_32(static const pixel, zeroPel[MAX_CU_SIZE * MAX_CU_SIZE]) = { 0 };
-
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
         {
             minCost[TEXT_LUMA][subTUIndex]     = MAX_INT64;
@@ -3180,7 +3187,7 @@
         }
 
         int partSize = partitionFromSizes(trWidth, trHeight);
-        uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, (pixel*)zeroPel, trWidth);
+        uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, (pixel*)TComRdCost::zeroPel, trWidth);
 
         if (outZeroDist)
         {
@@ -3265,7 +3272,7 @@
             {
                 uint32_t subTUBufferOffset = widthC * heightC * tuIterator.m_section;
 
-                distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(tuIterator.m_absPartIdxTURelCU), resiYuv->m_cwidth, (pixel*)zeroPel, widthC));
+                distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(tuIterator.m_absPartIdxTURelCU), resiYuv->m_cwidth, (pixel*)TComRdCost::zeroPel, widthC));
 
                 if (outZeroDist)
                 {
@@ -3335,7 +3342,7 @@
                     primitives.blockfill_s[(int)g_convertToBit[widthC]](ptr, stride, 0);
                 }
 
-                distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(tuIterator.m_absPartIdxTURelCU), resiYuv->m_cwidth, (pixel*)zeroPel, widthC));
+                distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(tuIterator.m_absPartIdxTURelCU), resiYuv->m_cwidth, (pixel*)TComRdCost::zeroPel, widthC));
                 if (outZeroDist)
                 {
                     *outZeroDist += distV;
diff -r 4a36a281e77c -r 51e8421442b5 source/common/common.h
--- a/source/common/common.h	Thu May 08 17:39:10 2014 -0500
+++ b/source/common/common.h	Thu May 08 16:48:56 2014 +0530
@@ -165,6 +165,7 @@
 #define X265_LOG2F(x) log2f(x)
 #define X265_LOG2(x)  log2(x)
 #endif
+#define FIX8(f) ((int)(f * (1 << 8) + .5))
 
 /* defined in common.cpp */
 int64_t x265_mdate(void);
diff -r 4a36a281e77c -r 51e8421442b5 source/common/param.cpp
--- a/source/common/param.cpp	Thu May 08 17:39:10 2014 -0500
+++ b/source/common/param.cpp	Thu May 08 16:48:56 2014 +0530
@@ -156,6 +156,7 @@
     param->cbQpOffset = 0;
     param->crQpOffset = 0;
     param->rdPenalty = 0;
+    param->psyRd = 1.0;
 
     /* Rate control options */
     param->rc.vbvMaxBitrate = 0;
@@ -588,6 +589,7 @@
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
     OPT("crqpoffs") p->crQpOffset = atoi(value);
     OPT("rd") p->rdLevel = atoi(value);
+    OPT("psyrd") p->psyRd = atof(value);
     OPT("signhide") p->bEnableSignHiding = atobool(value);
     OPT("lft") p->bEnableLoopFilter = atobool(value);
     OPT("sao") p->bEnableSAO = atobool(value);
@@ -915,7 +917,7 @@
           "Aq-Mode is out of range");
     CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
           "Aq-Strength is out of range");
-
+    CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0");
     CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative");
     CHECK((param->vui.aspectRatioIdc < 0
            || param->vui.aspectRatioIdc > 16)
@@ -1061,6 +1063,7 @@
     x265_log(param, X265_LOG_INFO, "Lookahead / bframes / badapt        : %d / %d / %d\n", param->lookaheadDepth, param->bframes, param->bFrameAdaptive);
     x265_log(param, X265_LOG_INFO, "b-pyramid / weightp / weightb / refs: %d / %d / %d / %d\n",
              param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred, param->maxNumReferences);
+
     switch (param->rc.rateControlMode)
     {
     case X265_RC_ABR:
@@ -1091,6 +1094,7 @@
     TOOLOPT(param->bEnableConstrainedIntra, "cip");
     TOOLOPT(param->bEnableEarlySkip, "esd");
     fprintf(stderr, "rd=%d ", param->rdLevel);
+    fprintf(stderr, "psyrd=%.1lf ", param->psyRd);
 
     TOOLOPT(param->bEnableLoopFilter, "lft");
     if (param->bEnableSAO)
diff -r 4a36a281e77c -r 51e8421442b5 source/encoder/cturow.cpp
--- a/source/encoder/cturow.cpp	Thu May 08 17:39:10 2014 -0500
+++ b/source/encoder/cturow.cpp	Thu May 08 16:48:56 2014 +0530
@@ -34,7 +34,7 @@
     m_rdGoOnSbacCoder.init(&m_rdGoOnBinCodersCABAC);
     m_sbacCoder.init(&m_binCoderCABAC);
     m_trQuant.init(1 << top->m_quadtreeTULog2MaxSize, top->bEnableRDOQ, top->bEnableRDOQTS, top->param->bEnableTSkipFast);
-
+    m_rdCost.setPsyRdScale(top->param->rdLevel >= 4 ? top->param->psyRd : 0);
     m_rdSbacCoders = new TEncSbac * *[g_maxCUDepth + 1];
     m_binCodersCABAC = new TEncBinCABAC * *[g_maxCUDepth + 1];
 
diff -r 4a36a281e77c -r 51e8421442b5 source/x265.cpp
--- a/source/x265.cpp	Thu May 08 17:39:10 2014 -0500
+++ b/source/x265.cpp	Thu May 08 16:48:56 2014 +0530
@@ -140,6 +140,7 @@
     { "cbqpoffs",       required_argument, NULL, 0 },
     { "crqpoffs",       required_argument, NULL, 0 },
     { "rd",             required_argument, NULL, 0 },
+    { "psy-rd",         required_argument, NULL, 0 },
     { "no-signhide",          no_argument, NULL, 0 },
     { "signhide",             no_argument, NULL, 0 },
     { "no-lft",               no_argument, NULL, 0 },
@@ -379,6 +380,7 @@
     H0("   --cbqpoffs <integer>          Chroma Cb QP Offset. Default %d\n", param->cbQpOffset);
     H0("   --crqpoffs <integer>          Chroma Cr QP Offset. Default %d\n", param->crQpOffset);
     H0("   --rd <0..6>                   Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
+    H0("   --psy-rd <0..2.0>             Strength of psycho-visual optimization. Default %f\n", param->psyRd);
     H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
     H0("\nLoop filters (deblock and SAO):\n");
     H0("   --[no-]lft                    Enable Deblocking Loop Filter. Default %s\n", OPT(param->bEnableLoopFilter));
diff -r 4a36a281e77c -r 51e8421442b5 source/x265.h
--- a/source/x265.h	Thu May 08 17:39:10 2014 -0500
+++ b/source/x265.h	Thu May 08 16:48:56 2014 +0530
@@ -579,6 +579,11 @@
      * efficiency at a major cost of performance. Default is no RDO (0) */
     int       rdLevel;
 
+    /* Psycho-visual rate-distortion strength. Only has an effect in presets
+     * which use RDO. It makes mode decision favor options which preserve the
+     * energy of the source, at the cost of lost compression. Default 1.0 */
+    double     psyRd;
+
     /*== Coding tools ==*/
 
     /* Enable the implicit signaling of the sign bit of the last coefficient of