[x265-commits] [x265] quant: don't bother with extra temp variables

Wed Jul 30 11:02:41 CEST 2014

details:   http://hg.videolan.org/x265/rev/5210fca67553
branches:  
changeset: 7633:5210fca67553
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 00:21:31 2014 -0500
description:
quant: don't bother with extra temp variables
Subject: [x265] quant: return signal cost from getRateLast(), do not include lambda

details:   http://hg.videolan.org/x265/rev/e0320502f9ea
branches:  
changeset: 7634:e0320502f9ea
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 00:21:57 2014 -0500
description:
quant: return signal cost from getRateLast(), do not include lambda
Subject: [x265] quant: pass curCostSig to getCodedLevel as an integer

details:   http://hg.videolan.org/x265/rev/4cb71a283ae3
branches:  
changeset: 7635:4cb71a283ae3
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 00:40:26 2014 -0500
description:
quant: pass curCostSig to getCodedLevel as an integer
Subject: [x265] quant: readability nit

details:   http://hg.videolan.org/x265/rev/ddef8e2d88fd
branches:  
changeset: 7636:ddef8e2d88fd
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 00:42:52 2014 -0500
description:
quant: readability nit
Subject: [x265] quant: rename absLevel to level, remove diffLevel

details:   http://hg.videolan.org/x265/rev/09ae268bb0ce
branches:  
changeset: 7637:09ae268bb0ce
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 00:44:03 2014 -0500
description:
quant: rename absLevel to level, remove diffLevel
Subject: [x265] quant: minor cleanups

details:   http://hg.videolan.org/x265/rev/df8314a1d3cb
branches:  
changeset: 7638:df8314a1d3cb
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 02:46:54 2014 -0500
description:
quant: minor cleanups
Subject: [x265] psy-rdoq: implementation of psy-rdoq  (highly experimental)

details:   http://hg.videolan.org/x265/rev/06dcd7c5df6e
branches:  
changeset: 7639:06dcd7c5df6e
user:      Sumalatha Polureddy <sumalatha at multicorewareinc.com>
date:      Fri Jul 25 15:28:47 2014 +0530
description:
psy-rdoq: implementation of psy-rdoq  (highly experimental)

This initial version is storing a temp variable in TComTrQuant to avoid adding
even more parameters to getCodedLevel() and it is ignoring scaling lists in the
unquant operation. Currently, you may need large psy-rdoq scale values to have
any real effect.  It needs lots of testing.
Subject: [x265] defs: remove DISTORTION_PRECISION_ADJUSTMENT, fix bug in 2e22ea6ec4bc

details:   http://hg.videolan.org/x265/rev/38349967645f
branches:  
changeset: 7640:38349967645f
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 03:17:30 2014 -0500
description:
defs: remove DISTORTION_PRECISION_ADJUSTMENT, fix bug in 2e22ea6ec4bc

FULL_NBIT was disabling this macro even for 16bpp builds, but I accidentally
enabled it. Since it was previously disabled for every build, and is only
present in SAO and quant, it is best to just remove it completely.
Subject: [x265] param: disable range checks for psy-rdoq while we tune it

details:   http://hg.videolan.org/x265/rev/3d814fd1268b
branches:  
changeset: 7641:3d814fd1268b
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 30 03:44:24 2014 -0500
description:
param: disable range checks for psy-rdoq while we tune it

diffstat:

 source/Lib/TLibCommon/CommonDef.h                   |    6 -
 source/Lib/TLibCommon/TComTrQuant.cpp               |  101 +++++++++++++------
 source/Lib/TLibCommon/TComTrQuant.h                 |   17 ++-
 source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp |   19 +--
 source/Lib/TLibEncoder/TEncSearch.cpp               |   34 +++---
 source/Lib/TLibEncoder/TEncSearch.h                 |    2 +-
 source/common/param.cpp                             |    2 +-
 source/common/scalinglist.cpp                       |    3 +-
 source/encoder/analysis.cpp                         |    2 +-
 source/encoder/encoder.cpp                          |   10 +-
 10 files changed, 114 insertions(+), 82 deletions(-)

diffs (truncated from 618 to 300 lines):

diff -r 05132ebe8413 -r 3d814fd1268b source/Lib/TLibCommon/CommonDef.h

--- a/source/Lib/TLibCommon/CommonDef.h	Tue Jul 29 18:56:48 2014 -0700
+++ b/source/Lib/TLibCommon/CommonDef.h	Wed Jul 30 03:44:24 2014 -0500
@@ -86,12 +86,6 @@
 #define MDCS_ANGLE_LIMIT            4 // distance from true angle that horiz or vertical scan is allowed
 #define MDCS_LOG2_MAX_SIZE          3 // TUs with log2 of size greater than this can only use diagonal scan
 
-#if HIGH_BIT_DEPTH
-# define DISTORTION_PRECISION_ADJUSTMENT(x) (x)
-#else
-# define DISTORTION_PRECISION_ADJUSTMENT(x) 0
-#endif
-
 #define MAX_NUM_REF_PICS            16 // max. number of pictures used for reference
 #define MAX_NUM_REF                 16 // max. number of entries in picture reference list
 
diff -r 05132ebe8413 -r 3d814fd1268b source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Tue Jul 29 18:56:48 2014 -0700
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Wed Jul 30 03:44:24 2014 -0500
@@ -39,6 +39,8 @@
 
 using namespace x265;
 
+#define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
+
 namespace {
 
 struct coeffGroupRDStats
@@ -174,19 +176,26 @@ inline uint32_t getICRateCost(uint32_t a
 TComTrQuant::TComTrQuant()
 {
     m_resiDctCoeff = NULL;
+    m_fencDctCoeff = NULL;
+    m_fencShortBuf = NULL;
 }
 
-bool TComTrQuant::init(bool useRDOQ, const ScalingList& scalingList)
+bool TComTrQuant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList)
 {
     m_useRDOQ = useRDOQ;
+    m_psyRdoqScale = (uint64_t)(psyScale * 256.0);
     m_scalingList = &scalingList;
-    m_resiDctCoeff = X265_MALLOC(coeff_t, MAX_CU_SIZE * MAX_CU_SIZE);
-    return m_resiDctCoeff;
+    m_resiDctCoeff = X265_MALLOC(coeff_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
+    m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
+    m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
+    
+    return m_resiDctCoeff && m_fencShortBuf;
 }
 
 TComTrQuant::~TComTrQuant()
 {
     X265_FREE(m_resiDctCoeff);
+    X265_FREE(m_fencShortBuf);
 }
 
 void TComTrQuant::setQPforQuant(TComDataCU* cu)
@@ -350,6 +359,8 @@ uint32_t TComTrQuant::quant(TComDataCU* 
 }
 
 uint32_t TComTrQuant::transformNxN(TComDataCU* cu,
+                                   pixel*      fenc,
+                                   uint32_t    fencStride,
                                    int16_t*    residual,
                                    uint32_t    stride,
                                    coeff_t*    coeff,
@@ -394,12 +405,20 @@ uint32_t TComTrQuant::transformNxN(TComD
     }
     else
     {
-        // TODO: this may need larger data types for X265_DEPTH > 10
         const uint32_t sizeIdx = log2TrSize - 2;
-        int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA);
+        int useDST = !sizeIdx && ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA;
         int index = DCT_4x4 + sizeIdx - useDST;
+        if (m_psyRdoqScale && ttype == TEXT_LUMA)
+        {
+            // converting pixel to short for input to dct and psy-rdoq eval
+            // TODO: can this be re-used? should it be performed by caller?
+            primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
+            primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
+        }
+
         primitives.dct[index](residual, m_resiDctCoeff, stride);
-        if (m_nr->bNoiseReduction && index)
+
+        if (m_nr->bNoiseReduction && !useDST)
         {
             denoiseDct(m_resiDctCoeff, m_nr->residualSum[sizeIdx], m_nr->offset[sizeIdx], (16 << sizeIdx * 2));
             m_nr->count[sizeIdx]++;
@@ -490,6 +509,7 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
     uint32_t trSize = 1 << log2TrSize;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
     int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
+    m_transformShift = transformShift;
 
     X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
 
@@ -511,6 +531,7 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
     selectLambda(ttype);
 
     double *errScale = m_scalingList->m_errScale[log2TrSize - 2][scalingListType][rem];
+    bool usePsy = m_psyRdoqScale && ttype == TEXT_LUMA;
 
     double blockUncodedCost = 0;
     double costCoeff[32 * 32];
@@ -524,8 +545,8 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
     TUEntropyCodingParameters codingParameters;
     getTUEntropyCodingParameters(cu, codingParameters, absPartIdx, log2TrSize, ttype);
 
-    const uint32_t cgSize = (1 << MLS_CG_SIZE); // 16
-    double costCoeffGroupSig[MLS_GRP_NUM];
+    const uint32_t cgSize = (1 << MLS_CG_SIZE); // 4x4 coef = 16
+    double costCoeffGroupSig[MLS_GRP_NUM];      // 32x32 has 64 4x4 coding groups
     uint64_t sigCoeffGroupFlag64 = 0;
     uint32_t ctxSet      = 0;
     int    c1            = 1;
@@ -599,7 +620,7 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
                     level = getCodedLevel(costCoeff[scanPos], 0, costSig[scanPos],
                                           levelDouble, maxAbsLevel, baseLevel,
                                           greaterOneBits, levelAbsBits, goRiceParam,
-                                          c1c2Idx, qbits, scaleFactor);
+                                          c1c2Idx, qbits, scaleFactor, blkPos, usePsy);
                     sigRateDelta[blkPos] = 0;
                 }
                 else
@@ -613,10 +634,10 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
                     }
                     if (maxAbsLevel)
                     {
-                        level = getCodedLevel(costCoeff[scanPos], m_lambda * m_estBitsSbac.significantBits[ctxSig][1], costSig[scanPos],
+                        level = getCodedLevel(costCoeff[scanPos], m_estBitsSbac.significantBits[ctxSig][1], costSig[scanPos],
                                               levelDouble, maxAbsLevel, baseLevel,
                                               greaterOneBits, levelAbsBits, goRiceParam,
-                                              c1c2Idx, qbits, scaleFactor);
+                                              c1c2Idx, qbits, scaleFactor, blkPos, usePsy);
                     }
                     else
                         level = 0;
@@ -799,7 +820,7 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
             {
                 uint32_t posY = blkPos >> log2TrSize;
                 uint32_t posX = blkPos - (posY << log2TrSize);
-                double costLast = codingParameters.scanType == SCAN_VER ? getRateLast(posY, posX) : getRateLast(posX, posY);
+                double costLast = m_lambda * (codingParameters.scanType == SCAN_VER ? getRateLast(posY, posX) : getRateLast(posX, posY));
                 double totalCost = baseCost + costLast - costSig[scanPos];
 
                 if (totalCost < bestCost)
@@ -840,9 +861,8 @@ uint32_t TComTrQuant::rdoQuant(TComDataC
     if (cu->m_slice->m_pps->bSignHideEnabled && numSig >= 2)
     {
         // Note:: the scaling list is being ignored in this optimization
-        int prec = DISTORTION_PRECISION_ADJUSTMENT(2 * (X265_DEPTH - 8));
         int64_t invQuant = ScalingList::s_invQuantScales[rem] << per;
-        int64_t rdFactor = (int64_t)((invQuant * invQuant) / (m_lambda * (16 << prec)) + 0.5);
+        int64_t rdFactor = (int64_t)((invQuant * invQuant) / (m_lambda * 16) + 0.5);
 
         int lastCG = 1;
         for (int subSet = cgLastScanPos; subSet >= 0; subSet--)
@@ -1049,7 +1069,7 @@ uint32_t TComTrQuant::getSigCtxInc(const
  * This method calculates the best quantized transform level for a given scan position.
  */
 inline uint32_t TComTrQuant::getCodedLevel(double&      codedCost,
-                                           const double curCostSig,
+                                           uint32_t     curCostSig,
                                            double&      codedCostSig,
                                            int          levelDouble,
                                            uint32_t     maxAbsLevel,
@@ -1059,15 +1079,18 @@ inline uint32_t TComTrQuant::getCodedLev
                                            uint32_t     absGoRice,
                                            uint32_t     c1c2Idx,
                                            int          qbits,
-                                           double       scaleFactor) const
+                                           double       scaleFactor,
+                                           int          blkPos,
+                                           bool         usePsy) const
 {
+    X265_CHECK(abs((double)levelDouble - (maxAbsLevel << qbits)) < INT_MAX, "levelDouble range check failure\n");
+
     uint32_t bestAbsLevel = 0;
     int32_t minAbsLevel = maxAbsLevel - 1;
     if (minAbsLevel < 1)
         minAbsLevel = 1;
 
     // NOTE: (A + B) ^ 2 = (A ^ 2) + 2 * A * B + (B ^ 2)
-    X265_CHECK(abs((double)levelDouble - (maxAbsLevel << qbits)) < INT_MAX, "levelDouble range check failure\n");
     const int32_t err1 = levelDouble - (maxAbsLevel << qbits);            // A
     double err2 = (double)((int64_t)err1 * err1);                         // A ^ 2
     const int64_t err3 = (int64_t)2 * err1 * ((int64_t)1 << qbits);       // 2 * A * B
@@ -1076,27 +1099,39 @@ inline uint32_t TComTrQuant::getCodedLev
 
     err2 *= scaleFactor;
 
-    double bestCodedCost = codedCost;
-    double bestCodedCostSig = codedCostSig;
-    int diffLevel = maxAbsLevel - baseLevel;
-    for (int absLevel = maxAbsLevel; absLevel >= minAbsLevel; absLevel--)
+    int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - m_transformShift;
+    int add = (1 << shift) - 1;
+    int scale = m_scalingList->s_invQuantScales[m_qpParam[0].rem] << m_qpParam[0].per;
+    int scaleBits = SCALE_BITS - 2 * m_transformShift;
+
+    for (int level = maxAbsLevel; level >= minAbsLevel; level--)
     {
-        X265_CHECK(fabs((double)err2 - double(levelDouble  - (absLevel << qbits)) * double(levelDouble  - (absLevel << qbits)) * scaleFactor) < 1e-5, "err2 check failure\n");
-        double curCost = err2 + m_lambda * getICRateCost(absLevel, diffLevel, greaterOneBits, levelAbsBits, absGoRice, c1c2Idx);
-        curCost       += curCostSig;
+        X265_CHECK(fabs((double)err2 - double(levelDouble - (level << qbits)) * double(levelDouble - (level << qbits)) * scaleFactor) < 1e-5, "err2 check failure\n");
 
-        if (curCost < bestCodedCost)
+        uint32_t rateCost = getICRateCost(level, level - baseLevel, greaterOneBits, levelAbsBits, absGoRice, c1c2Idx);
+        double curCost = err2 + m_lambda * (curCostSig + rateCost);
+
+        /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame. */
+        if (usePsy && blkPos)
         {
-            bestAbsLevel = absLevel;
-            bestCodedCost = curCost;
-            bestCodedCostSig = curCostSig;
+            int signCoef = m_resiDctCoeff[blkPos];
+            int unquantAbsLevel = (level * scale + add) >> shift;
+            int predictedCoef = m_fencDctCoeff[blkPos] - signCoef;
+            int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)) << scaleBits;
+            int psyValue = (m_psyRdoqScale * reconCoef) >> 8;
+            curCost -= psyValue;
         }
+
+        if (curCost < codedCost)
+        {
+            bestAbsLevel = level;
+            codedCost = curCost;
+            codedCostSig = m_lambda * curCostSig;
+        }
+
         err2 += errInc;
-        diffLevel--;
     }
 
-    codedCost = bestCodedCost;
-    codedCostSig = bestCodedCostSig;
     return bestAbsLevel;
 }
 
@@ -1105,7 +1140,7 @@ inline uint32_t TComTrQuant::getCodedLev
  * \param posy Y coordinate of the last significant coefficient
  * \returns cost of last significant coefficient
  */
-inline double TComTrQuant::getRateLast(uint32_t posx, uint32_t posy) const
+inline uint32_t TComTrQuant::getRateLast(uint32_t posx, uint32_t posy) const
 {
     uint32_t ctxX = getGroupIdx(posx);
     uint32_t ctxY = getGroupIdx(posy);
@@ -1116,7 +1151,7 @@ inline double TComTrQuant::getRateLast(u
 
     cost += maskX & (IEP_RATE * ((ctxX - 2) >> 1));
     cost += maskY & (IEP_RATE * ((ctxY - 2) >> 1));
-    return m_lambda * cost;
+    return cost;
 }
 
 /** Context derivation process of coeff_abs_significant_flag
diff -r 05132ebe8413 -r 3d814fd1268b source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h	Tue Jul 29 18:56:48 2014 -0700
+++ b/source/Lib/TLibCommon/TComTrQuant.h	Wed Jul 30 03:44:24 2014 -0500
@@ -98,14 +98,14 @@ public:
     ~TComTrQuant();
 
     /* one-time setup */
-    bool init(bool useRDOQ, const ScalingList& scalingList);
+    bool init(bool useRDOQ, double scale, const ScalingList& scalingList);
 
     /* CU setup */
     void setQPforQuant(TComDataCU* cu);
     void setLambdas(double lambdaY, double lambdaCb, double lambdaCr) { m_lambdas[0] = lambdaY; m_lambdas[1] = lambdaCb; m_lambdas[2] = lambdaCr; }
 
-    uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize,
-                          TextType ttype, uint32_t absPartIdx, bool useTransformSkip, bool curUseRDOQ);
+    uint32_t transformNxN(TComDataCU* cu, pixel *fenc, uint32_t fencstride, int16_t* residual, uint32_t stride, coeff_t* coeff,
+                          uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip, bool curUseRDOQ);
 
     void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
@@ -119,7 +119,10 @@ public:
     double   m_lambdas[3];
 
     bool     m_useRDOQ;
+    uint64_t m_psyRdoqScale;
     coeff_t* m_resiDctCoeff;
+    coeff_t* m_fencDctCoeff;
+    int16_t* m_fencShortBuf;
 
     static const uint32_t IEP_RATE = 32768; // cost of an equal probable bit
 
@@ -135,11 +138,13 @@ protected:
 
     uint32_t rdoQuant(TComDataCU* cu, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
 
-    inline uint32_t getCodedLevel(double& codedCost, const double curCostSig, double& codedCostSig, int levelDouble,
+    inline uint32_t getCodedLevel(double& codedCost, uint32_t curCostSig, double& codedCostSig, int levelDouble,