[x265] quant: returns numSig instead of absSum and lastPos

Mon Jul 7 17:27:20 CEST 2014

On Mon, Jul 7, 2014 at 3:04 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1404720026 -32400
> #      Mon Jul 07 17:00:26 2014 +0900
> # Node ID dcf6f2ce907c59eedc3d488a7f047a5f094bf925
> # Parent  11c808e562b894d84961cf00080173321e272884
> quant: returns numSig instead of absSum and lastPos

The whole patch is queued for testing, thanks.

> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/CommonDef.h
> --- a/source/Lib/TLibCommon/CommonDef.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/CommonDef.h Mon Jul 07 17:00:26 2014 +0900
> @@ -118,8 +118,6 @@
>  #define LOG2_MAX_COLUMN_WIDTH       13
>  #define LOG2_MAX_ROW_HEIGHT         13
>
> -#define REG_DCT                     65535
> -
>  #define CABAC_INIT_PRESENT_FLAG     1
>
>  #define MAX_GOP                     64          ///< max. value of hierarchical GOP size
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComSlice.cpp
> --- a/source/Lib/TLibCommon/TComSlice.cpp       Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComSlice.cpp       Mon Jul 07 17:00:26 2014 +0900
> @@ -476,7 +476,6 @@
>      , m_qpBDOffsetC(0)
>      , m_bitsForPOC(8)
>      , m_numLongTermRefPicSPS(0)
> -    , m_maxTrSize(32)
>      , m_bUseSAO(false)
>      , m_bTemporalIdNestingFlag(false)
>      , m_scalingListEnabledFlag(false)
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComSlice.h
> --- a/source/Lib/TLibCommon/TComSlice.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComSlice.h Mon Jul 07 17:00:26 2014 +0900
> @@ -825,9 +825,6 @@
>      uint32_t    m_ltRefPicPocLsbSps[33];
>      bool        m_usedByCurrPicLtSPSFlag[33];
>
> -    // Max physical transform size
> -    uint32_t    m_maxTrSize;
> -
>      int m_iAMPAcc[MAX_CU_DEPTH];
>      bool        m_bUseSAO;
>
> @@ -954,11 +951,6 @@
>
>      void      setTMVPFlagsPresent(bool b)   { m_TMVPFlagsPresent = b; }
>
> -    // physical transform
> -    void setMaxTrSize(uint32_t u)   { m_maxTrSize = u; }
> -
> -    uint32_t getMaxTrSize() const   { return m_maxTrSize; }
> -
>      // AMP accuracy
>      int       getAMPAcc(uint32_t depth) const { return m_iAMPAcc[depth]; }
>
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComTrQuant.cpp
> --- a/source/Lib/TLibCommon/TComTrQuant.cpp     Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComTrQuant.cpp     Mon Jul 07 17:00:26 2014 +0900
> @@ -143,7 +143,7 @@
>  }
>
>  // To minimize the distortion only. No rate is considered.
> -void TComTrQuant::signBitHidingHDQ(coeff_t* qCoef, coeff_t* coef, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters)
> +uint32_t TComTrQuant::signBitHidingHDQ(coeff_t* qCoef, coeff_t* coef, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters)
>  {
>      const uint32_t log2TrSizeCG = codingParameters.log2TrSizeCG;
>
> @@ -249,6 +249,11 @@
>                      finalChange = -1;
>                  }
>
> +                if (qCoef[minPos] == 0)
> +                    numSig++;
> +                else if (finalChange == -1 && abs(qCoef[minPos]) == 1)
> +                    numSig--;
> +
>                  if (coef[minPos] >= 0)
>                  {
>                      qCoef[minPos] += finalChange;
> @@ -261,12 +266,13 @@
>          }
>          lastCG = 0;
>      } // TU loop
> +
> +    return numSig;
>  }
>
> -uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, coeff_t* qCoef, int trSize,
> -                             TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
> +uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, coeff_t* qCoef, uint32_t log2TrSize,
> +                             TextType ttype, uint32_t absPartIdx)
>  {
> -    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
>      TUEntropyCodingParameters codingParameters;
>      getTUEntropyCodingParameters(cu, codingParameters, absPartIdx, log2TrSize, ttype);
>      int deltaU[32 * 32];
> @@ -281,13 +287,13 @@
>      int add = (cu->getSlice()->getSliceType() == I_SLICE ? 171 : 85) << (qbits - 9);
>
>      int numCoeff = 1 << log2TrSize * 2;
> -    uint32_t acSum = primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff, lastPos);
> +    uint32_t numSig = primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff);
>
> -    if (acSum >= 2 && cu->getSlice()->getPPS()->getSignHideFlag())
> +    if (numSig >= 2 && cu->getSlice()->getPPS()->getSignHideFlag())
>      {
> -        signBitHidingHDQ(qCoef, coef, deltaU, codingParameters);
> +        return signBitHidingHDQ(qCoef, coef, deltaU, numSig, codingParameters);
>      }
> -    return acSum;
> +    return numSig;
>  }
>
>  void TComTrQuant::init(bool useRDOQ)
> @@ -299,73 +305,65 @@
>                                     int16_t*    residual,
>                                     uint32_t    stride,
>                                     coeff_t*    coeff,
> -                                   uint32_t    trSize,
> +                                   uint32_t    log2TrSize,
>                                     TextType    ttype,
>                                     uint32_t    absPartIdx,
> -                                   int32_t*    lastPos,
>                                     bool        useTransformSkip,
>                                     bool        curUseRDOQ)
>  {
>      if (cu->getCUTransquantBypass(absPartIdx))
>      {
> -        uint32_t absSum = 0;
> -        for (uint32_t k = 0; k < trSize; k++)
> +        uint32_t numSig = 0;
> +        int trSize = 1 << log2TrSize;
> +        for (int k = 0; k < trSize; k++)
>          {
> -            for (uint32_t j = 0; j < trSize; j++)
> +            for (int j = 0; j < trSize; j++)
>              {
>                  coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
> -                absSum += abs(residual[k * stride + j]);
> +                numSig += (residual[k * stride + j] != 0);
>              }
>          }
>
> -        return absSum;
> +        return numSig;
>      }
>
> -    uint32_t mode; //luma intra pred
> -    if (ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA)
> +    X265_CHECK((cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() >= log2TrSize), "transform size too large\n");
> +    if (!useTransformSkip)
>      {
> -        mode = cu->getLumaIntraDir(absPartIdx);
> +        // TODO: this may need larger data types for X265_DEPTH > 8
> +        const uint32_t sizeIdx = log2TrSize - 2;
> +        int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA);
> +        int index = DCT_4x4 + sizeIdx - useDST;
> +        primitives.dct[index](residual, m_tmpCoeff, stride);
> +        if (m_nr->bNoiseReduction)
> +        {
> +            if (index > 0)
> +            {
> +                denoiseDct(m_tmpCoeff, m_nr->residualSum[sizeIdx], m_nr->offset[sizeIdx], (16 << sizeIdx * 2));
> +                m_nr->count[sizeIdx]++;
> +            }
> +        }
>      }
>      else
>      {
> -        mode = REG_DCT;
> -    }
> -
> -    X265_CHECK((cu->getSlice()->getSPS()->getMaxTrSize() >= trSize), "transform size too large\n");
> -    if (useTransformSkip)
> -    {
> -        xTransformSkip(residual, stride, m_tmpCoeff, trSize);
> -    }
> -    else
> -    {
> -        // TODO: this may need larger data types for X265_DEPTH > 8
> -        const uint32_t log2BlockSize = g_convertToBit[trSize];
> -        primitives.dct[DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
> -        if (m_nr->bNoiseReduction)
> -        {
> -            int index = (DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT)));
> -            if (index > 0 && index < 5)
> -            {
> -                denoiseDct(m_tmpCoeff, m_nr->residualSum[index - 1], m_nr->offset[index - 1], (16 << (index - 1) * 2));
> -                m_nr->count[index - 1]++;
> -            }
> -        }
> +        xTransformSkip(residual, stride, m_tmpCoeff, log2TrSize);
>      }
>
>      if (m_useRDOQ && curUseRDOQ)
>      {
> -        return xRateDistOptQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos);
> +        return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
>      }
> -    return xQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos);
> +    return xQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
>  }
>
> -void TComTrQuant::invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip, int lastPos)
> +void TComTrQuant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
>  {
>      if (transQuantBypass)
>      {
> -        for (uint32_t k = 0; k < trSize; k++)
> +        int trSize = 1 << log2TrSize;
> +        for (int k = 0; k < trSize; k++)
>          {
> -            for (uint32_t j = 0; j < trSize; j++)
> +            for (int j = 0; j < trSize; j++)
>              {
>                  residual[k * stride + j] = (int16_t)(coeff[k * trSize + j]);
>              }
> @@ -377,37 +375,34 @@
>      // Values need to pass as input parameter in dequant
>      int per = m_qpParam.m_per;
>      int rem = m_qpParam.m_rem;
> -    bool useScalingList = getUseScalingList();
> -    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
>      int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>      int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
> -    int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
> +    int numCoeff = 1 << log2TrSize * 2;
>
> -    if (!useScalingList)
> +    if (!getUseScalingList())
>      {
>          static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
>          int scale = invQuantScales[rem] << per;
> -        primitives.dequant_normal(coeff, m_tmpCoeff, trSize * trSize, scale, shift);
> +        primitives.dequant_normal(coeff, m_tmpCoeff, numCoeff, scale, shift);
>      }
>      else
>      {
>          // CHECK_ME: the code is not verify since this is DEAD path
> -        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, trSize * trSize, per, shift);
> +        int scalingListType = (!bIntra ? 3 : 0) + ttype;
> +        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> +        int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
> +        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, numCoeff, per, shift);
>      }
>
> -    if (useTransformSkip == true)
> +    if (!useTransformSkip)
>      {
> -        xITransformSkip(m_tmpCoeff, residual, stride, trSize);
> -    }
> -    else
> -    {
> -        // CHECK_ME: we can't here when no any coeff
> -        X265_CHECK(lastPos >= 0, "lastPos negative\n");
> +        const uint32_t sizeIdx = log2TrSize - 2;
> +        int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && bIntra);
>
> -        const uint32_t log2BlockSize = log2TrSize - 2;
> +        X265_CHECK(numSig == primitives.count_nonzero(coeff, 1 << log2TrSize * 2), "numSig differ\n");
>
>          // DC only
> -        if (lastPos == 0 && !((trSize == 4) && (mode != REG_DCT)))
> +        if (numSig == 1 && coeff[0] != 0 && !useDST)
>          {
>              const int shift_1st = 7;
>              const int add_1st = 1 << (shift_1st - 1);
> @@ -415,13 +410,17 @@
>              const int add_2nd = 1 << (shift_2nd - 1);
>
>              int dc_val = (((m_tmpCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
> -            primitives.blockfill_s[log2BlockSize](residual, stride, dc_val);
> +            primitives.blockfill_s[sizeIdx](residual, stride, dc_val);
>
>              return;
>          }
>
>          // TODO: this may need larger data types for X265_DEPTH > 8
> -        primitives.idct[IDCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](m_tmpCoeff, residual, stride);
> +        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_tmpCoeff, residual, stride);
> +    }
> +    else
> +    {
> +        xITransformSkip(m_tmpCoeff, residual, stride, log2TrSize);
>      }
>  }
>
> @@ -435,12 +434,10 @@
>   *  \param stride stride of input residual data
>   *  \param size transform size (size x size)
>   */
> -void TComTrQuant::xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, int trSize)
> +void TComTrQuant::xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, uint32_t log2TrSize)
>  {
> -    uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> -    int  shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> -    uint32_t transformSkipShift;
> -    int  j, k;
> +    int trSize = 1 << log2TrSize;
> +    int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>
>      if (shift >= 0)
>      {
> @@ -448,15 +445,14 @@
>      }
>      else
>      {
> -        //The case when X265_DEPTH > 13
> -        int offset;
> -        transformSkipShift = -shift;
> -        offset = (1 << (transformSkipShift - 1));
> -        for (j = 0; j < trSize; j++)
> +        // The case when X265_DEPTH > 13
> +        shift = -shift;
> +        int offset = (1 << (shift - 1));
> +        for (int j = 0; j < trSize; j++)
>          {
> -            for (k = 0; k < trSize; k++)
> +            for (int k = 0; k < trSize; k++)
>              {
> -                coeff[j * trSize + k] = (resiBlock[j * stride + k] + offset) >> transformSkipShift;
> +                coeff[j * trSize + k] = (resiBlock[j * stride + k] + offset) >> shift;
>              }
>          }
>      }
> @@ -468,11 +464,10 @@
>   *  \param stride stride of input residual data
>   *  \param size transform size (size x size)
>   */
> -void TComTrQuant::xITransformSkip(int32_t* coef, int16_t* residual, uint32_t stride, int trSize)
> +void TComTrQuant::xITransformSkip(int32_t* coef, int16_t* residual, uint32_t stride, uint32_t log2TrSize)
>  {
> -    uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> -    int  shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> -    int  j, k;
> +    int trSize = 1 << log2TrSize;
> +    int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>
>      if (shift > 0)
>      {
> @@ -480,13 +475,13 @@
>      }
>      else
>      {
> -        //The case when X265_DEPTH >= 13
> -        uint32_t transformSkipShift = -shift;
> -        for (j = 0; j < trSize; j++)
> +        // The case when X265_DEPTH >= 13
> +        shift = -shift;
> +        for (int j = 0; j < trSize; j++)
>          {
> -            for (k = 0; k < trSize; k++)
> +            for (int k = 0; k < trSize; k++)
>              {
> -                residual[j * stride + k] =  coef[j * trSize + k] << transformSkipShift;
> +                residual[j * stride + k] = coef[j * trSize + k] << shift;
>              }
>          }
>      }
> @@ -501,14 +496,14 @@
>   * \param uiAbsSum reference to absolute sum of quantized transform coefficient
>   * \param ttype plane type / luminance or chrominance
>   * \param absPartIdx absolute partition index
> - * \returns void
> + * \returns number of significant coefficient
>   * Rate distortion optimized quantization for entropy
>   * coding engines using probability models like CABAC
>   */
> -uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize,
> -                                        TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
> +uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize,
> +                                        TextType ttype, uint32_t absPartIdx)
>  {
> -    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> +    uint32_t trSize = 1 << log2TrSize;
>      int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
>      int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
>
> @@ -567,7 +562,7 @@
>          const uint32_t cgPosX   = cgBlkPos - (cgPosY << codingParameters.log2TrSizeCG);
>          const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
>          memset(&rdStats, 0, sizeof(coeffGroupRDStats));
> -        X265_CHECK((trSize >> 2) == (1 << codingParameters.log2TrSizeCG), "transform size invalid\n");
> +        X265_CHECK(log2TrSize - 2  == codingParameters.log2TrSizeCG, "transform size invalid\n");
>          const int patternSigCtx = TComTrQuant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG);
>          for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
>          {
> @@ -845,14 +840,12 @@
>          } // end if (sigCoeffGroupFlag[ cgBlkPos ])
>      } // end for
>
> -    uint32_t absSum = 0;
> +    numSig = 0;
>      for (int pos = 0; pos < bestLastIdxp1; pos++)
>      {
>          int blkPos = codingParameters.scan[pos];
>          int level  = dstCoeff[blkPos];
> -        absSum += level;
> -        if (level)
> -            *lastPos = blkPos;
> +        numSig += (level != 0);
>          uint32_t mask = (int32_t)srcCoeff[blkPos] >> 31;
>          dstCoeff[blkPos] = (level ^ mask) - mask;
>      }
> @@ -863,7 +856,7 @@
>          dstCoeff[codingParameters.scan[pos]] = 0;
>      }
>
> -    if (cu->getSlice()->getPPS()->getSignHideFlag() && absSum >= 2)
> +    if (cu->getSlice()->getPPS()->getSignHideFlag() && numSig >= 2)
>      {
>          int64_t rdFactor = (int64_t)(
>                  g_invQuantScales[m_qpParam.rem()] * g_invQuantScales[m_qpParam.rem()] * (1 << (2 * m_qpParam.m_per))
> @@ -901,14 +894,14 @@
>              if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
>              {
>                  uint32_t signbit = (dstCoeff[codingParameters.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1);
> -                int tmpSum = 0;
> +                int absSum = 0;
>
>                  for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
>                  {
> -                    tmpSum += dstCoeff[codingParameters.scan[n + subPos]];
> +                    absSum += dstCoeff[codingParameters.scan[n + subPos]];
>                  }
>
> -                if (signbit != (tmpSum & 0x1)) // hide but need tune
> +                if (signbit != (absSum & 0x1)) // hide but need tune
>                  {
>                      // calculate the cost
>                      int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
> @@ -974,6 +967,11 @@
>                          finalChange = -1;
>                      }
>
> +                    if (dstCoeff[minPos] == 0)
> +                        numSig++;
> +                    else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
> +                        numSig--;
> +
>                      if (srcCoeff[minPos] >= 0)
>                      {
>                          dstCoeff[minPos] += finalChange;
> @@ -988,7 +986,7 @@
>          }
>      }
>
> -    return absSum;
> +    return numSig;
>  }
>
>  /** Pattern decision for context derivation process of significant_coeff_flag
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComTrQuant.h
> --- a/source/Lib/TLibCommon/TComTrQuant.h       Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComTrQuant.h       Mon Jul 07 17:00:26 2014 +0900
> @@ -127,10 +127,10 @@
>      void init(bool useRDOQ);
>
>      // transform & inverse transform functions
> -    uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize,
> -                          TextType ttype, uint32_t absPartIdx, int32_t* lastPos, bool useTransformSkip = false, bool curUseRDOQ = true);
> +    uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize,
> +                          TextType ttype, uint32_t absPartIdx, bool useTransformSkip = false, bool curUseRDOQ = true);
>
> -    void invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip = false, int lastPos = MAX_INT);
> +    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
>
>      // Misc functions
>      void setQPforQuant(int qpy, TextType ttype, int qpBdOffset, int chromaQPOffset, int chFmt);
> @@ -219,12 +219,13 @@
>
>  private:
>
> -    void xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, int trSize);
> -    void signBitHidingHDQ(coeff_t* qcoeff, coeff_t* coeff, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters);
> -    uint32_t xQuant(TComDataCU* cu, int32_t* src, coeff_t* dst, int trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
> +    void xITransformSkip(int32_t* coeff, int16_t* residual, uint32_t stride, uint32_t log2TrSize);
> +    void xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, uint32_t log2TrSize);
> +    uint32_t signBitHidingHDQ(coeff_t* qcoeff, coeff_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
> +    uint32_t xQuant(TComDataCU* cu, int32_t* src, coeff_t* dst, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
>
>      // RDOQ functions
> -    uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
> +    uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
>
>      inline uint32_t xGetCodedLevel(double& codedCost, const double curCostSig, double& codedCostSig, int levelDouble,
>                                     uint32_t maxAbsLevel, uint32_t baseLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice,
> @@ -243,8 +244,6 @@
>      inline double xGetICost(double rate) const { return m_lambda * rate; } ///< Get the cost for a specific rate
>
>      inline uint32_t xGetIEPRate() const        { return 32768; }           ///< Get the cost of an equal probable bit
> -
> -    void xITransformSkip(int32_t* coeff, int16_t* residual, uint32_t stride, int trSize);
>  };
>  }
>  //! \}
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibEncoder/TEncSbac.cpp
> --- a/source/Lib/TLibEncoder/TEncSbac.cpp       Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibEncoder/TEncSbac.cpp       Mon Jul 07 17:00:26 2014 +0900
> @@ -2046,7 +2046,7 @@
>      DTRACE_CABAC_T("\n")
>  #endif // if ENC_DEC_TRACE
>
> -    X265_CHECK(trSize <= m_slice->getSPS()->getMaxTrSize(), "transform size out of range\n");
> +    X265_CHECK(log2TrSize <= m_slice->getSPS()->getQuadtreeTULog2MaxSize(), "transform size out of range\n");
>
>      // compute number of significant coefficients
>      uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1)));
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jul 07 17:00:26 2014 +0900
> @@ -452,22 +452,17 @@
>      }
>
>      //--- transform and quantization ---
> -    uint32_t absSum;
> -    int lastPos = -1;
> -
>      int chFmt = cu->getChromaFormat();
>      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
> +    uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTransformSkip);
>
>      //--- set coded block flag ---
> -    cbf = absSum ? 1 : 0;
> -
> -    if (absSum)
> +    cbf = numSig ? 1 : 0;
> +
> +    if (numSig)
>      {
>          //--- inverse transform ---
> -        int scalingListType = 0 + TEXT_LUMA;
> -        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> -        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
> +        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
>          X265_CHECK(log2TrSize <= 5, "log2TrSize is too large %d\n", log2TrSize);
>          //===== reconstruction =====
>          primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> @@ -528,9 +523,6 @@
>      }
>
>      //--- transform and quantization ---
> -    uint32_t absSum;
> -    int lastPos = -1;
> -
>      int chFmt = cu->getChromaFormat();
>      int curChromaQpOffset;
>      if (ttype == TEXT_CHROMA_U)
> @@ -542,18 +534,16 @@
>          curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>      }
>      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipC);
> +    uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdx, useTransformSkipC);
>
>      //--- set coded block flag ---
> -    cbf = absSum ? 1 : 0;
> +    cbf = numSig ? 1 : 0;
>
>      uint32_t dist;
> -    if (absSum)
> +    if (numSig)
>      {
>          //--- inverse transform ---
> -        int scalingListType = 0 + ttype;
> -        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> -        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
> +        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSizeC, ttype, true, useTransformSkipC, numSig);
>          X265_CHECK(log2TrSizeC <= 5, "log2TrSizeC is too large %d\n", log2TrSizeC);
>          //===== reconstruction =====
>          primitives.calcrecon[sizeIdxC](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> @@ -926,23 +916,18 @@
>          primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
>          //===== transform and quantization =====
> -        uint32_t absSum = 0;
> -        int lastPos = -1;
> -
>          m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -        absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
> +        uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTransformSkip);
>
>          //--- set coded block flag ---
> -        cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
> +        cu->setCbfSubParts((numSig ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
>          int part = partitionFromSize(tuSize);
>
> -        if (absSum)
> +        if (numSig)
>          {
>              //--- inverse transform ---
> -            int scalingListType = 0 + TEXT_LUMA;
> -            X265_CHECK(scalingListType < 6, "scalingListType %d\n", scalingListType);
> -            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
> +            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
>
>              // Generate Recon
>              primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
> @@ -1432,9 +1417,6 @@
>                  primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
>
>                  //--- transform and quantization ---
> -                uint32_t absSum = 0;
> -                int lastPos = -1;
> -
>                  int curChromaQpOffset;
>                  if (ttype == TEXT_CHROMA_U)
>                  {
> @@ -1445,17 +1427,15 @@
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  }
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdxC, &lastPos, useTransformSkipC);
> +                uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTransformSkipC);
>
>                  //--- set coded block flag ---
> -                cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
> -
> -                if (absSum)
> +                cu->setCbfPartRange((((numSig > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
> +
> +                if (numSig)
>                  {
>                      //--- inverse transform ---
> -                    int scalingListType = 0 + ttype;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), residual, stride, coeff, log2TrSizeC, ttype, true, useTransformSkipC, numSig);
>
>                      //===== reconstruction =====
>                      // use square primitives
> @@ -2692,8 +2672,6 @@
>      X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
>      // code full block
> -    uint32_t absSumY = 0, absSumU = 0, absSumV = 0;
> -    int lastPosY = -1, lastPosU = -1, lastPosV = -1;
>      if (bCheckFull)
>      {
>          uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> @@ -2716,8 +2694,6 @@
>          coeff_t *coeffCurU = cu->getCoeffCb() + coeffOffsetC;
>          coeff_t *coeffCurV = cu->getCoeffCr() + coeffOffsetC;
>
> -        uint32_t trSize   = 1 << log2TrSize;
> -        uint32_t trSizeC  = 1 << log2TrSizeC;
>          uint32_t sizeIdx  = log2TrSize  - 2;
>          uint32_t sizeIdxC = log2TrSizeC - 2;
>          cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
> @@ -2729,24 +2705,20 @@
>          const uint32_t strideResiC = resiYuv->m_cwidth;
>
>          m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -        absSumY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
> -                                          trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
> -
> -        cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> -
> -        if (absSumY)
> +        uint32_t numSigY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
> +                                                   log2TrSize, TEXT_LUMA, absPartIdx, false, curuseRDOQ);
> +
> +        cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +
> +        if (numSigY)
>          {
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -
> -            int scalingListType = 3 + TEXT_LUMA;
> -            X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
> +            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
>          }
>          else
>          {
>              primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
>          }
> -        cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
>          if (bCodeChroma)
>          {
> @@ -2766,45 +2738,37 @@
>
>                  int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSumU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset,
> -                                                  trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
> +                uint32_t numSigU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset,
> +                                                           log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, curuseRDOQ);
>
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSumV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset,
> -                                                  trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
> -
> -                cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> -
> -                if (absSumU)
> +                uint32_t numSigV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset,
> +                                                           log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, curuseRDOQ);
> +
> +                cu->setCbfPartRange(numSigU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> +
> +                if (numSigU)
>                  {
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -                    int scalingListType = 3 + TEXT_CHROMA_U;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUOffset, trSizeC, scalingListType, false, lastPosU);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
>                  }
>                  else
>                  {
>                      primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
>                  }
> -                if (absSumV)
> +                if (numSigV)
>                  {
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -                    int scalingListType = 3 + TEXT_CHROMA_V;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUOffset, trSizeC, scalingListType, false, lastPosV);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
>                  }
>                  else
>                  {
>                      primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
>                  }
> -                cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>              }
>              while (isNextSection(&tuIterator));
>
> @@ -2894,9 +2858,8 @@
>      uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
>      uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
>      uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> -    uint32_t absSum[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> +    uint32_t numSigY = 0;
>      uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> -    int      lastPos[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
>      uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/];
>
>      uint32_t bestCBF[MAX_NUM_COMPONENT];
> @@ -2910,6 +2873,8 @@
>      // code full block
>      if (bCheckFull)
>      {
> +        uint32_t numSigU[2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { 0, 0 };
> +        uint32_t numSigV[2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { 0, 0 };
>          uint32_t trSizeC = 1 << log2TrSizeC;
>          int sizeIdx  = log2TrSize - 2;
>          int sizeIdxC = log2TrSizeC - 2;
> @@ -2933,14 +2898,14 @@
>          }
>
>          m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -        absSum[TEXT_LUMA][0] = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> -                                                       trSize, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
> -
> -        cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +        numSigY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> +                                          log2TrSize, TEXT_LUMA, absPartIdx, false, curuseRDOQ);
> +
> +        cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
>          m_entropyCoder->resetBits();
>          m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
> -        if (absSum[TEXT_LUMA][0])
> +        if (numSigY)
>              m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
>          singleBitsComp[TEXT_LUMA][0] = m_entropyCoder->getNumberOfWrittenBits();
>
> @@ -2966,24 +2931,24 @@
>                  //Cb transform
>                  int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSum[TEXT_CHROMA_U][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUOffset,
> -                                                                                      trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
> +                numSigU[tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUOffset,
> +                                                                        log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, curuseRDOQ);
>                  //Cr transform
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSum[TEXT_CHROMA_V][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUOffset,
> -                                                                                      trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
> -
> -                cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                numSigV[tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUOffset,
> +                                                                        log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, curuseRDOQ);
> +
> +                cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                  m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_U, trMode);
> -                if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
> +                if (numSigU[tuIterator.m_section])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUOffset, absPartIdxC, log2TrSizeC, TEXT_CHROMA_U);
>                  singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsPrev;
>
>                  m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_V, trMode);
> -                if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
> +                if (numSigV[tuIterator.m_section])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUOffset, absPartIdxC, log2TrSizeC, TEXT_CHROMA_V);
>                  uint32_t newBits = m_entropyCoder->getNumberOfWrittenBits();
>                  singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = newBits - (singleBitsPrev + singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
> @@ -3021,13 +2986,10 @@
>          {
>              *outZeroDist += distY;
>          }
> -        if (absSum[TEXT_LUMA][0])
> +        if (numSigY)
>          {
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -
> -            int scalingListType = 3 + TEXT_LUMA;
> -            X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
> +            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY); //this is for inter mode only
>
>              const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, curResiY, strideResiY);
>              uint32_t nonZeroPsyEnergyY = 0;
> @@ -3066,7 +3028,7 @@
>                      nullCostY = m_rdCost->calcRdCost(distY, nullBitsY);
>                  if (nullCostY < singleCostY)
>                  {
> -                    absSum[TEXT_LUMA][0] = 0;
> +                    numSigY = 0;
>  #if CHECKED_BUILD || _DEBUG
>                      ::memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
>  #endif
> @@ -3099,11 +3061,11 @@
>
>          singleDistComp[TEXT_LUMA][0] = distY;
>          singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
> -        if (!absSum[TEXT_LUMA][0])
> +        if (!numSigY)
>          {
>              primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
>          }
> -        cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +        cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
>          uint32_t distU = 0;
>          uint32_t distV = 0;
> @@ -3130,19 +3092,17 @@
>                  {
>                      *outZeroDist += distU;
>                  }
> -                if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
> +                if (numSigU[tuIterator.m_section])
>                  {
>                      int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -                    int scalingListType = 3 + TEXT_CHROMA_U;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUOffset,
> -                                               trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiU, strideResiC, coeffCurU + subTUOffset,
> +                                               log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU[tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   curResiU, strideResiC);
>                      const uint32_t nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
> -                    uint32_t  nonZeroPsyEnergyU = 0;
> +                    uint32_t nonZeroPsyEnergyU = 0;
> +
>                      if (m_rdCost->psyRdEnabled())
>                      {
>                          pixel*   pred = predYuv->getCbAddr(absPartIdxC);
> @@ -3178,7 +3138,7 @@
>                              nullCostU = m_rdCost->calcRdCost(distU, nullBitsU);
>                          if (nullCostU < singleCostU)
>                          {
> -                            absSum[TEXT_CHROMA_U][tuIterator.m_section] = 0;
> +                            numSigU[tuIterator.m_section] = 0;
>  #if CHECKED_BUILD || _DEBUG
>                              ::memset(coeffCurU + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
>  #endif
> @@ -3212,7 +3172,7 @@
>                  singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = distU;
>                  singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = psyEnergyU;
>
> -                if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])
> +                if (!numSigU[tuIterator.m_section])
>                  {
>                      primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
>                  }
> @@ -3222,15 +3182,12 @@
>                  {
>                      *outZeroDist += distV;
>                  }
> -                if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
> +                if (numSigV[tuIterator.m_section])
>                  {
>                      int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -                    int scalingListType = 3 + TEXT_CHROMA_V;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUOffset,
> -                                               trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiV, strideResiC, coeffCurV + subTUOffset,
> +                                               log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV[tuIterator.m_section]);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   curResiV, strideResiC);
>                      const uint32_t nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
> @@ -3271,7 +3228,7 @@
>                              nullCostV = m_rdCost->calcRdCost(distV, nullBitsV);
>                          if (nullCostV < singleCostV)
>                          {
> -                            absSum[TEXT_CHROMA_V][tuIterator.m_section] = 0;
> +                            numSigV[tuIterator.m_section] = 0;
>  #if CHECKED_BUILD || _DEBUG
>                              ::memset(coeffCurV + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
>  #endif
> @@ -3305,21 +3262,20 @@
>                  singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = distV;
>                  singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = psyEnergyV;
>
> -                if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])
> +                if (!numSigV[tuIterator.m_section])
>                  {
>                      primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
>                  }
>
> -                cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>              }
>              while (isNextSection(&tuIterator));
>          }
>
> -        int lastPosTransformSkip[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
>          if (checkTransformSkipY)
>          {
> -            uint32_t nonZeroDistY = 0, absSumTransformSkipY;
> +            uint32_t nonZeroDistY = 0;
>              uint32_t nonZeroPsyEnergyY = 0;
>              uint64_t singleCostY = MAX_INT64;
>
> @@ -3336,11 +3292,11 @@
>              }
>
>              m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -            absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
> -                                                           trSize, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
> -            cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> -
> -            if (absSumTransformSkipY)
> +            uint32_t numSigTSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
> +                                                            log2TrSize, TEXT_LUMA, absPartIdx, true, curuseRDOQ);
> +            cu->setCbfSubParts(numSigTSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +
> +            if (numSigTSkipY)
>              {
>                  m_entropyCoder->resetBits();
>                  m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
> @@ -3348,11 +3304,7 @@
>                  const uint32_t skipSingleBitsY = m_entropyCoder->getNumberOfWrittenBits();
>
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -
> -                int scalingListType = 3 + TEXT_LUMA;
> -                X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -
> -                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, tsResiY, trSize, tsCoeffY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
> +                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
>
>                  nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
>                                                             tsResiY, trSize);
> @@ -3375,7 +3327,7 @@
>                      singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
>              }
>
> -            if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)
> +            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
>              {
>                  cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
>              }
> @@ -3383,18 +3335,18 @@
>              {
>                  singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
>                  singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
> -                absSum[TEXT_LUMA][0] = absSumTransformSkipY;
> +                numSigY = numSigTSkipY;
>                  bestTransformMode[TEXT_LUMA][0] = 1;
>                  memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
>                  primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, tsResiY, trSize);
>              }
>
> -            cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +            cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>          }
>
>          if (bCodeChroma && checkTransformSkipUV)
>          {
> -            uint32_t nonZeroDistU = 0, nonZeroDistV = 0, absSumTransformSkipU, absSumTransformSkipV;
> +            uint32_t nonZeroDistU = 0, nonZeroDistV = 0;
>              uint32_t nonZeroPsyEnergyU = 0, nonZeroPsyEnergyV = 0;
>              uint64_t singleCostU = MAX_INT64;
>              uint64_t singleCostV = MAX_INT64;
> @@ -3429,20 +3381,20 @@
>
>                  int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
> -                                                               trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
> +                uint32_t numSigTSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
> +                                                                log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, true, curuseRDOQ);
>                  curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                  m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -                absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
> -                                                               trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
> -
> -                cu->setCbfPartRange(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                cu->setCbfPartRange(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                uint32_t numSigTSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
> +                                                                log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, true, curuseRDOQ);
> +
> +                cu->setCbfPartRange(numSigTSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigTSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
>                  m_entropyCoder->resetBits();
>                  singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = 0;
>
> -                if (absSumTransformSkipU)
> +                if (numSigTSkipU)
>                  {
>                      m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_U, trMode);
>                      m_entropyCoder->encodeCoeffNxN(cu, tsCoeffU, absPartIdxC, log2TrSizeC, TEXT_CHROMA_U);
> @@ -3450,11 +3402,8 @@
>
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -                    int scalingListType = 3 + TEXT_CHROMA_U;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiU, trSizeC, tsCoeffU,
> -                                               trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), tsResiU, trSizeC, tsCoeffU,
> +                                               log2TrSizeC, TEXT_CHROMA_U, false, true, numSigTSkipU);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   tsResiU, trSizeC);
>                      nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
> @@ -3476,7 +3425,7 @@
>                          singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
>                  }
>
> -                if (!absSumTransformSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
> +                if (!numSigTSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
>                  {
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  }
> @@ -3484,13 +3433,13 @@
>                  {
>                      singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroDistU;
>                      singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroPsyEnergyU;
> -                    absSum[TEXT_CHROMA_U][tuIterator.m_section] = absSumTransformSkipU;
> +                    numSigU[tuIterator.m_section] = numSigTSkipU;
>                      bestTransformMode[TEXT_CHROMA_U][tuIterator.m_section] = 1;
>                      memcpy(coeffCurU + subTUOffset, tsCoeffU, sizeof(coeff_t) * numCoeffC);
>                      primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, tsResiU, trSizeC);
>                  }
>
> -                if (absSumTransformSkipV)
> +                if (numSigTSkipV)
>                  {
>                      m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_V, trMode);
>                      m_entropyCoder->encodeCoeffNxN(cu, tsCoeffV, absPartIdxC, log2TrSizeC, TEXT_CHROMA_V);
> @@ -3498,11 +3447,8 @@
>
>                      curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
>                      m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> -                    int scalingListType = 3 + TEXT_CHROMA_V;
> -                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiV, trSizeC, tsCoeffV,
> -                                               trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
> +                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), tsResiV, trSizeC, tsCoeffV,
> +                                               log2TrSizeC, TEXT_CHROMA_V, false, true, numSigTSkipV);
>                      uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
>                                                                   tsResiV, trSizeC);
>                      nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
> @@ -3524,7 +3470,7 @@
>                          singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
>                  }
>
> -                if (!absSumTransformSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
> +                if (!numSigTSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
>                  {
>                      cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>                  }
> @@ -3532,14 +3478,14 @@
>                  {
>                      singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroDistV;
>                      singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroPsyEnergyV;
> -                    absSum[TEXT_CHROMA_V][tuIterator.m_section] = absSumTransformSkipV;
> +                    numSigV[tuIterator.m_section] = numSigTSkipV;
>                      bestTransformMode[TEXT_CHROMA_V][tuIterator.m_section] = 1;
>                      memcpy(coeffCurV + subTUOffset, tsCoeffV, sizeof(coeff_t) * numCoeffC);
>                      primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, tsResiV, trSizeC);
>                  }
>
> -                cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> -                cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> +                cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>              }
>              while (isNextSection(&tuIterator));
>
> @@ -3568,16 +3514,16 @@
>          }
>
>          m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
> -        if (absSum[TEXT_LUMA][0])
> +        if (numSigY)
>              m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
>
>          if (bCodeChroma)
>          {
>              if (!splitIntoSubTUs)
>              {
> -                if (absSum[TEXT_CHROMA_U][0])
> +                if (numSigU[0])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
> -                if (absSum[TEXT_CHROMA_V][0])
> +                if (numSigV[0])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
>              }
>              else
> @@ -3585,13 +3531,13 @@
>                  uint32_t subTUSize = 1 << (log2TrSizeC * 2);
>                  uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
>
> -                if (absSum[TEXT_CHROMA_U][0])
> +                if (numSigU[0])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
> -                if (absSum[TEXT_CHROMA_U][1])
> +                if (numSigU[1])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
> -                if (absSum[TEXT_CHROMA_V][0])
> +                if (numSigV[0])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
> -                if (absSum[TEXT_CHROMA_V][1])
> +                if (numSigV[1])
>                      m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
>              }
>          }
> @@ -3734,7 +3680,7 @@
>      cu->m_psyEnergy = singlePsyEnergy;
>
>      cu->setTrIdxSubParts(trMode, absPartIdx, depth);
> -    cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +    cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
>      if (bCodeChroma)
>      {
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/dct.cpp
> --- a/source/common/dct.cpp     Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/dct.cpp     Mon Jul 07 17:00:26 2014 +0900
> @@ -773,10 +773,10 @@
>      }
>  }
>
> -uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff, int32_t* lastPos)
> +uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff)
>  {
>      int qBits8 = qBits - 8;
> -    uint32_t acSum = 0;
> +    uint32_t numSig = 0;
>
>      for (int blockpos = 0; blockpos < numCoeff; blockpos++)
>      {
> @@ -785,15 +785,14 @@
>
>          int tmplevel = abs(level) * quantCoeff[blockpos];
>          level = ((tmplevel + add) >> qBits);
> +        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
>          if (level)
> -            *lastPos = blockpos;
> -        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
> -        acSum += level;
> +            ++numSig;
>          level *= sign;
>          qCoef[blockpos] = Clip3(-32768, 32767, level);
>      }
>
> -    return acSum;
> +    return numSig;
>  }
>
>  uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff)
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/primitives.h
> --- a/source/common/primitives.h        Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/primitives.h        Mon Jul 07 17:00:26 2014 +0900
> @@ -146,7 +146,7 @@
>  typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
>  typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
>  typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> -typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
>  typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
>  typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h    Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/x86/pixel-util.h    Mon Jul 07 17:00:26 2014 +0900
> @@ -44,7 +44,7 @@
>  void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
>  void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
>
> -uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
>  uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
>  int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/x86/pixel-util8.asm Mon Jul 07 17:00:26 2014 +0900
> @@ -27,8 +27,6 @@
>
>  SECTION_RODATA 32
>
> -c_d_4:             dd 4, 4, 4, 4
> -c_d_1234:          dd 1, 2, 3, 4
>  %if BIT_DEPTH == 10
>  ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
>  ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
> @@ -864,42 +862,25 @@
>
>
>  ;-----------------------------------------------------------------------------
> -; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -%if ARCH_X86_64 == 1
> -cglobal quant, 5,6,11
> -  %define addVec    m8
> -  %define qbits     m9
> -  %define qbits8    m10
> -%else
> -cglobal quant, 5,6,8, 0-(3*mmsize)
> -  %define addVec    [rsp + 0 * mmsize]
> -  %define qbits     [rsp + 1 * mmsize]
> -  %define qbits8    [rsp + 2 * mmsize]
> -%endif
> +cglobal quant, 5,6,8
>
>      ; fill qbits
> -    movd        m0, r4d
> -    mova        qbits, m0
> +    movd        m4, r4d         ; m4 = qbits
>
>      ; fill qbits-8
>      sub         r4d, 8
> -    movd        m0, r4d
> -    mova        qbits8, m0
> +    movd        m6, r4d         ; m6 = qbits8
>
>      ; fill offset
> -    mov         r4d, r5m
> -    movd        m0, r4d
> -    pshufd      m0, m0, 0
> -    mova        addVec, m0
> +    movd        m5, r5m
> +    pshufd      m5, m5, 0       ; m5 = add
>
>      mov         r4d, r6m
>      shr         r4d, 3
> -    pxor        m7, m7          ; m7 = acSum4
> -    mova        m6, [c_d_1234]  ; m6 = last4
> -    pxor        m5, m5          ; m5 = count
> -    mova        m4, [c_d_4]     ; m4 = [4 4 4 4]
> +    pxor        m7, m7          ; m7 = numZero
>  .loop:
>      ; 4 coeff
>      movu        m0, [r0]        ; m0 = level
> @@ -908,19 +889,15 @@
>      movu        m2, [r1]        ; m2 = qcoeff
>      pabsd       m0, m0
>      pmulld      m0, m2          ; m0 = tmpLevel1
> -    paddd       m2, m0, addVec
> -    psrad       m2, qbits       ; m2 = level1
> -    paddd       m7, m2
> -    pslld       m3, m2, qbits
> +    paddd       m2, m0, m5
> +    psrad       m2, m4          ; m2 = level1
> +    pslld       m3, m2, m4
>      psubd       m0, m3
> -    psrad       m0, qbits8      ; m0 = deltaU1
> +    psrad       m0, m6          ; m0 = deltaU1
>      movu        [r2], m0
>      pxor        m0, m0
>      pcmpeqd     m0, m2          ; m0 = mask4
> -    pand        m5, m0
> -    pandn       m0, m6
> -    por         m5, m0
> -    paddd       m6, m4
> +    psubd       m7, m0
>
>      pxor        m2, m1
>      psubd       m2, m1
> @@ -934,19 +911,15 @@
>      movu        m2, [r1 + 16]   ; m2 = qcoeff
>      pabsd       m0, m0
>      pmulld      m0, m2          ; m0 = tmpLevel1
> -    paddd       m2, m0, addVec
> -    psrad       m2, qbits       ; m2 = level1
> -    paddd       m7, m2
> -    pslld       m3, m2, qbits
> +    paddd       m2, m0, m5
> +    psrad       m2, m4          ; m2 = level1
> +    pslld       m3, m2, m4
>      psubd       m0, m3
> -    psrad       m0, qbits8      ; m0 = deltaU1
> +    psrad       m0, m6          ; m0 = deltaU1
>      movu        [r2 + 16], m0
>      pxor        m0, m0
>      pcmpeqd     m0, m2          ; m0 = mask4
> -    pand        m5, m0
> -    pandn       m0, m6
> -    por         m5, m0
> -    paddd       m6, m4
> +    psubd       m7, m0
>
>      pxor        m2, m1
>      psubd       m2, m1
> @@ -962,18 +935,11 @@
>      dec         r4d
>      jnz        .loop
>
> -    movhlps     m4, m5
> -    pmaxud      m4, m5
> -    pshufd      m5, m4, 1
> -    pmaxud      m4, m5
> -
> -    mov         r4, r7m
> -    movd        [r4], m4
> -    dec         dword [r4]
> -
>      phaddd      m7, m7
>      phaddd      m7, m7
> -    movd        eax, m7
> +    mov         eax, r6m
> +    movd        r4d, m7
> +    sub         eax, r4d        ; numSig
>
>      RET
>
> @@ -985,11 +951,11 @@
>  cglobal nquant, 5,6,8
>
>      ; fill qbits
> -    movd        m5, r4d         ; m5 = qbits
> +    movd        m4, r4d         ; m4 = qbits
>
>      ; fill offset
> -    movd        m6, r5m
> -    pshufd      m6, m6, 0       ; m6 = add
> +    movd        m5, r5m
> +    pshufd      m5, m5, 0       ; m5 = add
>
>      mov         r4d, r6m
>      shr         r4d, 3
> @@ -1003,10 +969,11 @@
>      pabsd       m0, m0
>      pmulld      m0, m2          ; m0 = tmpLevel1
>      movu        [r2], m0        ; m0 = scaledCoeff
> -    paddd       m2, m0, m6
> -    psrad       m2, m5          ; m2 = level1
> -    pxor        m4, m4
> -    pcmpeqd     m4, m2          ; m4 = mask4
> +    paddd       m2, m0, m5
> +    psrad       m2, m4          ; m2 = level1
> +    pxor        m0, m0
> +    pcmpeqd     m0, m2          ; m0 = mask4
> +    psubd       m7, m0
>
>      pxor        m2, m1
>      psubd       m2, m1
> @@ -1021,10 +988,11 @@
>      pabsd       m0, m0
>      pmulld      m0, m2          ; m0 = tmpLevel1
>      movu        [r2 + 16], m0   ; m0 = scaledCoeff
> -    paddd       m2, m0, m6
> -    psrad       m2, m5          ; m2 = level1
> +    paddd       m2, m0, m5
> +    psrad       m2, m4          ; m2 = level1
>      pxor        m0, m0
>      pcmpeqd     m0, m2          ; m0 = mask4
> +    psubd       m7, m0
>
>      pxor        m2, m1
>      psubd       m2, m1
> @@ -1032,9 +1000,6 @@
>      pmovsxwd    m2, m2
>      movu        [r3 + 16], m2
>
> -    packssdw    m4, m0          ; m4 = mask8
> -    psubw       m7, m4          ; m7 = numZero
> -
>      add         r0, 32
>      add         r1, 32
>      add         r2, 32
> @@ -1043,11 +1008,10 @@
>      dec         r4d
>      jnz        .loop
>
> -    packuswb    m7, m7
> -    pxor        m0, m0
> -    psadbw      m0, m7
> +    phaddd      m7, m7
> +    phaddd      m7, m7
>      mov         eax, r6m
> -    movd        r4d, m0
> +    movd        r4d, m7
>      sub         eax, r4d        ; numSig
>
>      RET
> diff -r 11c808e562b8 -r dcf6f2ce907c source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp        Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/encoder/encoder.cpp        Mon Jul 07 17:00:26 2014 +0900
> @@ -1052,8 +1052,6 @@
>
>      sps->setTMVPFlagsPresent(false);
>
> -    sps->setMaxTrSize(1 << m_quadtreeTULog2MaxSize);
> -
>      for (uint32_t i = 0; i < g_maxCUDepth - g_addCUDepth; i++)
>      {
>          sps->setAMPAcc(i, m_param->bEnableAMP);
> diff -r 11c808e562b8 -r dcf6f2ce907c source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp      Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/test/mbdstharness.cpp      Mon Jul 07 17:00:26 2014 +0900
> @@ -300,13 +300,12 @@
>          int valueToAdd = rand() % (32 * 1024);
>          int cmp_size = sizeof(int) * height * width;
>          int numCoeff = height * width;
> -        int optLastPos = -1, refLastPos = -1;
>
>          int index1 = rand() % TEST_CASES;
>          int index2 = rand() % TEST_CASES;
>
> -        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff, &refLastPos);
> -        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff, &optLastPos);
> +        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
> +        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
>
>          if (memcmp(mintbuf3, mintbuf5, cmp_size))
>              return false;
> @@ -317,9 +316,6 @@
>          if (optReturnValue != refReturnValue)
>              return false;
>
> -        if (optLastPos != refLastPos)
> -            return false;
> -
>          reportfail();
>          j += 16;
>      }
> @@ -509,8 +505,7 @@
>      if (opt.quant)
>      {
>          printf("quant\t\t");
> -        int dummy = -1;
> -        REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
> +        REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
>      }
>
>      if (opt.nquant)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho