[x265] quant: returns numSig instead of absSum and lastPos

Satoshi Nakagawa nakagawa424 at oki.com
Mon Jul 7 10:04:03 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1404720026 -32400
#      Mon Jul 07 17:00:26 2014 +0900
# Node ID dcf6f2ce907c59eedc3d488a7f047a5f094bf925
# Parent  11c808e562b894d84961cf00080173321e272884
quant: returns numSig instead of absSum and lastPos

diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibCommon/CommonDef.h	Mon Jul 07 17:00:26 2014 +0900
@@ -118,8 +118,6 @@
 #define LOG2_MAX_COLUMN_WIDTH       13
 #define LOG2_MAX_ROW_HEIGHT         13
 
-#define REG_DCT                     65535
-
 #define CABAC_INIT_PRESENT_FLAG     1
 
 #define MAX_GOP                     64          ///< max. value of hierarchical GOP size
diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComSlice.cpp
--- a/source/Lib/TLibCommon/TComSlice.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibCommon/TComSlice.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -476,7 +476,6 @@
     , m_qpBDOffsetC(0)
     , m_bitsForPOC(8)
     , m_numLongTermRefPicSPS(0)
-    , m_maxTrSize(32)
     , m_bUseSAO(false)
     , m_bTemporalIdNestingFlag(false)
     , m_scalingListEnabledFlag(false)
diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComSlice.h
--- a/source/Lib/TLibCommon/TComSlice.h	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibCommon/TComSlice.h	Mon Jul 07 17:00:26 2014 +0900
@@ -825,9 +825,6 @@
     uint32_t    m_ltRefPicPocLsbSps[33];
     bool        m_usedByCurrPicLtSPSFlag[33];
 
-    // Max physical transform size
-    uint32_t    m_maxTrSize;
-
     int m_iAMPAcc[MAX_CU_DEPTH];
     bool        m_bUseSAO;
 
@@ -954,11 +951,6 @@
 
     void      setTMVPFlagsPresent(bool b)   { m_TMVPFlagsPresent = b; }
 
-    // physical transform
-    void setMaxTrSize(uint32_t u)   { m_maxTrSize = u; }
-
-    uint32_t getMaxTrSize() const   { return m_maxTrSize; }
-
     // AMP accuracy
     int       getAMPAcc(uint32_t depth) const { return m_iAMPAcc[depth]; }
 
diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -143,7 +143,7 @@
 }
 
 // To minimize the distortion only. No rate is considered.
-void TComTrQuant::signBitHidingHDQ(coeff_t* qCoef, coeff_t* coef, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters)
+uint32_t TComTrQuant::signBitHidingHDQ(coeff_t* qCoef, coeff_t* coef, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters)
 {
     const uint32_t log2TrSizeCG = codingParameters.log2TrSizeCG;
 
@@ -249,6 +249,11 @@
                     finalChange = -1;
                 }
 
+                if (qCoef[minPos] == 0)
+                    numSig++;
+                else if (finalChange == -1 && abs(qCoef[minPos]) == 1)
+                    numSig--;
+
                 if (coef[minPos] >= 0)
                 {
                     qCoef[minPos] += finalChange;
@@ -261,12 +266,13 @@
         }
         lastCG = 0;
     } // TU loop
+
+    return numSig;
 }
 
-uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, coeff_t* qCoef, int trSize,
-                             TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
+uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, coeff_t* qCoef, uint32_t log2TrSize,
+                             TextType ttype, uint32_t absPartIdx)
 {
-    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
     TUEntropyCodingParameters codingParameters;
     getTUEntropyCodingParameters(cu, codingParameters, absPartIdx, log2TrSize, ttype);
     int deltaU[32 * 32];
@@ -281,13 +287,13 @@
     int add = (cu->getSlice()->getSliceType() == I_SLICE ? 171 : 85) << (qbits - 9);
 
     int numCoeff = 1 << log2TrSize * 2;
-    uint32_t acSum = primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff, lastPos);
+    uint32_t numSig = primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff);
 
-    if (acSum >= 2 && cu->getSlice()->getPPS()->getSignHideFlag())
+    if (numSig >= 2 && cu->getSlice()->getPPS()->getSignHideFlag())
     {
-        signBitHidingHDQ(qCoef, coef, deltaU, codingParameters);
+        return signBitHidingHDQ(qCoef, coef, deltaU, numSig, codingParameters);
     }
-    return acSum;
+    return numSig;
 }
 
 void TComTrQuant::init(bool useRDOQ)
@@ -299,73 +305,65 @@
                                    int16_t*    residual,
                                    uint32_t    stride,
                                    coeff_t*    coeff,
-                                   uint32_t    trSize,
+                                   uint32_t    log2TrSize,
                                    TextType    ttype,
                                    uint32_t    absPartIdx,
-                                   int32_t*    lastPos,
                                    bool        useTransformSkip,
                                    bool        curUseRDOQ)
 {
     if (cu->getCUTransquantBypass(absPartIdx))
     {
-        uint32_t absSum = 0;
-        for (uint32_t k = 0; k < trSize; k++)
+        uint32_t numSig = 0;
+        int trSize = 1 << log2TrSize;
+        for (int k = 0; k < trSize; k++)
         {
-            for (uint32_t j = 0; j < trSize; j++)
+            for (int j = 0; j < trSize; j++)
             {
                 coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
-                absSum += abs(residual[k * stride + j]);
+                numSig += (residual[k * stride + j] != 0);
             }
         }
 
-        return absSum;
+        return numSig;
     }
 
-    uint32_t mode; //luma intra pred
-    if (ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA)
+    X265_CHECK((cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() >= log2TrSize), "transform size too large\n");
+    if (!useTransformSkip)
     {
-        mode = cu->getLumaIntraDir(absPartIdx);
+        // TODO: this may need larger data types for X265_DEPTH > 8
+        const uint32_t sizeIdx = log2TrSize - 2;
+        int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA);
+        int index = DCT_4x4 + sizeIdx - useDST;
+        primitives.dct[index](residual, m_tmpCoeff, stride);
+        if (m_nr->bNoiseReduction)
+        {
+            if (index > 0)
+            {
+                denoiseDct(m_tmpCoeff, m_nr->residualSum[sizeIdx], m_nr->offset[sizeIdx], (16 << sizeIdx * 2));
+                m_nr->count[sizeIdx]++;
+            }
+        }
     }
     else
     {
-        mode = REG_DCT;
-    }
-
-    X265_CHECK((cu->getSlice()->getSPS()->getMaxTrSize() >= trSize), "transform size too large\n");
-    if (useTransformSkip)
-    {
-        xTransformSkip(residual, stride, m_tmpCoeff, trSize);
-    }
-    else
-    {
-        // TODO: this may need larger data types for X265_DEPTH > 8
-        const uint32_t log2BlockSize = g_convertToBit[trSize];
-        primitives.dct[DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
-        if (m_nr->bNoiseReduction)
-        {
-            int index = (DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT)));
-            if (index > 0 && index < 5)
-            {
-                denoiseDct(m_tmpCoeff, m_nr->residualSum[index - 1], m_nr->offset[index - 1], (16 << (index - 1) * 2));
-                m_nr->count[index - 1]++;
-            }
-        }
+        xTransformSkip(residual, stride, m_tmpCoeff, log2TrSize);
     }
 
     if (m_useRDOQ && curUseRDOQ)
     {
-        return xRateDistOptQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos);
+        return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
     }
-    return xQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos);
+    return xQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
 }
 
-void TComTrQuant::invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip, int lastPos)
+void TComTrQuant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 {
     if (transQuantBypass)
     {
-        for (uint32_t k = 0; k < trSize; k++)
+        int trSize = 1 << log2TrSize;
+        for (int k = 0; k < trSize; k++)
         {
-            for (uint32_t j = 0; j < trSize; j++)
+            for (int j = 0; j < trSize; j++)
             {
                 residual[k * stride + j] = (int16_t)(coeff[k * trSize + j]);
             }
@@ -377,37 +375,34 @@
     // Values need to pass as input parameter in dequant
     int per = m_qpParam.m_per;
     int rem = m_qpParam.m_rem;
-    bool useScalingList = getUseScalingList();
-    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
     int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
-    int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
+    int numCoeff = 1 << log2TrSize * 2;
 
-    if (!useScalingList)
+    if (!getUseScalingList())
     {
         static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
         int scale = invQuantScales[rem] << per;
-        primitives.dequant_normal(coeff, m_tmpCoeff, trSize * trSize, scale, shift);
+        primitives.dequant_normal(coeff, m_tmpCoeff, numCoeff, scale, shift);
     }
     else
     {
         // CHECK_ME: the code is not verify since this is DEAD path
-        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, trSize * trSize, per, shift);
+        int scalingListType = (!bIntra ? 3 : 0) + ttype;
+        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
+        int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
+        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, numCoeff, per, shift);
     }
 
-    if (useTransformSkip == true)
+    if (!useTransformSkip)
     {
-        xITransformSkip(m_tmpCoeff, residual, stride, trSize);
-    }
-    else
-    {
-        // CHECK_ME: we can't here when no any coeff
-        X265_CHECK(lastPos >= 0, "lastPos negative\n");
+        const uint32_t sizeIdx = log2TrSize - 2;
+        int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && bIntra);
 
-        const uint32_t log2BlockSize = log2TrSize - 2;
+        X265_CHECK(numSig == primitives.count_nonzero(coeff, 1 << log2TrSize * 2), "numSig differ\n");
 
         // DC only
-        if (lastPos == 0 && !((trSize == 4) && (mode != REG_DCT)))
+        if (numSig == 1 && coeff[0] != 0 && !useDST)
         {
             const int shift_1st = 7;
             const int add_1st = 1 << (shift_1st - 1);
@@ -415,13 +410,17 @@
             const int add_2nd = 1 << (shift_2nd - 1);
 
             int dc_val = (((m_tmpCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
-            primitives.blockfill_s[log2BlockSize](residual, stride, dc_val);
+            primitives.blockfill_s[sizeIdx](residual, stride, dc_val);
 
             return;
         }
 
         // TODO: this may need larger data types for X265_DEPTH > 8
-        primitives.idct[IDCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](m_tmpCoeff, residual, stride);
+        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_tmpCoeff, residual, stride);
+    }
+    else
+    {
+        xITransformSkip(m_tmpCoeff, residual, stride, log2TrSize);
     }
 }
 
@@ -435,12 +434,10 @@
  *  \param stride stride of input residual data
  *  \param size transform size (size x size)
  */
-void TComTrQuant::xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, int trSize)
+void TComTrQuant::xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, uint32_t log2TrSize)
 {
-    uint32_t log2TrSize = g_convertToBit[trSize] + 2;
-    int  shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
-    uint32_t transformSkipShift;
-    int  j, k;
+    int trSize = 1 << log2TrSize;
+    int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
 
     if (shift >= 0)
     {
@@ -448,15 +445,14 @@
     }
     else
     {
-        //The case when X265_DEPTH > 13
-        int offset;
-        transformSkipShift = -shift;
-        offset = (1 << (transformSkipShift - 1));
-        for (j = 0; j < trSize; j++)
+        // The case when X265_DEPTH > 13
+        shift = -shift;
+        int offset = (1 << (shift - 1));
+        for (int j = 0; j < trSize; j++)
         {
-            for (k = 0; k < trSize; k++)
+            for (int k = 0; k < trSize; k++)
             {
-                coeff[j * trSize + k] = (resiBlock[j * stride + k] + offset) >> transformSkipShift;
+                coeff[j * trSize + k] = (resiBlock[j * stride + k] + offset) >> shift;
             }
         }
     }
@@ -468,11 +464,10 @@
  *  \param stride stride of input residual data
  *  \param size transform size (size x size)
  */
-void TComTrQuant::xITransformSkip(int32_t* coef, int16_t* residual, uint32_t stride, int trSize)
+void TComTrQuant::xITransformSkip(int32_t* coef, int16_t* residual, uint32_t stride, uint32_t log2TrSize)
 {
-    uint32_t log2TrSize = g_convertToBit[trSize] + 2;
-    int  shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
-    int  j, k;
+    int trSize = 1 << log2TrSize;
+    int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
 
     if (shift > 0)
     {
@@ -480,13 +475,13 @@
     }
     else
     {
-        //The case when X265_DEPTH >= 13
-        uint32_t transformSkipShift = -shift;
-        for (j = 0; j < trSize; j++)
+        // The case when X265_DEPTH >= 13
+        shift = -shift;
+        for (int j = 0; j < trSize; j++)
         {
-            for (k = 0; k < trSize; k++)
+            for (int k = 0; k < trSize; k++)
             {
-                residual[j * stride + k] =  coef[j * trSize + k] << transformSkipShift;
+                residual[j * stride + k] = coef[j * trSize + k] << shift;
             }
         }
     }
@@ -501,14 +496,14 @@
  * \param uiAbsSum reference to absolute sum of quantized transform coefficient
  * \param ttype plane type / luminance or chrominance
  * \param absPartIdx absolute partition index
- * \returns void
+ * \returns number of significant coefficients
  * Rate distortion optimized quantization for entropy
  * coding engines using probability models like CABAC
  */
-uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize,
-                                        TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
+uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize,
+                                        TextType ttype, uint32_t absPartIdx)
 {
-    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
+    uint32_t trSize = 1 << log2TrSize;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
     int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
 
@@ -567,7 +562,7 @@
         const uint32_t cgPosX   = cgBlkPos - (cgPosY << codingParameters.log2TrSizeCG);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
         memset(&rdStats, 0, sizeof(coeffGroupRDStats));
-        X265_CHECK((trSize >> 2) == (1 << codingParameters.log2TrSizeCG), "transform size invalid\n");
+        X265_CHECK(log2TrSize - 2  == codingParameters.log2TrSizeCG, "transform size invalid\n");
         const int patternSigCtx = TComTrQuant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG);
         for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
         {
@@ -845,14 +840,12 @@
         } // end if (sigCoeffGroupFlag[ cgBlkPos ])
     } // end for
 
-    uint32_t absSum = 0;
+    numSig = 0;
     for (int pos = 0; pos < bestLastIdxp1; pos++)
     {
         int blkPos = codingParameters.scan[pos];
         int level  = dstCoeff[blkPos];
-        absSum += level;
-        if (level)
-            *lastPos = blkPos;
+        numSig += (level != 0);
         uint32_t mask = (int32_t)srcCoeff[blkPos] >> 31;
         dstCoeff[blkPos] = (level ^ mask) - mask;
     }
@@ -863,7 +856,7 @@
         dstCoeff[codingParameters.scan[pos]] = 0;
     }
 
-    if (cu->getSlice()->getPPS()->getSignHideFlag() && absSum >= 2)
+    if (cu->getSlice()->getPPS()->getSignHideFlag() && numSig >= 2)
     {
         int64_t rdFactor = (int64_t)(
                 g_invQuantScales[m_qpParam.rem()] * g_invQuantScales[m_qpParam.rem()] * (1 << (2 * m_qpParam.m_per))
@@ -901,14 +894,14 @@
             if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
             {
                 uint32_t signbit = (dstCoeff[codingParameters.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1);
-                int tmpSum = 0;
+                int absSum = 0;
 
                 for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
                 {
-                    tmpSum += dstCoeff[codingParameters.scan[n + subPos]];
+                    absSum += dstCoeff[codingParameters.scan[n + subPos]];
                 }
 
-                if (signbit != (tmpSum & 0x1)) // hide but need tune
+                if (signbit != (absSum & 0x1)) // hide but need tune
                 {
                     // calculate the cost
                     int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
@@ -974,6 +967,11 @@
                         finalChange = -1;
                     }
 
+                    if (dstCoeff[minPos] == 0)
+                        numSig++;
+                    else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
+                        numSig--;
+
                     if (srcCoeff[minPos] >= 0)
                     {
                         dstCoeff[minPos] += finalChange;
@@ -988,7 +986,7 @@
         }
     }
 
-    return absSum;
+    return numSig;
 }
 
 /** Pattern decision for context derivation process of significant_coeff_flag
diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibCommon/TComTrQuant.h	Mon Jul 07 17:00:26 2014 +0900
@@ -127,10 +127,10 @@
     void init(bool useRDOQ);
 
     // transform & inverse transform functions
-    uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize,
-                          TextType ttype, uint32_t absPartIdx, int32_t* lastPos, bool useTransformSkip = false, bool curUseRDOQ = true);
+    uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize,
+                          TextType ttype, uint32_t absPartIdx, bool useTransformSkip = false, bool curUseRDOQ = true);
 
-    void invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip = false, int lastPos = MAX_INT);
+    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
 
     // Misc functions
     void setQPforQuant(int qpy, TextType ttype, int qpBdOffset, int chromaQPOffset, int chFmt);
@@ -219,12 +219,13 @@
 
 private:
 
-    void xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, int trSize);
-    void signBitHidingHDQ(coeff_t* qcoeff, coeff_t* coeff, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters);
-    uint32_t xQuant(TComDataCU* cu, int32_t* src, coeff_t* dst, int trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
+    void xITransformSkip(int32_t* coeff, int16_t* residual, uint32_t stride, uint32_t log2TrSize);
+    void xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, uint32_t log2TrSize);
+    uint32_t signBitHidingHDQ(coeff_t* qcoeff, coeff_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
+    uint32_t xQuant(TComDataCU* cu, int32_t* src, coeff_t* dst, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
 
     // RDOQ functions
-    uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
+    uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
 
     inline uint32_t xGetCodedLevel(double& codedCost, const double curCostSig, double& codedCostSig, int levelDouble,
                                    uint32_t maxAbsLevel, uint32_t baseLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice,
@@ -243,8 +244,6 @@
     inline double xGetICost(double rate) const { return m_lambda * rate; } ///< Get the cost for a specific rate
 
     inline uint32_t xGetIEPRate() const        { return 32768; }           ///< Get the cost of an equal probable bit
-
-    void xITransformSkip(int32_t* coeff, int16_t* residual, uint32_t stride, int trSize);
 };
 }
 //! \}
diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibEncoder/TEncSbac.cpp
--- a/source/Lib/TLibEncoder/TEncSbac.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibEncoder/TEncSbac.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -2046,7 +2046,7 @@
     DTRACE_CABAC_T("\n")
 #endif // if ENC_DEC_TRACE
 
-    X265_CHECK(trSize <= m_slice->getSPS()->getMaxTrSize(), "transform size out of range\n");
+    X265_CHECK(log2TrSize <= m_slice->getSPS()->getQuadtreeTULog2MaxSize(), "transform size out of range\n");
 
     // compute number of significant coefficients
     uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1)));
diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -452,22 +452,17 @@
     }
 
     //--- transform and quantization ---
-    uint32_t absSum;
-    int lastPos = -1;
-
     int chFmt = cu->getChromaFormat();
     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
+    uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTransformSkip);
 
     //--- set coded block flag ---
-    cbf = absSum ? 1 : 0;
-
-    if (absSum)
+    cbf = numSig ? 1 : 0;
+
+    if (numSig)
     {
         //--- inverse transform ---
-        int scalingListType = 0 + TEXT_LUMA;
-        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
-        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
+        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
         X265_CHECK(log2TrSize <= 5, "log2TrSize is too large %d\n", log2TrSize);
         //===== reconstruction =====
         primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
@@ -528,9 +523,6 @@
     }
 
     //--- transform and quantization ---
-    uint32_t absSum;
-    int lastPos = -1;
-
     int chFmt = cu->getChromaFormat();
     int curChromaQpOffset;
     if (ttype == TEXT_CHROMA_U)
@@ -542,18 +534,16 @@
         curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
     }
     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipC);
+    uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdx, useTransformSkipC);
 
     //--- set coded block flag ---
-    cbf = absSum ? 1 : 0;
+    cbf = numSig ? 1 : 0;
 
     uint32_t dist;
-    if (absSum)
+    if (numSig)
     {
         //--- inverse transform ---
-        int scalingListType = 0 + ttype;
-        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
-        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
+        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSizeC, ttype, true, useTransformSkipC, numSig);
         X265_CHECK(log2TrSizeC <= 5, "log2TrSizeC is too large %d\n", log2TrSizeC);
         //===== reconstruction =====
         primitives.calcrecon[sizeIdxC](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
@@ -926,23 +916,18 @@
         primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
 
         //===== transform and quantization =====
-        uint32_t absSum = 0;
-        int lastPos = -1;
-
         m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-        absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
+        uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTransformSkip);
 
         //--- set coded block flag ---
-        cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+        cu->setCbfSubParts((numSig ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
 
         int part = partitionFromSize(tuSize);
 
-        if (absSum)
+        if (numSig)
         {
             //--- inverse transform ---
-            int scalingListType = 0 + TEXT_LUMA;
-            X265_CHECK(scalingListType < 6, "scalingListType %d\n", scalingListType);
-            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
+            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
 
             // Generate Recon
             primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
@@ -1432,9 +1417,6 @@
                 primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
 
                 //--- transform and quantization ---
-                uint32_t absSum = 0;
-                int lastPos = -1;
-
                 int curChromaQpOffset;
                 if (ttype == TEXT_CHROMA_U)
                 {
@@ -1445,17 +1427,15 @@
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                 }
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdxC, &lastPos, useTransformSkipC);
+                uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTransformSkipC);
 
                 //--- set coded block flag ---
-                cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
-
-                if (absSum)
+                cu->setCbfPartRange((((numSig > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
+
+                if (numSig)
                 {
                     //--- inverse transform ---
-                    int scalingListType = 0 + ttype;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), residual, stride, coeff, log2TrSizeC, ttype, true, useTransformSkipC, numSig);
 
                     //===== reconstruction =====
                     // use square primitives
@@ -2692,8 +2672,6 @@
     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
 
     // code full block
-    uint32_t absSumY = 0, absSumU = 0, absSumV = 0;
-    int lastPosY = -1, lastPosU = -1, lastPosV = -1;
     if (bCheckFull)
     {
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
@@ -2716,8 +2694,6 @@
         coeff_t *coeffCurU = cu->getCoeffCb() + coeffOffsetC;
         coeff_t *coeffCurV = cu->getCoeffCr() + coeffOffsetC;
 
-        uint32_t trSize   = 1 << log2TrSize;
-        uint32_t trSizeC  = 1 << log2TrSizeC;
         uint32_t sizeIdx  = log2TrSize  - 2;
         uint32_t sizeIdxC = log2TrSizeC - 2;
         cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
@@ -2729,24 +2705,20 @@
         const uint32_t strideResiC = resiYuv->m_cwidth;
 
         m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-        absSumY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
-                                          trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
-
-        cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
-
-        if (absSumY)
+        uint32_t numSigY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
+                                                   log2TrSize, TEXT_LUMA, absPartIdx, false, curuseRDOQ);
+
+        cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
+
+        if (numSigY)
         {
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-
-            int scalingListType = 3 + TEXT_LUMA;
-            X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
+            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
         }
         else
         {
             primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
         }
-        cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
         if (bCodeChroma)
         {
@@ -2766,45 +2738,37 @@
 
                 int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSumU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset,
-                                                  trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
+                uint32_t numSigU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset,
+                                                           log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, curuseRDOQ);
 
                 curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSumV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset,
-                                                  trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
-
-                cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
-                cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
-
-                if (absSumU)
+                uint32_t numSigV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset,
+                                                           log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, curuseRDOQ);
+
+                cu->setCbfPartRange(numSigU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
+
+                if (numSigU)
                 {
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
-                    int scalingListType = 3 + TEXT_CHROMA_U;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUOffset, trSizeC, scalingListType, false, lastPosU);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
                 }
                 else
                 {
                     primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
                 }
-                if (absSumV)
+                if (numSigV)
                 {
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
-                    int scalingListType = 3 + TEXT_CHROMA_V;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUOffset, trSizeC, scalingListType, false, lastPosV);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
                 }
                 else
                 {
                     primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
                 }
-                cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
-                cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
             }
             while (isNextSection(&tuIterator));
 
@@ -2894,9 +2858,8 @@
     uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
     uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    uint32_t absSum[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+    uint32_t numSigY = 0;
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
-    int      lastPos[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/];
 
     uint32_t bestCBF[MAX_NUM_COMPONENT];
@@ -2910,6 +2873,8 @@
     // code full block
     if (bCheckFull)
     {
+        uint32_t numSigU[2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { 0, 0 };
+        uint32_t numSigV[2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { 0, 0 };
         uint32_t trSizeC = 1 << log2TrSizeC;
         int sizeIdx  = log2TrSize - 2;
         int sizeIdxC = log2TrSizeC - 2;
@@ -2933,14 +2898,14 @@
         }
 
         m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-        absSum[TEXT_LUMA][0] = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
-                                                       trSize, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
-
-        cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
+        numSigY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
+                                          log2TrSize, TEXT_LUMA, absPartIdx, false, curuseRDOQ);
+
+        cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
         m_entropyCoder->resetBits();
         m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
-        if (absSum[TEXT_LUMA][0])
+        if (numSigY)
             m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
         singleBitsComp[TEXT_LUMA][0] = m_entropyCoder->getNumberOfWrittenBits();
 
@@ -2966,24 +2931,24 @@
                 //Cb transform
                 int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSum[TEXT_CHROMA_U][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUOffset,
-                                                                                      trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
+                numSigU[tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUOffset,
+                                                                        log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, curuseRDOQ);
                 //Cr transform
                 curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSum[TEXT_CHROMA_V][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUOffset,
-                                                                                      trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
-
-                cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
-                cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
+                numSigV[tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUOffset,
+                                                                        log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, curuseRDOQ);
+
+                cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
 
                 m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_U, trMode);
-                if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
+                if (numSigU[tuIterator.m_section])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUOffset, absPartIdxC, log2TrSizeC, TEXT_CHROMA_U);
                 singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsPrev;
 
                 m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_V, trMode);
-                if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
+                if (numSigV[tuIterator.m_section])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUOffset, absPartIdxC, log2TrSizeC, TEXT_CHROMA_V);
                 uint32_t newBits = m_entropyCoder->getNumberOfWrittenBits();
                 singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = newBits - (singleBitsPrev + singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
@@ -3021,13 +2986,10 @@
         {
             *outZeroDist += distY;
         }
-        if (absSum[TEXT_LUMA][0])
+        if (numSigY)
         {
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-
-            int scalingListType = 3 + TEXT_LUMA;
-            X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
+            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY); //this is for inter mode only
 
             const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, curResiY, strideResiY);
             uint32_t nonZeroPsyEnergyY = 0;
@@ -3066,7 +3028,7 @@
                     nullCostY = m_rdCost->calcRdCost(distY, nullBitsY);
                 if (nullCostY < singleCostY)
                 {
-                    absSum[TEXT_LUMA][0] = 0;
+                    numSigY = 0;
 #if CHECKED_BUILD || _DEBUG
                     ::memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
 #endif
@@ -3099,11 +3061,11 @@
 
         singleDistComp[TEXT_LUMA][0] = distY;
         singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
-        if (!absSum[TEXT_LUMA][0])
+        if (!numSigY)
         {
             primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
         }
-        cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
+        cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
         uint32_t distU = 0;
         uint32_t distV = 0;
@@ -3130,19 +3092,17 @@
                 {
                     *outZeroDist += distU;
                 }
-                if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
+                if (numSigU[tuIterator.m_section])
                 {
                     int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
-                    int scalingListType = 3 + TEXT_CHROMA_U;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUOffset,
-                                               trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiU, strideResiC, coeffCurU + subTUOffset,
+                                               log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU[tuIterator.m_section]);
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
                                                                  curResiU, strideResiC);
                     const uint32_t nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
-                    uint32_t  nonZeroPsyEnergyU = 0;
+                    uint32_t nonZeroPsyEnergyU = 0;
+
                     if (m_rdCost->psyRdEnabled())
                     {
                         pixel*   pred = predYuv->getCbAddr(absPartIdxC);
@@ -3178,7 +3138,7 @@
                             nullCostU = m_rdCost->calcRdCost(distU, nullBitsU);
                         if (nullCostU < singleCostU)
                         {
-                            absSum[TEXT_CHROMA_U][tuIterator.m_section] = 0;
+                            numSigU[tuIterator.m_section] = 0;
 #if CHECKED_BUILD || _DEBUG
                             ::memset(coeffCurU + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
 #endif
@@ -3212,7 +3172,7 @@
                 singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = distU;
                 singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = psyEnergyU;
 
-                if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])
+                if (!numSigU[tuIterator.m_section])
                 {
                     primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
                 }
@@ -3222,15 +3182,12 @@
                 {
                     *outZeroDist += distV;
                 }
-                if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
+                if (numSigV[tuIterator.m_section])
                 {
                     int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
-                    int scalingListType = 3 + TEXT_CHROMA_V;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUOffset,
-                                               trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiV, strideResiC, coeffCurV + subTUOffset,
+                                               log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV[tuIterator.m_section]);
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
                                                                  curResiV, strideResiC);
                     const uint32_t nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
@@ -3271,7 +3228,7 @@
                             nullCostV = m_rdCost->calcRdCost(distV, nullBitsV);
                         if (nullCostV < singleCostV)
                         {
-                            absSum[TEXT_CHROMA_V][tuIterator.m_section] = 0;
+                            numSigV[tuIterator.m_section] = 0;
 #if CHECKED_BUILD || _DEBUG
                             ::memset(coeffCurV + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
 #endif
@@ -3305,21 +3262,20 @@
                 singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = distV;
                 singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = psyEnergyV;
 
-                if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])
+                if (!numSigV[tuIterator.m_section])
                 {
                     primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
                 }
 
-                cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
-                cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
             }
             while (isNextSection(&tuIterator));
         }
 
-        int lastPosTransformSkip[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
         if (checkTransformSkipY)
         {
-            uint32_t nonZeroDistY = 0, absSumTransformSkipY;
+            uint32_t nonZeroDistY = 0;
             uint32_t nonZeroPsyEnergyY = 0;
             uint64_t singleCostY = MAX_INT64;
 
@@ -3336,11 +3292,11 @@
             }
 
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-            absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
-                                                           trSize, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
-            cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
-
-            if (absSumTransformSkipY)
+            uint32_t numSigTSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
+                                                            log2TrSize, TEXT_LUMA, absPartIdx, true, curuseRDOQ);
+            cu->setCbfSubParts(numSigTSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
+
+            if (numSigTSkipY)
             {
                 m_entropyCoder->resetBits();
                 m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
@@ -3348,11 +3304,7 @@
                 const uint32_t skipSingleBitsY = m_entropyCoder->getNumberOfWrittenBits();
 
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-
-                int scalingListType = 3 + TEXT_LUMA;
-                X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-
-                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, tsResiY, trSize, tsCoeffY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
+                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
 
                 nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
                                                            tsResiY, trSize);
@@ -3375,7 +3327,7 @@
                     singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
             }
 
-            if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)
+            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
             {
                 cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
             }
@@ -3383,18 +3335,18 @@
             {
                 singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
                 singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
-                absSum[TEXT_LUMA][0] = absSumTransformSkipY;
+                numSigY = numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
                 memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
                 primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, tsResiY, trSize);
             }
 
-            cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
+            cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
         }
 
         if (bCodeChroma && checkTransformSkipUV)
         {
-            uint32_t nonZeroDistU = 0, nonZeroDistV = 0, absSumTransformSkipU, absSumTransformSkipV;
+            uint32_t nonZeroDistU = 0, nonZeroDistV = 0;
             uint32_t nonZeroPsyEnergyU = 0, nonZeroPsyEnergyV = 0;
             uint64_t singleCostU = MAX_INT64;
             uint64_t singleCostV = MAX_INT64;
@@ -3429,20 +3381,20 @@
 
                 int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
-                                                               trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
+                uint32_t numSigTSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
+                                                                log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, true, curuseRDOQ);
                 curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-                absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
-                                                               trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
-
-                cu->setCbfPartRange(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
-                cu->setCbfPartRange(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
+                uint32_t numSigTSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
+                                                                log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, true, curuseRDOQ);
+
+                cu->setCbfPartRange(numSigTSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigTSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
 
                 m_entropyCoder->resetBits();
                 singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = 0;
 
-                if (absSumTransformSkipU)
+                if (numSigTSkipU)
                 {
                     m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_U, trMode);
                     m_entropyCoder->encodeCoeffNxN(cu, tsCoeffU, absPartIdxC, log2TrSizeC, TEXT_CHROMA_U);
@@ -3450,11 +3402,8 @@
 
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
-                    int scalingListType = 3 + TEXT_CHROMA_U;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiU, trSizeC, tsCoeffU,
-                                               trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), tsResiU, trSizeC, tsCoeffU,
+                                               log2TrSizeC, TEXT_CHROMA_U, false, true, numSigTSkipU);
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
                                                                  tsResiU, trSizeC);
                     nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
@@ -3476,7 +3425,7 @@
                         singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
                 }
 
-                if (!absSumTransformSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
+                if (!numSigTSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
                 {
                     cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
                 }
@@ -3484,13 +3433,13 @@
                 {
                     singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroDistU;
                     singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroPsyEnergyU;
-                    absSum[TEXT_CHROMA_U][tuIterator.m_section] = absSumTransformSkipU;
+                    numSigU[tuIterator.m_section] = numSigTSkipU;
                     bestTransformMode[TEXT_CHROMA_U][tuIterator.m_section] = 1;
                     memcpy(coeffCurU + subTUOffset, tsCoeffU, sizeof(coeff_t) * numCoeffC);
                     primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, tsResiU, trSizeC);
                 }
 
-                if (absSumTransformSkipV)
+                if (numSigTSkipV)
                 {
                     m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_V, trMode);
                     m_entropyCoder->encodeCoeffNxN(cu, tsCoeffV, absPartIdxC, log2TrSizeC, TEXT_CHROMA_V);
@@ -3498,11 +3447,8 @@
 
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
-                    int scalingListType = 3 + TEXT_CHROMA_V;
-                    X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiV, trSizeC, tsCoeffV,
-                                               trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), tsResiV, trSizeC, tsCoeffV,
+                                               log2TrSizeC, TEXT_CHROMA_V, false, true, numSigTSkipV);
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
                                                                  tsResiV, trSizeC);
                     nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
@@ -3524,7 +3470,7 @@
                         singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
                 }
 
-                if (!absSumTransformSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
+                if (!numSigTSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
                 {
                     cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
                 }
@@ -3532,14 +3478,14 @@
                 {
                     singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroDistV;
                     singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroPsyEnergyV;
-                    absSum[TEXT_CHROMA_V][tuIterator.m_section] = absSumTransformSkipV;
+                    numSigV[tuIterator.m_section] = numSigTSkipV;
                     bestTransformMode[TEXT_CHROMA_V][tuIterator.m_section] = 1;
                     memcpy(coeffCurV + subTUOffset, tsCoeffV, sizeof(coeff_t) * numCoeffC);
                     primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, tsResiV, trSizeC);
                 }
 
-                cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
-                cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
+                cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
             }
             while (isNextSection(&tuIterator));
 
@@ -3568,16 +3514,16 @@
         }
 
         m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
-        if (absSum[TEXT_LUMA][0])
+        if (numSigY)
             m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
 
         if (bCodeChroma)
         {
             if (!splitIntoSubTUs)
             {
-                if (absSum[TEXT_CHROMA_U][0])
+                if (numSigU[0])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
-                if (absSum[TEXT_CHROMA_V][0])
+                if (numSigV[0])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
             }
             else
@@ -3585,13 +3531,13 @@
                 uint32_t subTUSize = 1 << (log2TrSizeC * 2);
                 uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
 
-                if (absSum[TEXT_CHROMA_U][0])
+                if (numSigU[0])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
-                if (absSum[TEXT_CHROMA_U][1])
+                if (numSigU[1])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
-                if (absSum[TEXT_CHROMA_V][0])
+                if (numSigV[0])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
-                if (absSum[TEXT_CHROMA_V][1])
+                if (numSigV[1])
                     m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
             }
         }
@@ -3734,7 +3680,7 @@
     cu->m_psyEnergy = singlePsyEnergy;
 
     cu->setTrIdxSubParts(trMode, absPartIdx, depth);
-    cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
+    cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
     if (bCodeChroma)
     {
diff -r 11c808e562b8 -r dcf6f2ce907c source/common/dct.cpp
--- a/source/common/dct.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/common/dct.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -773,10 +773,10 @@
     }
 }
 
-uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff, int32_t* lastPos)
+uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff)
 {
     int qBits8 = qBits - 8;
-    uint32_t acSum = 0;
+    uint32_t numSig = 0;
 
     for (int blockpos = 0; blockpos < numCoeff; blockpos++)
     {
@@ -785,15 +785,14 @@
 
         int tmplevel = abs(level) * quantCoeff[blockpos];
         level = ((tmplevel + add) >> qBits);
+        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
         if (level)
-            *lastPos = blockpos;
-        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
-        acSum += level;
+            ++numSig;
         level *= sign;
         qCoef[blockpos] = Clip3(-32768, 32767, level);
     }
 
-    return acSum;
+    return numSig;
 }
 
 uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff)
diff -r 11c808e562b8 -r dcf6f2ce907c source/common/primitives.h
--- a/source/common/primitives.h	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/common/primitives.h	Mon Jul 07 17:00:26 2014 +0900
@@ -146,7 +146,7 @@
 typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
 typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
diff -r 11c808e562b8 -r dcf6f2ce907c source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/common/x86/pixel-util.h	Mon Jul 07 17:00:26 2014 +0900
@@ -44,7 +44,7 @@
 void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
 
-uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
diff -r 11c808e562b8 -r dcf6f2ce907c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/common/x86/pixel-util8.asm	Mon Jul 07 17:00:26 2014 +0900
@@ -27,8 +27,6 @@
 
 SECTION_RODATA 32
 
-c_d_4:             dd 4, 4, 4, 4
-c_d_1234:          dd 1, 2, 3, 4
 %if BIT_DEPTH == 10
 ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
 ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
@@ -864,42 +862,25 @@
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal quant, 5,6,11
-  %define addVec    m8
-  %define qbits     m9
-  %define qbits8    m10
-%else
-cglobal quant, 5,6,8, 0-(3*mmsize)
-  %define addVec    [rsp + 0 * mmsize]
-  %define qbits     [rsp + 1 * mmsize]
-  %define qbits8    [rsp + 2 * mmsize]
-%endif
+cglobal quant, 5,6,8
 
     ; fill qbits
-    movd        m0, r4d
-    mova        qbits, m0
+    movd        m4, r4d         ; m4 = qbits
 
     ; fill qbits-8
     sub         r4d, 8
-    movd        m0, r4d
-    mova        qbits8, m0
+    movd        m6, r4d         ; m6 = qbits8
 
     ; fill offset
-    mov         r4d, r5m
-    movd        m0, r4d
-    pshufd      m0, m0, 0
-    mova        addVec, m0
+    movd        m5, r5m
+    pshufd      m5, m5, 0       ; m5 = add
 
     mov         r4d, r6m
     shr         r4d, 3
-    pxor        m7, m7          ; m7 = acSum4
-    mova        m6, [c_d_1234]  ; m6 = last4
-    pxor        m5, m5          ; m5 = count
-    mova        m4, [c_d_4]     ; m4 = [4 4 4 4]
+    pxor        m7, m7          ; m7 = numZero
 .loop:
     ; 4 coeff
     movu        m0, [r0]        ; m0 = level
@@ -908,19 +889,15 @@
     movu        m2, [r1]        ; m2 = qcoeff
     pabsd       m0, m0
     pmulld      m0, m2          ; m0 = tmpLevel1
-    paddd       m2, m0, addVec
-    psrad       m2, qbits       ; m2 = level1
-    paddd       m7, m2
-    pslld       m3, m2, qbits
+    paddd       m2, m0, m5
+    psrad       m2, m4          ; m2 = level1
+    pslld       m3, m2, m4
     psubd       m0, m3
-    psrad       m0, qbits8      ; m0 = deltaU1
+    psrad       m0, m6          ; m0 = deltaU1
     movu        [r2], m0
     pxor        m0, m0
     pcmpeqd     m0, m2          ; m0 = mask4
-    pand        m5, m0
-    pandn       m0, m6
-    por         m5, m0
-    paddd       m6, m4
+    psubd       m7, m0
 
     pxor        m2, m1
     psubd       m2, m1
@@ -934,19 +911,15 @@
     movu        m2, [r1 + 16]   ; m2 = qcoeff
     pabsd       m0, m0
     pmulld      m0, m2          ; m0 = tmpLevel1
-    paddd       m2, m0, addVec
-    psrad       m2, qbits       ; m2 = level1
-    paddd       m7, m2
-    pslld       m3, m2, qbits
+    paddd       m2, m0, m5
+    psrad       m2, m4          ; m2 = level1
+    pslld       m3, m2, m4
     psubd       m0, m3
-    psrad       m0, qbits8      ; m0 = deltaU1
+    psrad       m0, m6          ; m0 = deltaU1
     movu        [r2 + 16], m0
     pxor        m0, m0
     pcmpeqd     m0, m2          ; m0 = mask4
-    pand        m5, m0
-    pandn       m0, m6
-    por         m5, m0
-    paddd       m6, m4
+    psubd       m7, m0
 
     pxor        m2, m1
     psubd       m2, m1
@@ -962,18 +935,11 @@
     dec         r4d
     jnz        .loop
 
-    movhlps     m4, m5
-    pmaxud      m4, m5
-    pshufd      m5, m4, 1
-    pmaxud      m4, m5
-
-    mov         r4, r7m
-    movd        [r4], m4
-    dec         dword [r4]
-
     phaddd      m7, m7
     phaddd      m7, m7
-    movd        eax, m7
+    mov         eax, r6m
+    movd        r4d, m7
+    sub         eax, r4d        ; numSig
 
     RET
 
@@ -985,11 +951,11 @@
 cglobal nquant, 5,6,8
 
     ; fill qbits
-    movd        m5, r4d         ; m5 = qbits
+    movd        m4, r4d         ; m4 = qbits
 
     ; fill offset
-    movd        m6, r5m
-    pshufd      m6, m6, 0       ; m6 = add
+    movd        m5, r5m
+    pshufd      m5, m5, 0       ; m5 = add
 
     mov         r4d, r6m
     shr         r4d, 3
@@ -1003,10 +969,11 @@
     pabsd       m0, m0
     pmulld      m0, m2          ; m0 = tmpLevel1
     movu        [r2], m0        ; m0 = scaledCoeff
-    paddd       m2, m0, m6
-    psrad       m2, m5          ; m2 = level1
-    pxor        m4, m4
-    pcmpeqd     m4, m2          ; m4 = mask4
+    paddd       m2, m0, m5
+    psrad       m2, m4          ; m2 = level1
+    pxor        m0, m0
+    pcmpeqd     m0, m2          ; m0 = mask4
+    psubd       m7, m0
 
     pxor        m2, m1
     psubd       m2, m1
@@ -1021,10 +988,11 @@
     pabsd       m0, m0
     pmulld      m0, m2          ; m0 = tmpLevel1
     movu        [r2 + 16], m0   ; m0 = scaledCoeff
-    paddd       m2, m0, m6
-    psrad       m2, m5          ; m2 = level1
+    paddd       m2, m0, m5
+    psrad       m2, m4          ; m2 = level1
     pxor        m0, m0
     pcmpeqd     m0, m2          ; m0 = mask4
+    psubd       m7, m0
 
     pxor        m2, m1
     psubd       m2, m1
@@ -1032,9 +1000,6 @@
     pmovsxwd    m2, m2
     movu        [r3 + 16], m2
 
-    packssdw    m4, m0          ; m4 = mask8
-    psubw       m7, m4          ; m7 = numZero
-
     add         r0, 32
     add         r1, 32
     add         r2, 32
@@ -1043,11 +1008,10 @@
     dec         r4d
     jnz        .loop
 
-    packuswb    m7, m7
-    pxor        m0, m0
-    psadbw      m0, m7
+    phaddd      m7, m7
+    phaddd      m7, m7
     mov         eax, r6m
-    movd        r4d, m0
+    movd        r4d, m7
     sub         eax, r4d        ; numSig
 
     RET
diff -r 11c808e562b8 -r dcf6f2ce907c source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/encoder/encoder.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -1052,8 +1052,6 @@
 
     sps->setTMVPFlagsPresent(false);
 
-    sps->setMaxTrSize(1 << m_quadtreeTULog2MaxSize);
-
     for (uint32_t i = 0; i < g_maxCUDepth - g_addCUDepth; i++)
     {
         sps->setAMPAcc(i, m_param->bEnableAMP);
diff -r 11c808e562b8 -r dcf6f2ce907c source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Thu Jul 03 15:12:45 2014 -0700
+++ b/source/test/mbdstharness.cpp	Mon Jul 07 17:00:26 2014 +0900
@@ -300,13 +300,12 @@
         int valueToAdd = rand() % (32 * 1024);
         int cmp_size = sizeof(int) * height * width;
         int numCoeff = height * width;
-        int optLastPos = -1, refLastPos = -1;
 
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
 
-        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff, &refLastPos);
-        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff, &optLastPos);
+        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
+        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
 
         if (memcmp(mintbuf3, mintbuf5, cmp_size))
             return false;
@@ -317,9 +316,6 @@
         if (optReturnValue != refReturnValue)
             return false;
 
-        if (optLastPos != refLastPos)
-            return false;
-
         reportfail();
         j += 16;
     }
@@ -509,8 +505,7 @@
     if (opt.quant)
     {
         printf("quant\t\t");
-        int dummy = -1;
-        REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
+        REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
     }
 
     if (opt.nquant)


More information about the x265-devel mailing list