[x265] quant: returns numSig instead of absSum and lastPos
Steve Borho
steve at borho.org
Mon Jul 7 17:27:20 CEST 2014
On Mon, Jul 7, 2014 at 3:04 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1404720026 -32400
> # Mon Jul 07 17:00:26 2014 +0900
> # Node ID dcf6f2ce907c59eedc3d488a7f047a5f094bf925
> # Parent 11c808e562b894d84961cf00080173321e272884
> quant: returns numSig instead of absSum and lastPos
The whole patch is queued for testing, thanks.
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/CommonDef.h
> --- a/source/Lib/TLibCommon/CommonDef.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/CommonDef.h Mon Jul 07 17:00:26 2014 +0900
> @@ -118,8 +118,6 @@
> #define LOG2_MAX_COLUMN_WIDTH 13
> #define LOG2_MAX_ROW_HEIGHT 13
>
> -#define REG_DCT 65535
> -
> #define CABAC_INIT_PRESENT_FLAG 1
>
> #define MAX_GOP 64 ///< max. value of hierarchical GOP size
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComSlice.cpp
> --- a/source/Lib/TLibCommon/TComSlice.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComSlice.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -476,7 +476,6 @@
> , m_qpBDOffsetC(0)
> , m_bitsForPOC(8)
> , m_numLongTermRefPicSPS(0)
> - , m_maxTrSize(32)
> , m_bUseSAO(false)
> , m_bTemporalIdNestingFlag(false)
> , m_scalingListEnabledFlag(false)
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComSlice.h
> --- a/source/Lib/TLibCommon/TComSlice.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComSlice.h Mon Jul 07 17:00:26 2014 +0900
> @@ -825,9 +825,6 @@
> uint32_t m_ltRefPicPocLsbSps[33];
> bool m_usedByCurrPicLtSPSFlag[33];
>
> - // Max physical transform size
> - uint32_t m_maxTrSize;
> -
> int m_iAMPAcc[MAX_CU_DEPTH];
> bool m_bUseSAO;
>
> @@ -954,11 +951,6 @@
>
> void setTMVPFlagsPresent(bool b) { m_TMVPFlagsPresent = b; }
>
> - // physical transform
> - void setMaxTrSize(uint32_t u) { m_maxTrSize = u; }
> -
> - uint32_t getMaxTrSize() const { return m_maxTrSize; }
> -
> // AMP accuracy
> int getAMPAcc(uint32_t depth) const { return m_iAMPAcc[depth]; }
>
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComTrQuant.cpp
> --- a/source/Lib/TLibCommon/TComTrQuant.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComTrQuant.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -143,7 +143,7 @@
> }
>
> // To minimize the distortion only. No rate is considered.
> -void TComTrQuant::signBitHidingHDQ(coeff_t* qCoef, coeff_t* coef, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters)
> +uint32_t TComTrQuant::signBitHidingHDQ(coeff_t* qCoef, coeff_t* coef, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters)
> {
> const uint32_t log2TrSizeCG = codingParameters.log2TrSizeCG;
>
> @@ -249,6 +249,11 @@
> finalChange = -1;
> }
>
> + if (qCoef[minPos] == 0)
> + numSig++;
> + else if (finalChange == -1 && abs(qCoef[minPos]) == 1)
> + numSig--;
> +
> if (coef[minPos] >= 0)
> {
> qCoef[minPos] += finalChange;
> @@ -261,12 +266,13 @@
> }
> lastCG = 0;
> } // TU loop
> +
> + return numSig;
> }
>
> -uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, coeff_t* qCoef, int trSize,
> - TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
> +uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, coeff_t* qCoef, uint32_t log2TrSize,
> + TextType ttype, uint32_t absPartIdx)
> {
> - const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> TUEntropyCodingParameters codingParameters;
> getTUEntropyCodingParameters(cu, codingParameters, absPartIdx, log2TrSize, ttype);
> int deltaU[32 * 32];
> @@ -281,13 +287,13 @@
> int add = (cu->getSlice()->getSliceType() == I_SLICE ? 171 : 85) << (qbits - 9);
>
> int numCoeff = 1 << log2TrSize * 2;
> - uint32_t acSum = primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff, lastPos);
> + uint32_t numSig = primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff);
>
> - if (acSum >= 2 && cu->getSlice()->getPPS()->getSignHideFlag())
> + if (numSig >= 2 && cu->getSlice()->getPPS()->getSignHideFlag())
> {
> - signBitHidingHDQ(qCoef, coef, deltaU, codingParameters);
> + return signBitHidingHDQ(qCoef, coef, deltaU, numSig, codingParameters);
> }
> - return acSum;
> + return numSig;
> }
>
> void TComTrQuant::init(bool useRDOQ)
> @@ -299,73 +305,65 @@
> int16_t* residual,
> uint32_t stride,
> coeff_t* coeff,
> - uint32_t trSize,
> + uint32_t log2TrSize,
> TextType ttype,
> uint32_t absPartIdx,
> - int32_t* lastPos,
> bool useTransformSkip,
> bool curUseRDOQ)
> {
> if (cu->getCUTransquantBypass(absPartIdx))
> {
> - uint32_t absSum = 0;
> - for (uint32_t k = 0; k < trSize; k++)
> + uint32_t numSig = 0;
> + int trSize = 1 << log2TrSize;
> + for (int k = 0; k < trSize; k++)
> {
> - for (uint32_t j = 0; j < trSize; j++)
> + for (int j = 0; j < trSize; j++)
> {
> coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
> - absSum += abs(residual[k * stride + j]);
> + numSig += (residual[k * stride + j] != 0);
> }
> }
>
> - return absSum;
> + return numSig;
> }
>
> - uint32_t mode; //luma intra pred
> - if (ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA)
> + X265_CHECK((cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() >= log2TrSize), "transform size too large\n");
> + if (!useTransformSkip)
> {
> - mode = cu->getLumaIntraDir(absPartIdx);
> + // TODO: this may need larger data types for X265_DEPTH > 8
> + const uint32_t sizeIdx = log2TrSize - 2;
> + int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && cu->getPredictionMode(absPartIdx) == MODE_INTRA);
> + int index = DCT_4x4 + sizeIdx - useDST;
> + primitives.dct[index](residual, m_tmpCoeff, stride);
> + if (m_nr->bNoiseReduction)
> + {
> + if (index > 0)
> + {
> + denoiseDct(m_tmpCoeff, m_nr->residualSum[sizeIdx], m_nr->offset[sizeIdx], (16 << sizeIdx * 2));
> + m_nr->count[sizeIdx]++;
> + }
> + }
> }
> else
> {
> - mode = REG_DCT;
> - }
> -
> - X265_CHECK((cu->getSlice()->getSPS()->getMaxTrSize() >= trSize), "transform size too large\n");
> - if (useTransformSkip)
> - {
> - xTransformSkip(residual, stride, m_tmpCoeff, trSize);
> - }
> - else
> - {
> - // TODO: this may need larger data types for X265_DEPTH > 8
> - const uint32_t log2BlockSize = g_convertToBit[trSize];
> - primitives.dct[DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
> - if (m_nr->bNoiseReduction)
> - {
> - int index = (DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT)));
> - if (index > 0 && index < 5)
> - {
> - denoiseDct(m_tmpCoeff, m_nr->residualSum[index - 1], m_nr->offset[index - 1], (16 << (index - 1) * 2));
> - m_nr->count[index - 1]++;
> - }
> - }
> + xTransformSkip(residual, stride, m_tmpCoeff, log2TrSize);
> }
>
> if (m_useRDOQ && curUseRDOQ)
> {
> - return xRateDistOptQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos);
> + return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
> }
> - return xQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos);
> + return xQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
> }
>
> -void TComTrQuant::invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip, int lastPos)
> +void TComTrQuant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
> {
> if (transQuantBypass)
> {
> - for (uint32_t k = 0; k < trSize; k++)
> + int trSize = 1 << log2TrSize;
> + for (int k = 0; k < trSize; k++)
> {
> - for (uint32_t j = 0; j < trSize; j++)
> + for (int j = 0; j < trSize; j++)
> {
> residual[k * stride + j] = (int16_t)(coeff[k * trSize + j]);
> }
> @@ -377,37 +375,34 @@
> // Values need to pass as input parameter in dequant
> int per = m_qpParam.m_per;
> int rem = m_qpParam.m_rem;
> - bool useScalingList = getUseScalingList();
> - const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
> - int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
> + int numCoeff = 1 << log2TrSize * 2;
>
> - if (!useScalingList)
> + if (!getUseScalingList())
> {
> static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
> int scale = invQuantScales[rem] << per;
> - primitives.dequant_normal(coeff, m_tmpCoeff, trSize * trSize, scale, shift);
> + primitives.dequant_normal(coeff, m_tmpCoeff, numCoeff, scale, shift);
> }
> else
> {
> // CHECK_ME: the code is not verify since this is DEAD path
> - primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, trSize * trSize, per, shift);
> + int scalingListType = (!bIntra ? 3 : 0) + ttype;
> + X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> + int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
> + primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, numCoeff, per, shift);
> }
>
> - if (useTransformSkip == true)
> + if (!useTransformSkip)
> {
> - xITransformSkip(m_tmpCoeff, residual, stride, trSize);
> - }
> - else
> - {
> - // CHECK_ME: we can't here when no any coeff
> - X265_CHECK(lastPos >= 0, "lastPos negative\n");
> + const uint32_t sizeIdx = log2TrSize - 2;
> + int useDST = (sizeIdx == 0 && ttype == TEXT_LUMA && bIntra);
>
> - const uint32_t log2BlockSize = log2TrSize - 2;
> + X265_CHECK(numSig == primitives.count_nonzero(coeff, 1 << log2TrSize * 2), "numSig differ\n");
>
> // DC only
> - if (lastPos == 0 && !((trSize == 4) && (mode != REG_DCT)))
> + if (numSig == 1 && coeff[0] != 0 && !useDST)
> {
> const int shift_1st = 7;
> const int add_1st = 1 << (shift_1st - 1);
> @@ -415,13 +410,17 @@
> const int add_2nd = 1 << (shift_2nd - 1);
>
> int dc_val = (((m_tmpCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
> - primitives.blockfill_s[log2BlockSize](residual, stride, dc_val);
> + primitives.blockfill_s[sizeIdx](residual, stride, dc_val);
>
> return;
> }
>
> // TODO: this may need larger data types for X265_DEPTH > 8
> - primitives.idct[IDCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](m_tmpCoeff, residual, stride);
> + primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_tmpCoeff, residual, stride);
> + }
> + else
> + {
> + xITransformSkip(m_tmpCoeff, residual, stride, log2TrSize);
> }
> }
>
> @@ -435,12 +434,10 @@
> * \param stride stride of input residual data
> * \param size transform size (size x size)
> */
> -void TComTrQuant::xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, int trSize)
> +void TComTrQuant::xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, uint32_t log2TrSize)
> {
> - uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> - int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> - uint32_t transformSkipShift;
> - int j, k;
> + int trSize = 1 << log2TrSize;
> + int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>
> if (shift >= 0)
> {
> @@ -448,15 +445,14 @@
> }
> else
> {
> - //The case when X265_DEPTH > 13
> - int offset;
> - transformSkipShift = -shift;
> - offset = (1 << (transformSkipShift - 1));
> - for (j = 0; j < trSize; j++)
> + // The case when X265_DEPTH > 13
> + shift = -shift;
> + int offset = (1 << (shift - 1));
> + for (int j = 0; j < trSize; j++)
> {
> - for (k = 0; k < trSize; k++)
> + for (int k = 0; k < trSize; k++)
> {
> - coeff[j * trSize + k] = (resiBlock[j * stride + k] + offset) >> transformSkipShift;
> + coeff[j * trSize + k] = (resiBlock[j * stride + k] + offset) >> shift;
> }
> }
> }
> @@ -468,11 +464,10 @@
> * \param stride stride of input residual data
> * \param size transform size (size x size)
> */
> -void TComTrQuant::xITransformSkip(int32_t* coef, int16_t* residual, uint32_t stride, int trSize)
> +void TComTrQuant::xITransformSkip(int32_t* coef, int16_t* residual, uint32_t stride, uint32_t log2TrSize)
> {
> - uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> - int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> - int j, k;
> + int trSize = 1 << log2TrSize;
> + int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>
> if (shift > 0)
> {
> @@ -480,13 +475,13 @@
> }
> else
> {
> - //The case when X265_DEPTH >= 13
> - uint32_t transformSkipShift = -shift;
> - for (j = 0; j < trSize; j++)
> + // The case when X265_DEPTH >= 13
> + shift = -shift;
> + for (int j = 0; j < trSize; j++)
> {
> - for (k = 0; k < trSize; k++)
> + for (int k = 0; k < trSize; k++)
> {
> - residual[j * stride + k] = coef[j * trSize + k] << transformSkipShift;
> + residual[j * stride + k] = coef[j * trSize + k] << shift;
> }
> }
> }
> @@ -501,14 +496,14 @@
> * \param uiAbsSum reference to absolute sum of quantized transform coefficient
> * \param ttype plane type / luminance or chrominance
> * \param absPartIdx absolute partition index
> - * \returns void
> + * \returns number of significant coefficient
> * Rate distortion optimized quantization for entropy
> * coding engines using probability models like CABAC
> */
> -uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize,
> - TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
> +uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize,
> + TextType ttype, uint32_t absPartIdx)
> {
> - const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
> + uint32_t trSize = 1 << log2TrSize;
> int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
> int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
>
> @@ -567,7 +562,7 @@
> const uint32_t cgPosX = cgBlkPos - (cgPosY << codingParameters.log2TrSizeCG);
> const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
> memset(&rdStats, 0, sizeof(coeffGroupRDStats));
> - X265_CHECK((trSize >> 2) == (1 << codingParameters.log2TrSizeCG), "transform size invalid\n");
> + X265_CHECK(log2TrSize - 2 == codingParameters.log2TrSizeCG, "transform size invalid\n");
> const int patternSigCtx = TComTrQuant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG);
> for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
> {
> @@ -845,14 +840,12 @@
> } // end if (sigCoeffGroupFlag[ cgBlkPos ])
> } // end for
>
> - uint32_t absSum = 0;
> + numSig = 0;
> for (int pos = 0; pos < bestLastIdxp1; pos++)
> {
> int blkPos = codingParameters.scan[pos];
> int level = dstCoeff[blkPos];
> - absSum += level;
> - if (level)
> - *lastPos = blkPos;
> + numSig += (level != 0);
> uint32_t mask = (int32_t)srcCoeff[blkPos] >> 31;
> dstCoeff[blkPos] = (level ^ mask) - mask;
> }
> @@ -863,7 +856,7 @@
> dstCoeff[codingParameters.scan[pos]] = 0;
> }
>
> - if (cu->getSlice()->getPPS()->getSignHideFlag() && absSum >= 2)
> + if (cu->getSlice()->getPPS()->getSignHideFlag() && numSig >= 2)
> {
> int64_t rdFactor = (int64_t)(
> g_invQuantScales[m_qpParam.rem()] * g_invQuantScales[m_qpParam.rem()] * (1 << (2 * m_qpParam.m_per))
> @@ -901,14 +894,14 @@
> if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
> {
> uint32_t signbit = (dstCoeff[codingParameters.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1);
> - int tmpSum = 0;
> + int absSum = 0;
>
> for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
> {
> - tmpSum += dstCoeff[codingParameters.scan[n + subPos]];
> + absSum += dstCoeff[codingParameters.scan[n + subPos]];
> }
>
> - if (signbit != (tmpSum & 0x1)) // hide but need tune
> + if (signbit != (absSum & 0x1)) // hide but need tune
> {
> // calculate the cost
> int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
> @@ -974,6 +967,11 @@
> finalChange = -1;
> }
>
> + if (dstCoeff[minPos] == 0)
> + numSig++;
> + else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
> + numSig--;
> +
> if (srcCoeff[minPos] >= 0)
> {
> dstCoeff[minPos] += finalChange;
> @@ -988,7 +986,7 @@
> }
> }
>
> - return absSum;
> + return numSig;
> }
>
> /** Pattern decision for context derivation process of significant_coeff_flag
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibCommon/TComTrQuant.h
> --- a/source/Lib/TLibCommon/TComTrQuant.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibCommon/TComTrQuant.h Mon Jul 07 17:00:26 2014 +0900
> @@ -127,10 +127,10 @@
> void init(bool useRDOQ);
>
> // transform & inverse transform functions
> - uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize,
> - TextType ttype, uint32_t absPartIdx, int32_t* lastPos, bool useTransformSkip = false, bool curUseRDOQ = true);
> + uint32_t transformNxN(TComDataCU* cu, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize,
> + TextType ttype, uint32_t absPartIdx, bool useTransformSkip = false, bool curUseRDOQ = true);
>
> - void invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip = false, int lastPos = MAX_INT);
> + void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
>
> // Misc functions
> void setQPforQuant(int qpy, TextType ttype, int qpBdOffset, int chromaQPOffset, int chFmt);
> @@ -219,12 +219,13 @@
>
> private:
>
> - void xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, int trSize);
> - void signBitHidingHDQ(coeff_t* qcoeff, coeff_t* coeff, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters);
> - uint32_t xQuant(TComDataCU* cu, int32_t* src, coeff_t* dst, int trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
> + void xITransformSkip(int32_t* coeff, int16_t* residual, uint32_t stride, uint32_t log2TrSize);
> + void xTransformSkip(int16_t* resiBlock, uint32_t stride, int32_t* coeff, uint32_t log2TrSize);
> + uint32_t signBitHidingHDQ(coeff_t* qcoeff, coeff_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
> + uint32_t xQuant(TComDataCU* cu, int32_t* src, coeff_t* dst, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
>
> // RDOQ functions
> - uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
> + uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx);
>
> inline uint32_t xGetCodedLevel(double& codedCost, const double curCostSig, double& codedCostSig, int levelDouble,
> uint32_t maxAbsLevel, uint32_t baseLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice,
> @@ -243,8 +244,6 @@
> inline double xGetICost(double rate) const { return m_lambda * rate; } ///< Get the cost for a specific rate
>
> inline uint32_t xGetIEPRate() const { return 32768; } ///< Get the cost of an equal probable bit
> -
> - void xITransformSkip(int32_t* coeff, int16_t* residual, uint32_t stride, int trSize);
> };
> }
> //! \}
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibEncoder/TEncSbac.cpp
> --- a/source/Lib/TLibEncoder/TEncSbac.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibEncoder/TEncSbac.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -2046,7 +2046,7 @@
> DTRACE_CABAC_T("\n")
> #endif // if ENC_DEC_TRACE
>
> - X265_CHECK(trSize <= m_slice->getSPS()->getMaxTrSize(), "transform size out of range\n");
> + X265_CHECK(log2TrSize <= m_slice->getSPS()->getQuadtreeTULog2MaxSize(), "transform size out of range\n");
>
> // compute number of significant coefficients
> uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1)));
> diff -r 11c808e562b8 -r dcf6f2ce907c source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -452,22 +452,17 @@
> }
>
> //--- transform and quantization ---
> - uint32_t absSum;
> - int lastPos = -1;
> -
> int chFmt = cu->getChromaFormat();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> - absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
> + uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTransformSkip);
>
> //--- set coded block flag ---
> - cbf = absSum ? 1 : 0;
> -
> - if (absSum)
> + cbf = numSig ? 1 : 0;
> +
> + if (numSig)
> {
> //--- inverse transform ---
> - int scalingListType = 0 + TEXT_LUMA;
> - X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
> X265_CHECK(log2TrSize <= 5, "log2TrSize is too large %d\n", log2TrSize);
> //===== reconstruction =====
> primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> @@ -528,9 +523,6 @@
> }
>
> //--- transform and quantization ---
> - uint32_t absSum;
> - int lastPos = -1;
> -
> int chFmt = cu->getChromaFormat();
> int curChromaQpOffset;
> if (ttype == TEXT_CHROMA_U)
> @@ -542,18 +534,16 @@
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> }
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipC);
> + uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdx, useTransformSkipC);
>
> //--- set coded block flag ---
> - cbf = absSum ? 1 : 0;
> + cbf = numSig ? 1 : 0;
>
> uint32_t dist;
> - if (absSum)
> + if (numSig)
> {
> //--- inverse transform ---
> - int scalingListType = 0 + ttype;
> - X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSizeC, ttype, true, useTransformSkipC, numSig);
> X265_CHECK(log2TrSizeC <= 5, "log2TrSizeC is too large %d\n", log2TrSizeC);
> //===== reconstruction =====
> primitives.calcrecon[sizeIdxC](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> @@ -926,23 +916,18 @@
> primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
>
> //===== transform and quantization =====
> - uint32_t absSum = 0;
> - int lastPos = -1;
> -
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> - absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
> + uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTransformSkip);
>
> //--- set coded block flag ---
> - cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
> + cu->setCbfSubParts((numSig ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
>
> int part = partitionFromSize(tuSize);
>
> - if (absSum)
> + if (numSig)
> {
> //--- inverse transform ---
> - int scalingListType = 0 + TEXT_LUMA;
> - X265_CHECK(scalingListType < 6, "scalingListType %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
>
> // Generate Recon
> primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
> @@ -1432,9 +1417,6 @@
> primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
>
> //--- transform and quantization ---
> - uint32_t absSum = 0;
> - int lastPos = -1;
> -
> int curChromaQpOffset;
> if (ttype == TEXT_CHROMA_U)
> {
> @@ -1445,17 +1427,15 @@
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> }
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdxC, &lastPos, useTransformSkipC);
> + uint32_t numSig = m_trQuant->transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTransformSkipC);
>
> //--- set coded block flag ---
> - cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
> -
> - if (absSum)
> + cu->setCbfPartRange((((numSig > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
> +
> + if (numSig)
> {
> //--- inverse transform ---
> - int scalingListType = 0 + ttype;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipC, lastPos);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), residual, stride, coeff, log2TrSizeC, ttype, true, useTransformSkipC, numSig);
>
> //===== reconstruction =====
> // use square primitives
> @@ -2692,8 +2672,6 @@
> X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
>
> // code full block
> - uint32_t absSumY = 0, absSumU = 0, absSumV = 0;
> - int lastPosY = -1, lastPosU = -1, lastPosV = -1;
> if (bCheckFull)
> {
> uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> @@ -2716,8 +2694,6 @@
> coeff_t *coeffCurU = cu->getCoeffCb() + coeffOffsetC;
> coeff_t *coeffCurV = cu->getCoeffCr() + coeffOffsetC;
>
> - uint32_t trSize = 1 << log2TrSize;
> - uint32_t trSizeC = 1 << log2TrSizeC;
> uint32_t sizeIdx = log2TrSize - 2;
> uint32_t sizeIdxC = log2TrSizeC - 2;
> cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
> @@ -2729,24 +2705,20 @@
> const uint32_t strideResiC = resiYuv->m_cwidth;
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> - absSumY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
> - trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
> -
> - cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> -
> - if (absSumY)
> + uint32_t numSigY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
> + log2TrSize, TEXT_LUMA, absPartIdx, false, curuseRDOQ);
> +
> + cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +
> + if (numSigY)
> {
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -
> - int scalingListType = 3 + TEXT_LUMA;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY, coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
> }
> else
> {
> primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
> }
> - cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> if (bCodeChroma)
> {
> @@ -2766,45 +2738,37 @@
>
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSumU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset,
> - trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
> + uint32_t numSigU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset,
> + log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, curuseRDOQ);
>
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSumV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset,
> - trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
> -
> - cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> - cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> -
> - if (absSumU)
> + uint32_t numSigV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset,
> + log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, curuseRDOQ);
> +
> + cu->setCbfPartRange(numSigU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> +
> + if (numSigU)
> {
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> - int scalingListType = 3 + TEXT_CHROMA_U;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUOffset, trSizeC, scalingListType, false, lastPosU);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
> }
> else
> {
> primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
> }
> - if (absSumV)
> + if (numSigV)
> {
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> - int scalingListType = 3 + TEXT_CHROMA_V;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUOffset, trSizeC, scalingListType, false, lastPosV);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
> }
> else
> {
> primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
> }
> - cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> - cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> }
> while (isNextSection(&tuIterator));
>
> @@ -2894,9 +2858,8 @@
> uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> - uint32_t absSum[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> + uint32_t numSigY = 0;
> uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
> - int lastPos[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
> uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/];
>
> uint32_t bestCBF[MAX_NUM_COMPONENT];
> @@ -2910,6 +2873,8 @@
> // code full block
> if (bCheckFull)
> {
> + uint32_t numSigU[2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { 0, 0 };
> + uint32_t numSigV[2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { 0, 0 };
> uint32_t trSizeC = 1 << log2TrSizeC;
> int sizeIdx = log2TrSize - 2;
> int sizeIdxC = log2TrSizeC - 2;
> @@ -2933,14 +2898,14 @@
> }
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> - absSum[TEXT_LUMA][0] = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> - trSize, TEXT_LUMA, absPartIdx, &lastPos[TEXT_LUMA][0], false, curuseRDOQ);
> -
> - cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> + numSigY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
> + log2TrSize, TEXT_LUMA, absPartIdx, false, curuseRDOQ);
> +
> + cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> m_entropyCoder->resetBits();
> m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
> - if (absSum[TEXT_LUMA][0])
> + if (numSigY)
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
> singleBitsComp[TEXT_LUMA][0] = m_entropyCoder->getNumberOfWrittenBits();
>
> @@ -2966,24 +2931,24 @@
> //Cb transform
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSum[TEXT_CHROMA_U][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUOffset,
> - trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPos[TEXT_CHROMA_U][tuIterator.m_section], false, curuseRDOQ);
> + numSigU[tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUOffset,
> + log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false, curuseRDOQ);
> //Cr transform
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSum[TEXT_CHROMA_V][tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUOffset,
> - trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPos[TEXT_CHROMA_V][tuIterator.m_section], false, curuseRDOQ);
> -
> - cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> - cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> + numSigV[tuIterator.m_section] = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUOffset,
> + log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false, curuseRDOQ);
> +
> + cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_U, trMode);
> - if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
> + if (numSigU[tuIterator.m_section])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUOffset, absPartIdxC, log2TrSizeC, TEXT_CHROMA_U);
> singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = m_entropyCoder->getNumberOfWrittenBits() - singleBitsPrev;
>
> m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_V, trMode);
> - if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
> + if (numSigV[tuIterator.m_section])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUOffset, absPartIdxC, log2TrSizeC, TEXT_CHROMA_V);
> uint32_t newBits = m_entropyCoder->getNumberOfWrittenBits();
> singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section] = newBits - (singleBitsPrev + singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
> @@ -3021,13 +2986,10 @@
> {
> *outZeroDist += distY;
> }
> - if (absSum[TEXT_LUMA][0])
> + if (numSigY)
> {
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -
> - int scalingListType = 3 + TEXT_LUMA;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY, coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY); //this is for inter mode only
>
> const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, curResiY, strideResiY);
> uint32_t nonZeroPsyEnergyY = 0;
> @@ -3066,7 +3028,7 @@
> nullCostY = m_rdCost->calcRdCost(distY, nullBitsY);
> if (nullCostY < singleCostY)
> {
> - absSum[TEXT_LUMA][0] = 0;
> + numSigY = 0;
> #if CHECKED_BUILD || _DEBUG
> ::memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
> #endif
> @@ -3099,11 +3061,11 @@
>
> singleDistComp[TEXT_LUMA][0] = distY;
> singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
> - if (!absSum[TEXT_LUMA][0])
> + if (!numSigY)
> {
> primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
> }
> - cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> + cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> uint32_t distU = 0;
> uint32_t distV = 0;
> @@ -3130,19 +3092,17 @@
> {
> *outZeroDist += distU;
> }
> - if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
> + if (numSigU[tuIterator.m_section])
> {
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> - int scalingListType = 3 + TEXT_CHROMA_U;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUOffset,
> - trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiU, strideResiC, coeffCurU + subTUOffset,
> + log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU[tuIterator.m_section]);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
> curResiU, strideResiC);
> const uint32_t nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
> - uint32_t nonZeroPsyEnergyU = 0;
> + uint32_t nonZeroPsyEnergyU = 0;
> +
> if (m_rdCost->psyRdEnabled())
> {
> pixel* pred = predYuv->getCbAddr(absPartIdxC);
> @@ -3178,7 +3138,7 @@
> nullCostU = m_rdCost->calcRdCost(distU, nullBitsU);
> if (nullCostU < singleCostU)
> {
> - absSum[TEXT_CHROMA_U][tuIterator.m_section] = 0;
> + numSigU[tuIterator.m_section] = 0;
> #if CHECKED_BUILD || _DEBUG
> ::memset(coeffCurU + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
> #endif
> @@ -3212,7 +3172,7 @@
> singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = distU;
> singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = psyEnergyU;
>
> - if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])
> + if (!numSigU[tuIterator.m_section])
> {
> primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
> }
> @@ -3222,15 +3182,12 @@
> {
> *outZeroDist += distV;
> }
> - if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
> + if (numSigV[tuIterator.m_section])
> {
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> - int scalingListType = 3 + TEXT_CHROMA_V;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUOffset,
> - trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), curResiV, strideResiC, coeffCurV + subTUOffset,
> + log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV[tuIterator.m_section]);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
> curResiV, strideResiC);
> const uint32_t nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
> @@ -3271,7 +3228,7 @@
> nullCostV = m_rdCost->calcRdCost(distV, nullBitsV);
> if (nullCostV < singleCostV)
> {
> - absSum[TEXT_CHROMA_V][tuIterator.m_section] = 0;
> + numSigV[tuIterator.m_section] = 0;
> #if CHECKED_BUILD || _DEBUG
> ::memset(coeffCurV + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
> #endif
> @@ -3305,21 +3262,20 @@
> singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = distV;
> singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = psyEnergyV;
>
> - if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])
> + if (!numSigV[tuIterator.m_section])
> {
> primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
> }
>
> - cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> - cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> }
> while (isNextSection(&tuIterator));
> }
>
> - int lastPosTransformSkip[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { -1, -1 }, { -1, -1 }, { -1, -1 } };
> if (checkTransformSkipY)
> {
> - uint32_t nonZeroDistY = 0, absSumTransformSkipY;
> + uint32_t nonZeroDistY = 0;
> uint32_t nonZeroPsyEnergyY = 0;
> uint64_t singleCostY = MAX_INT64;
>
> @@ -3336,11 +3292,11 @@
> }
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> - absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
> - trSize, TEXT_LUMA, absPartIdx, &lastPosTransformSkip[TEXT_LUMA][0], true, curuseRDOQ);
> - cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> -
> - if (absSumTransformSkipY)
> + uint32_t numSigTSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
> + log2TrSize, TEXT_LUMA, absPartIdx, true, curuseRDOQ);
> + cu->setCbfSubParts(numSigTSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> +
> + if (numSigTSkipY)
> {
> m_entropyCoder->resetBits();
> m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
> @@ -3348,11 +3304,7 @@
> const uint32_t skipSingleBitsY = m_entropyCoder->getNumberOfWrittenBits();
>
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
> -
> - int scalingListType = 3 + TEXT_LUMA;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> -
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, tsResiY, trSize, tsCoeffY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
>
> nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
> tsResiY, trSize);
> @@ -3375,7 +3327,7 @@
> singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
> }
>
> - if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)
> + if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
> {
> cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
> }
> @@ -3383,18 +3335,18 @@
> {
> singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
> singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
> - absSum[TEXT_LUMA][0] = absSumTransformSkipY;
> + numSigY = numSigTSkipY;
> bestTransformMode[TEXT_LUMA][0] = 1;
> memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
> primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, tsResiY, trSize);
> }
>
> - cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> + cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> }
>
> if (bCodeChroma && checkTransformSkipUV)
> {
> - uint32_t nonZeroDistU = 0, nonZeroDistV = 0, absSumTransformSkipU, absSumTransformSkipV;
> + uint32_t nonZeroDistU = 0, nonZeroDistV = 0;
> uint32_t nonZeroPsyEnergyU = 0, nonZeroPsyEnergyV = 0;
> uint64_t singleCostU = MAX_INT64;
> uint64_t singleCostV = MAX_INT64;
> @@ -3429,20 +3381,20 @@
>
> int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
> - trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section], true, curuseRDOQ);
> + uint32_t numSigTSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffU,
> + log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, true, curuseRDOQ);
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> - absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
> - trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section], true, curuseRDOQ);
> -
> - cu->setCbfPartRange(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> - cu->setCbfPartRange(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> + uint32_t numSigTSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, tsCoeffV,
> + log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, true, curuseRDOQ);
> +
> + cu->setCbfPartRange(numSigTSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigTSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
>
> m_entropyCoder->resetBits();
> singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section] = 0;
>
> - if (absSumTransformSkipU)
> + if (numSigTSkipU)
> {
> m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_U, trMode);
> m_entropyCoder->encodeCoeffNxN(cu, tsCoeffU, absPartIdxC, log2TrSizeC, TEXT_CHROMA_U);
> @@ -3450,11 +3402,8 @@
>
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> - int scalingListType = 3 + TEXT_CHROMA_U;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiU, trSizeC, tsCoeffU,
> - trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), tsResiU, trSizeC, tsCoeffU,
> + log2TrSizeC, TEXT_CHROMA_U, false, true, numSigTSkipU);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
> tsResiU, trSizeC);
> nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
> @@ -3476,7 +3425,7 @@
> singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
> }
>
> - if (!absSumTransformSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
> + if (!numSigTSkipU || minCost[TEXT_CHROMA_U][tuIterator.m_section] < singleCostU)
> {
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> }
> @@ -3484,13 +3433,13 @@
> {
> singleDistComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroDistU;
> singlePsyEnergyComp[TEXT_CHROMA_U][tuIterator.m_section] = nonZeroPsyEnergyU;
> - absSum[TEXT_CHROMA_U][tuIterator.m_section] = absSumTransformSkipU;
> + numSigU[tuIterator.m_section] = numSigTSkipU;
> bestTransformMode[TEXT_CHROMA_U][tuIterator.m_section] = 1;
> memcpy(coeffCurU + subTUOffset, tsCoeffU, sizeof(coeff_t) * numCoeffC);
> primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, tsResiU, trSizeC);
> }
>
> - if (absSumTransformSkipV)
> + if (numSigTSkipV)
> {
> m_entropyCoder->encodeQtCbf(cu, absPartIdxC, TEXT_CHROMA_V, trMode);
> m_entropyCoder->encodeCoeffNxN(cu, tsCoeffV, absPartIdxC, log2TrSizeC, TEXT_CHROMA_V);
> @@ -3498,11 +3447,8 @@
>
> curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
> m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
> -
> - int scalingListType = 3 + TEXT_CHROMA_V;
> - X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
> - m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, tsResiV, trSizeC, tsCoeffV,
> - trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
> + m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), tsResiV, trSizeC, tsCoeffV,
> + log2TrSizeC, TEXT_CHROMA_V, false, true, numSigTSkipV);
> uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
> tsResiV, trSizeC);
> nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
> @@ -3524,7 +3470,7 @@
> singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
> }
>
> - if (!absSumTransformSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
> + if (!numSigTSkipV || minCost[TEXT_CHROMA_V][tuIterator.m_section] < singleCostV)
> {
> cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> }
> @@ -3532,14 +3478,14 @@
> {
> singleDistComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroDistV;
> singlePsyEnergyComp[TEXT_CHROMA_V][tuIterator.m_section] = nonZeroPsyEnergyV;
> - absSum[TEXT_CHROMA_V][tuIterator.m_section] = absSumTransformSkipV;
> + numSigV[tuIterator.m_section] = numSigTSkipV;
> bestTransformMode[TEXT_CHROMA_V][tuIterator.m_section] = 1;
> memcpy(coeffCurV + subTUOffset, tsCoeffV, sizeof(coeff_t) * numCoeffC);
> primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, tsResiV, trSizeC);
> }
>
> - cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> - cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigU[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
> + cu->setCbfPartRange(numSigV[tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
> }
> while (isNextSection(&tuIterator));
>
> @@ -3568,16 +3514,16 @@
> }
>
> m_entropyCoder->encodeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
> - if (absSum[TEXT_LUMA][0])
> + if (numSigY)
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
>
> if (bCodeChroma)
> {
> if (!splitIntoSubTUs)
> {
> - if (absSum[TEXT_CHROMA_U][0])
> + if (numSigU[0])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
> - if (absSum[TEXT_CHROMA_V][0])
> + if (numSigV[0])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
> }
> else
> @@ -3585,13 +3531,13 @@
> uint32_t subTUSize = 1 << (log2TrSizeC * 2);
> uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
>
> - if (absSum[TEXT_CHROMA_U][0])
> + if (numSigU[0])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
> - if (absSum[TEXT_CHROMA_U][1])
> + if (numSigU[1])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
> - if (absSum[TEXT_CHROMA_V][0])
> + if (numSigV[0])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
> - if (absSum[TEXT_CHROMA_V][1])
> + if (numSigV[1])
> m_entropyCoder->encodeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
> }
> }
> @@ -3734,7 +3680,7 @@
> cu->m_psyEnergy = singlePsyEnergy;
>
> cu->setTrIdxSubParts(trMode, absPartIdx, depth);
> - cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
> + cu->setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
>
> if (bCodeChroma)
> {
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/dct.cpp
> --- a/source/common/dct.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/dct.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -773,10 +773,10 @@
> }
> }
>
> -uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff, int32_t* lastPos)
> +uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff)
> {
> int qBits8 = qBits - 8;
> - uint32_t acSum = 0;
> + uint32_t numSig = 0;
>
> for (int blockpos = 0; blockpos < numCoeff; blockpos++)
> {
> @@ -785,15 +785,14 @@
>
> int tmplevel = abs(level) * quantCoeff[blockpos];
> level = ((tmplevel + add) >> qBits);
> + deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
> if (level)
> - *lastPos = blockpos;
> - deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
> - acSum += level;
> + ++numSig;
> level *= sign;
> qCoef[blockpos] = Clip3(-32768, 32767, level);
> }
>
> - return acSum;
> + return numSig;
> }
>
> uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff)
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/primitives.h
> --- a/source/common/primitives.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/primitives.h Mon Jul 07 17:00:26 2014 +0900
> @@ -146,7 +146,7 @@
> typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> -typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
> typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
> typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
> typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/x86/pixel-util.h Mon Jul 07 17:00:26 2014 +0900
> @@ -44,7 +44,7 @@
> void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
> void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
>
> -uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
> uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
> void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
> diff -r 11c808e562b8 -r dcf6f2ce907c source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/common/x86/pixel-util8.asm Mon Jul 07 17:00:26 2014 +0900
> @@ -27,8 +27,6 @@
>
> SECTION_RODATA 32
>
> -c_d_4: dd 4, 4, 4, 4
> -c_d_1234: dd 1, 2, 3, 4
> %if BIT_DEPTH == 10
> ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
> ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
> @@ -864,42 +862,25 @@
>
>
> ;-----------------------------------------------------------------------------
> -; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -%if ARCH_X86_64 == 1
> -cglobal quant, 5,6,11
> - %define addVec m8
> - %define qbits m9
> - %define qbits8 m10
> -%else
> -cglobal quant, 5,6,8, 0-(3*mmsize)
> - %define addVec [rsp + 0 * mmsize]
> - %define qbits [rsp + 1 * mmsize]
> - %define qbits8 [rsp + 2 * mmsize]
> -%endif
> +cglobal quant, 5,6,8
>
> ; fill qbits
> - movd m0, r4d
> - mova qbits, m0
> + movd m4, r4d ; m4 = qbits
>
> ; fill qbits-8
> sub r4d, 8
> - movd m0, r4d
> - mova qbits8, m0
> + movd m6, r4d ; m6 = qbits8
>
> ; fill offset
> - mov r4d, r5m
> - movd m0, r4d
> - pshufd m0, m0, 0
> - mova addVec, m0
> + movd m5, r5m
> + pshufd m5, m5, 0 ; m5 = add
>
> mov r4d, r6m
> shr r4d, 3
> - pxor m7, m7 ; m7 = acSum4
> - mova m6, [c_d_1234] ; m6 = last4
> - pxor m5, m5 ; m5 = count
> - mova m4, [c_d_4] ; m4 = [4 4 4 4]
> + pxor m7, m7 ; m7 = numZero
> .loop:
> ; 4 coeff
> movu m0, [r0] ; m0 = level
> @@ -908,19 +889,15 @@
> movu m2, [r1] ; m2 = qcoeff
> pabsd m0, m0
> pmulld m0, m2 ; m0 = tmpLevel1
> - paddd m2, m0, addVec
> - psrad m2, qbits ; m2 = level1
> - paddd m7, m2
> - pslld m3, m2, qbits
> + paddd m2, m0, m5
> + psrad m2, m4 ; m2 = level1
> + pslld m3, m2, m4
> psubd m0, m3
> - psrad m0, qbits8 ; m0 = deltaU1
> + psrad m0, m6 ; m0 = deltaU1
> movu [r2], m0
> pxor m0, m0
> pcmpeqd m0, m2 ; m0 = mask4
> - pand m5, m0
> - pandn m0, m6
> - por m5, m0
> - paddd m6, m4
> + psubd m7, m0
>
> pxor m2, m1
> psubd m2, m1
> @@ -934,19 +911,15 @@
> movu m2, [r1 + 16] ; m2 = qcoeff
> pabsd m0, m0
> pmulld m0, m2 ; m0 = tmpLevel1
> - paddd m2, m0, addVec
> - psrad m2, qbits ; m2 = level1
> - paddd m7, m2
> - pslld m3, m2, qbits
> + paddd m2, m0, m5
> + psrad m2, m4 ; m2 = level1
> + pslld m3, m2, m4
> psubd m0, m3
> - psrad m0, qbits8 ; m0 = deltaU1
> + psrad m0, m6 ; m0 = deltaU1
> movu [r2 + 16], m0
> pxor m0, m0
> pcmpeqd m0, m2 ; m0 = mask4
> - pand m5, m0
> - pandn m0, m6
> - por m5, m0
> - paddd m6, m4
> + psubd m7, m0
>
> pxor m2, m1
> psubd m2, m1
> @@ -962,18 +935,11 @@
> dec r4d
> jnz .loop
>
> - movhlps m4, m5
> - pmaxud m4, m5
> - pshufd m5, m4, 1
> - pmaxud m4, m5
> -
> - mov r4, r7m
> - movd [r4], m4
> - dec dword [r4]
> -
> phaddd m7, m7
> phaddd m7, m7
> - movd eax, m7
> + mov eax, r6m
> + movd r4d, m7
> + sub eax, r4d ; numSig
>
> RET
>
> @@ -985,11 +951,11 @@
> cglobal nquant, 5,6,8
>
> ; fill qbits
> - movd m5, r4d ; m5 = qbits
> + movd m4, r4d ; m4 = qbits
>
> ; fill offset
> - movd m6, r5m
> - pshufd m6, m6, 0 ; m6 = add
> + movd m5, r5m
> + pshufd m5, m5, 0 ; m5 = add
>
> mov r4d, r6m
> shr r4d, 3
> @@ -1003,10 +969,11 @@
> pabsd m0, m0
> pmulld m0, m2 ; m0 = tmpLevel1
> movu [r2], m0 ; m0 = scaledCoeff
> - paddd m2, m0, m6
> - psrad m2, m5 ; m2 = level1
> - pxor m4, m4
> - pcmpeqd m4, m2 ; m4 = mask4
> + paddd m2, m0, m5
> + psrad m2, m4 ; m2 = level1
> + pxor m0, m0
> + pcmpeqd m0, m2 ; m0 = mask4
> + psubd m7, m0
>
> pxor m2, m1
> psubd m2, m1
> @@ -1021,10 +988,11 @@
> pabsd m0, m0
> pmulld m0, m2 ; m0 = tmpLevel1
> movu [r2 + 16], m0 ; m0 = scaledCoeff
> - paddd m2, m0, m6
> - psrad m2, m5 ; m2 = level1
> + paddd m2, m0, m5
> + psrad m2, m4 ; m2 = level1
> pxor m0, m0
> pcmpeqd m0, m2 ; m0 = mask4
> + psubd m7, m0
>
> pxor m2, m1
> psubd m2, m1
> @@ -1032,9 +1000,6 @@
> pmovsxwd m2, m2
> movu [r3 + 16], m2
>
> - packssdw m4, m0 ; m4 = mask8
> - psubw m7, m4 ; m7 = numZero
> -
> add r0, 32
> add r1, 32
> add r2, 32
> @@ -1043,11 +1008,10 @@
> dec r4d
> jnz .loop
>
> - packuswb m7, m7
> - pxor m0, m0
> - psadbw m0, m7
> + phaddd m7, m7
> + phaddd m7, m7
> mov eax, r6m
> - movd r4d, m0
> + movd r4d, m7
> sub eax, r4d ; numSig
>
> RET
> diff -r 11c808e562b8 -r dcf6f2ce907c source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/encoder/encoder.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -1052,8 +1052,6 @@
>
> sps->setTMVPFlagsPresent(false);
>
> - sps->setMaxTrSize(1 << m_quadtreeTULog2MaxSize);
> -
> for (uint32_t i = 0; i < g_maxCUDepth - g_addCUDepth; i++)
> {
> sps->setAMPAcc(i, m_param->bEnableAMP);
> diff -r 11c808e562b8 -r dcf6f2ce907c source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp Thu Jul 03 15:12:45 2014 -0700
> +++ b/source/test/mbdstharness.cpp Mon Jul 07 17:00:26 2014 +0900
> @@ -300,13 +300,12 @@
> int valueToAdd = rand() % (32 * 1024);
> int cmp_size = sizeof(int) * height * width;
> int numCoeff = height * width;
> - int optLastPos = -1, refLastPos = -1;
>
> int index1 = rand() % TEST_CASES;
> int index2 = rand() % TEST_CASES;
>
> - refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff, &refLastPos);
> - optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff, &optLastPos);
> + refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
> + optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
>
> if (memcmp(mintbuf3, mintbuf5, cmp_size))
> return false;
> @@ -317,9 +316,6 @@
> if (optReturnValue != refReturnValue)
> return false;
>
> - if (optLastPos != refLastPos)
> - return false;
> -
> reportfail();
> j += 16;
> }
> @@ -509,8 +505,7 @@
> if (opt.quant)
> {
> printf("quant\t\t");
> - int dummy = -1;
> - REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
> + REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
> }
>
> if (opt.nquant)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list