[x265] [PATCH 2 of 6] generate lastPos in quant
Min Chen
chenm003 at 163.com
Fri Aug 16 12:52:32 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1376650184 -28800
# Node ID 4be95d676094ee2ec88c63fa2620ab6f037234b5
# Parent 681ab201ea0cb2dfb6e08bc51c8d75a107aa1c39
generate lastPos in quant
diff -r 681ab201ea0c -r 4be95d676094 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Fri Aug 16 18:49:19 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Fri Aug 16 18:49:44 2013 +0800
@@ -255,7 +255,7 @@
}
UInt TComTrQuant::xQuant(TComDataCU* cu, Int* coef, TCoeff* qCoef, Int width, Int height,
- TextType ttype, UInt absPartIdx)
+ TextType ttype, UInt absPartIdx, int *lastPos)
{
UInt acSum = 0;
Int add = 0;
@@ -263,7 +263,7 @@
if (useRDOQ && (ttype == TEXT_LUMA || RDOQ_CHROMA))
{
- acSum = xRateDistOptQuant(cu, coef, qCoef, width, height, ttype, absPartIdx);
+ acSum = xRateDistOptQuant(cu, coef, qCoef, width, height, ttype, absPartIdx, lastPos);
}
else
{
@@ -321,7 +321,7 @@
add = (cu->getSlice()->getSliceType() == I_SLICE ? 171 : 85) << (qbits - 9);
Int numCoeff = width * height;
- acSum += x265::primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff);
+ acSum += x265::primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff, lastPos);
if (cu->getSlice()->getPPS()->getSignHideFlag() && acSum >= 2)
signBitHidingHDQ(qCoef, coef, scan, deltaU, width, height);
@@ -403,6 +403,7 @@
UInt height,
TextType ttype,
UInt absPartIdx,
+ int* lastPos,
Bool useTransformSkip)
{
if (cu->getCUTransquantBypass(absPartIdx))
@@ -441,7 +442,7 @@
const UInt log2BlockSize = g_convertToBit[width];
x265::primitives.dct[x265::DCT_4x4 + log2BlockSize - ((width == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
}
- return xQuant(cu, m_tmpCoeff, coeff, width, height, ttype, absPartIdx);
+ return xQuant(cu, m_tmpCoeff, coeff, width, height, ttype, absPartIdx, lastPos);
}
Void TComTrQuant::invtransformNxN( Bool transQuantBypass, UInt mode, Short* residual, UInt stride, TCoeff* coeff, UInt width, UInt height, Int scalingListType, Bool useTransformSkip /*= false*/ )
@@ -630,7 +631,7 @@
* coding engines using probability models like CABAC
*/
UInt TComTrQuant::xRateDistOptQuant(TComDataCU* cu, Int* srcCoeff, TCoeff* dstCoeff, UInt width, UInt height,
- TextType ttype, UInt absPartIdx)
+ TextType ttype, UInt absPartIdx, int *lastPos)
{
UInt log2TrSize = g_convertToBit[width] + 2;
UInt absSum = 0;
@@ -975,6 +976,8 @@
Int blkPos = scan[pos];
Int level = dstCoeff[blkPos];
absSum += level;
+ if (level)
+ *lastPos = blkPos;
dstCoeff[blkPos] = (srcCoeff[blkPos] < 0) ? -level : level;
}
diff -r 681ab201ea0c -r 4be95d676094 source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h Fri Aug 16 18:49:19 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.h Fri Aug 16 18:49:44 2013 +0800
@@ -125,7 +125,7 @@
// transform & inverse transform functions
UInt transformNxN(TComDataCU* cu, Short* residual, UInt stride, TCoeff* coeff, UInt width, UInt height,
- TextType ttype, UInt absPartIdx, Bool useTransformSkip = false);
+ TextType ttype, UInt absPartIdx, int* lastPos, Bool useTransformSkip = false);
Void invtransformNxN(Bool transQuantBypass, UInt mode, Short* residual, UInt stride, TCoeff* coeff, UInt width, UInt height, Int scalingListType, Bool useTransformSkip = false);
@@ -195,10 +195,10 @@
Void signBitHidingHDQ(TCoeff* qcoeff, TCoeff* coeff, const UInt* scan, Int* deltaU, Int width, Int height);
- UInt xQuant(TComDataCU* cu, Int* src, TCoeff* dst, Int width, Int height, TextType ttype, UInt absPartIdx);
+ UInt xQuant(TComDataCU* cu, Int* src, TCoeff* dst, Int width, Int height, TextType ttype, UInt absPartIdx, int *lastPos);
// RDOQ functions
- UInt xRateDistOptQuant(TComDataCU* cu, Int* srcCoeff, TCoeff* dstCoeff, UInt width, UInt height, TextType ttype, UInt absPartIdx);
+ UInt xRateDistOptQuant(TComDataCU* cu, Int* srcCoeff, TCoeff* dstCoeff, UInt width, UInt height, TextType ttype, UInt absPartIdx, int *lastPos);
inline UInt xGetCodedLevel(Double& codedCost, Double& codedCost0, Double& codedCostSig, Int levelDouble,
UInt maxAbsLevel, UShort ctxNumSig, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice,
diff -r 681ab201ea0c -r 4be95d676094 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Aug 16 18:49:19 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Aug 16 18:49:44 2013 +0800
@@ -539,12 +539,13 @@
//--- transform and quantization ---
UInt absSum = 0;
+ int lastPos = -1;
cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
m_trQuant->selectLambda(TEXT_LUMA);
- absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, TEXT_LUMA, absPartIdx, useTransformSkip);
+ absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
//--- set coded block flag ---
cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -667,6 +668,7 @@
}
//--- transform and quantization ---
UInt absSum = 0;
+ int lastPos = -1;
Int curChromaQpOffset;
if (ttype == TEXT_CHROMA_U)
@@ -681,7 +683,7 @@
m_trQuant->selectLambda(TEXT_CHROMA);
- absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, ttype, absPartIdx, useTransformSkipChroma);
+ absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, ttype, absPartIdx, &lastPos, useTransformSkipChroma);
//--- set coded block flag ---
cu->setCbfSubParts((absSum ? 1 : 0) << origTrDepth, ttype, absPartIdx, cu->getDepth(0) + trDepth);
@@ -3289,6 +3291,7 @@
UInt singleBits = 0;
UInt singleDist = 0;
UInt absSumY = 0, absSumU = 0, absSumV = 0;
+ int lastPosY = -1, lastPosU = -1, lastPosV = -1;
UInt bestTransformMode[3] = { 0 };
m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
@@ -3332,7 +3335,7 @@
m_trQuant->selectLambda(TEXT_LUMA);
absSumY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
- trWidth, trHeight, TEXT_LUMA, absPartIdx);
+ trWidth, trHeight, TEXT_LUMA, absPartIdx, &lastPosY);
cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
@@ -3349,12 +3352,12 @@
m_trQuant->selectLambda(TEXT_CHROMA);
absSumU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurU,
- trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx);
+ trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx, &lastPosU);
curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
absSumV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
- trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx);
+ trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx, &lastPosV);
cu->setCbfSubParts(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdx, cu->getDepth(0) + trModeC);
cu->setCbfSubParts(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdx, cu->getDepth(0) + trModeC);
@@ -3605,6 +3608,7 @@
if (checkTransformSkipY)
{
UInt nonZeroDistY = 0, absSumTransformSkipY;
+ int lastPosTransformSkipY = -1;
UInt64 singleCostY = MAX_INT64;
Short *curResiY = m_qtTempTComYuv[qtlayer].getLumaAddr(absTUPartIdx);
@@ -3632,7 +3636,7 @@
m_trQuant->selectLambda(TEXT_LUMA);
absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
- trWidth, trHeight, TEXT_LUMA, absPartIdx, true);
+ trWidth, trHeight, TEXT_LUMA, absPartIdx, &lastPosTransformSkipY, true);
cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
if (absSumTransformSkipY != 0)
@@ -3678,6 +3682,7 @@
if (bCodeChroma && checkTransformSkipUV)
{
UInt nonZeroDistU = 0, nonZeroDistV = 0, absSumTransformSkipU, absSumTransformSkipV;
+ int lastPosTransformSkipU = -1, lastPosTransformSkipV = -1;
UInt64 singleCostU = MAX_INT64;
UInt64 singleCostV = MAX_INT64;
@@ -3711,11 +3716,11 @@
m_trQuant->selectLambda(TEXT_CHROMA);
absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurU,
- trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx, true);
+ trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx, &lastPosTransformSkipU, true);
curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
- trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx, true);
+ trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx, &lastPosTransformSkipV, true);
cu->setCbfSubParts(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdx, cu->getDepth(0) + trModeC);
cu->setCbfSubParts(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdx, cu->getDepth(0) + trModeC);
diff -r 681ab201ea0c -r 4be95d676094 source/common/dct.cpp
--- a/source/common/dct.cpp Fri Aug 16 18:49:19 2013 +0800
+++ b/source/common/dct.cpp Fri Aug 16 18:49:44 2013 +0800
@@ -772,7 +772,7 @@
}
}
-uint32_t quant_c(int* coef, int* quantCoeff, int* deltaU, int* qCoef, int qBits, int add, int numCoeff)
+uint32_t quant_c(int* coef, int* quantCoeff, int* deltaU, int* qCoef, int qBits, int add, int numCoeff, int* lastPos)
{
int qBits8 = qBits - 8;
uint32_t acSum = 0;
@@ -786,6 +786,8 @@
int tmplevel = abs(level) * quantCoeff[blockpos];
level = ((tmplevel + add) >> qBits);
+ if (level)
+ *lastPos = blockpos;
deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
acSum += level;
level *= sign;
diff -r 681ab201ea0c -r 4be95d676094 source/common/primitives.h
--- a/source/common/primitives.h Fri Aug 16 18:49:19 2013 +0800
+++ b/source/common/primitives.h Fri Aug 16 18:49:44 2013 +0800
@@ -213,7 +213,7 @@
typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, short *residual, int stride);
typedef void (*calcrecon_t)(pixel* pred, short* residual, pixel* recon, short* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff, int* lastPos);
typedef void (*dequant_t)(const int* src, int* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList,
unsigned int trSizeLog2, int *dequantCoef);
diff -r 681ab201ea0c -r 4be95d676094 source/common/vec/dct.inc
--- a/source/common/vec/dct.inc Fri Aug 16 18:49:19 2013 +0800
+++ b/source/common/vec/dct.inc Fri Aug 16 18:49:44 2013 +0800
@@ -3907,16 +3907,21 @@
int* qCoef,
int qBits,
int add,
- int numCoeff)
+ int numCoeff,
+ int* lastPos)
{
int qBits8 = qBits - 8;
uint32_t acSum = 0;
int dstOffset = 0;
__m128i acSum4 = _mm_setzero_si128();
__m128i addVec = _mm_set1_epi32(add);
+ __m128i maskPos4 = _mm_setr_epi32(0, 1, 2, 3);
+ __m128i posNext4 = _mm_set1_epi32(4);
+ __m128i lastPos4 = _mm_set1_epi32(-1);
for (int blockpos = 0; blockpos < numCoeff; blockpos += 8)
{
+ __m128i maskZero;
__m128i level1 = _mm_loadu_si128((__m128i*)(coef + blockpos));
__m128i sign1 = _mm_cmplt_epi32(level1, _mm_setzero_si128());
@@ -3927,10 +3932,18 @@
__m128i deltaU1 = _mm_srai_epi32(_mm_sub_epi32(tmplevel1, _mm_slli_epi32(level1, qBits)), qBits8);
_mm_storeu_si128((__m128i*)(deltaU + blockpos), deltaU1);
acSum4 = _mm_add_epi32(acSum4, level1);
+
+ maskZero = _mm_cmpeq_epi32(level1, _mm_setzero_si128());
+ //lastPos4 = _mm_or_si128(_mm_andnot_si128(maskZero, maskPos4), _mm_and_si128(maskZero, lastPos4));
+ //lastPos4 = _mm_blendv_epi8(maskPos4, lastPos4, maskZero);
+ lastPos4 = _mm_max_epi32(lastPos4, _mm_or_si128(maskZero, maskPos4));
+ maskPos4 = _mm_add_epi32(maskPos4, posNext4);
+
level1 = _mm_sub_epi32(_mm_xor_si128(level1, sign1), sign1);
level1 = _mm_cvtepi16_epi32(_mm_packs_epi32(level1, level1));
_mm_storeu_si128((__m128i*)(qCoef + dstOffset), level1);
+
__m128i level2 = _mm_loadu_si128((__m128i*)(coef + blockpos + 4));
__m128i sign2 = _mm_cmplt_epi32(level2, _mm_setzero_si128());
@@ -3940,6 +3953,13 @@
__m128i deltaU2 = _mm_srai_epi32(_mm_sub_epi32(tmplevel2, _mm_slli_epi32(level2, qBits)), qBits8);
_mm_storeu_si128((__m128i*)(deltaU + blockpos + 4), deltaU2);
acSum4 = _mm_add_epi32(acSum4, level2);
+
+ maskZero = _mm_cmpeq_epi32(level2, _mm_setzero_si128());
+ //lastPos4 = _mm_or_si128(_mm_andnot_si128(maskZero, maskPos4), _mm_and_si128(maskZero, lastPos4));
+ //lastPos4 = _mm_blendv_epi8(maskPos4, lastPos4, maskZero);
+ lastPos4 = _mm_max_epi32(lastPos4, _mm_or_si128(maskZero, maskPos4));
+ maskPos4 = _mm_add_epi32(maskPos4, posNext4);
+
level2 = _mm_sub_epi32(_mm_xor_si128(level2, sign2), sign2);
level2 = _mm_cvtepi16_epi32(_mm_packs_epi32(level2, level2));
_mm_storeu_si128((__m128i*)(qCoef + dstOffset + 4), level2);
@@ -3950,6 +3970,11 @@
acSum4 = _mm_hadd_epi32(acSum4, acSum4);
acSum = _mm_cvtsi128_si32(acSum4);
+ lastPos4 = _mm_max_epi32(lastPos4, _mm_shuffle_epi32(lastPos4, 0x0E));
+ lastPos4 = _mm_max_epi32(lastPos4, _mm_shuffle_epi32(lastPos4, 0x01));
+ int tmp = _mm_cvtsi128_si32(lastPos4);
+ *lastPos = tmp;
+
return acSum;
}
}
diff -r 681ab201ea0c -r 4be95d676094 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Fri Aug 16 18:49:19 2013 +0800
+++ b/source/test/mbdstharness.cpp Fri Aug 16 18:49:44 2013 +0800
@@ -238,6 +238,13 @@
{
int j = 0;
+ // refill the input buffers with random values to avoid invalid Q values
+ for (int i = 0; i < mb_t_size; i++)
+ {
+ mintbuf1[i] = rand() & PIXEL_MAX;
+ mintbuf2[i] = rand() & PIXEL_MAX;
+ }
+
for (int i = 0; i <= 5; i++)
{
int width = (rand() % 4 + 1) * 4;
@@ -255,9 +262,10 @@
int valueToAdd = rand() % (32 * 1024);
int cmp_size = sizeof(int) * height * width;
int numCoeff = height * width;
+ int optLastPos = -1, refLastPos = -1;
- refReturnValue = ref(mintbuf1 + j, mintbuf2 + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
- optReturnValue = opt(mintbuf1 + j, mintbuf2 + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
+ refReturnValue = ref(mintbuf1 + j, mintbuf2 + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff, &refLastPos);
+ optReturnValue = opt(mintbuf1 + j, mintbuf2 + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff, &optLastPos);
if (memcmp(mintbuf3, mintbuf5, cmp_size))
return false;
@@ -268,6 +276,9 @@
if (optReturnValue != refReturnValue)
return false;
+ if (optLastPos != refLastPos)
+ return false;
+
j += 16;
#if _DEBUG
@@ -357,6 +368,7 @@
if (opt.quant)
{
printf("quant\t\t");
- REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
+ int dummy = -1;
+ REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
}
}
More information about the x265-devel mailing list