[x265] [PATCH 2 of 6] generate lastPos in quant

Fri Aug 16 12:52:32 CEST 2013

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1376650184 -28800
# Node ID 4be95d676094ee2ec88c63fa2620ab6f037234b5
# Parent  681ab201ea0cb2dfb6e08bc51c8d75a107aa1c39
generate lastPos in quant

diff -r 681ab201ea0c -r 4be95d676094 source/Lib/TLibCommon/TComTrQuant.cpp

--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Aug 16 18:49:44 2013 +0800
@@ -255,7 +255,7 @@
 }
 
 UInt TComTrQuant::xQuant(TComDataCU* cu, Int* coef, TCoeff* qCoef, Int width, Int height,
-                         TextType ttype, UInt absPartIdx)
+                         TextType ttype, UInt absPartIdx, int *lastPos)
 {
     UInt acSum = 0;
     Int add = 0;
@@ -263,7 +263,7 @@
 
     if (useRDOQ && (ttype == TEXT_LUMA || RDOQ_CHROMA))
     {
-        acSum = xRateDistOptQuant(cu, coef, qCoef, width, height, ttype, absPartIdx);
+        acSum = xRateDistOptQuant(cu, coef, qCoef, width, height, ttype, absPartIdx, lastPos);
     }
     else
     {
@@ -321,7 +321,7 @@
         add = (cu->getSlice()->getSliceType() == I_SLICE ? 171 : 85) << (qbits - 9);
 
         Int numCoeff = width * height;
-        acSum += x265::primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff);
+        acSum += x265::primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff, lastPos);
 
         if (cu->getSlice()->getPPS()->getSignHideFlag() && acSum >= 2)
             signBitHidingHDQ(qCoef, coef, scan, deltaU, width, height);
@@ -403,6 +403,7 @@
                                UInt        height,
                                TextType    ttype,
                                UInt        absPartIdx,
+                               int*        lastPos,
                                Bool        useTransformSkip)
 {
     if (cu->getCUTransquantBypass(absPartIdx))
@@ -441,7 +442,7 @@
         const UInt log2BlockSize = g_convertToBit[width];
         x265::primitives.dct[x265::DCT_4x4 + log2BlockSize - ((width == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
     }
-    return xQuant(cu, m_tmpCoeff, coeff, width, height, ttype, absPartIdx);
+    return xQuant(cu, m_tmpCoeff, coeff, width, height, ttype, absPartIdx, lastPos);
 }
 
 Void TComTrQuant::invtransformNxN( Bool transQuantBypass, UInt mode, Short* residual, UInt stride, TCoeff* coeff, UInt width, UInt height, Int scalingListType, Bool useTransformSkip /*= false*/ )
@@ -630,7 +631,7 @@
  * coding engines using probability models like CABAC
  */
 UInt TComTrQuant::xRateDistOptQuant(TComDataCU* cu, Int* srcCoeff, TCoeff* dstCoeff, UInt width, UInt height,
-                                    TextType ttype, UInt absPartIdx)
+                                    TextType ttype, UInt absPartIdx, int *lastPos)
 {
     UInt log2TrSize = g_convertToBit[width] + 2;
     UInt absSum = 0;
@@ -975,6 +976,8 @@
         Int blkPos = scan[pos];
         Int level  = dstCoeff[blkPos];
         absSum += level;
+        if (level)
+            *lastPos = blkPos;
         dstCoeff[blkPos] = (srcCoeff[blkPos] < 0) ? -level : level;
     }
 
diff -r 681ab201ea0c -r 4be95d676094 source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.h	Fri Aug 16 18:49:44 2013 +0800
@@ -125,7 +125,7 @@
 
     // transform & inverse transform functions
     UInt transformNxN(TComDataCU* cu, Short* residual, UInt stride, TCoeff* coeff, UInt width, UInt height,
-                      TextType ttype, UInt absPartIdx, Bool useTransformSkip = false);
+                      TextType ttype, UInt absPartIdx, int* lastPos, Bool useTransformSkip = false);
 
     Void invtransformNxN(Bool transQuantBypass, UInt mode, Short* residual, UInt stride, TCoeff* coeff, UInt width, UInt height, Int scalingListType, Bool useTransformSkip = false);
 
@@ -195,10 +195,10 @@
 
     Void signBitHidingHDQ(TCoeff* qcoeff, TCoeff* coeff, const UInt* scan, Int* deltaU, Int width, Int height);
 
-    UInt xQuant(TComDataCU* cu, Int* src, TCoeff* dst, Int width, Int height, TextType ttype, UInt absPartIdx);
+    UInt xQuant(TComDataCU* cu, Int* src, TCoeff* dst, Int width, Int height, TextType ttype, UInt absPartIdx, int *lastPos);
 
     // RDOQ functions
-    UInt xRateDistOptQuant(TComDataCU* cu, Int* srcCoeff, TCoeff* dstCoeff, UInt width, UInt height, TextType ttype, UInt absPartIdx);
+    UInt xRateDistOptQuant(TComDataCU* cu, Int* srcCoeff, TCoeff* dstCoeff, UInt width, UInt height, TextType ttype, UInt absPartIdx, int *lastPos);
 
     inline UInt xGetCodedLevel(Double& codedCost, Double& codedCost0, Double& codedCostSig, Int levelDouble,
                                  UInt maxAbsLevel, UShort ctxNumSig, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice,
diff -r 681ab201ea0c -r 4be95d676094 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Fri Aug 16 18:49:44 2013 +0800
@@ -539,12 +539,13 @@
 
     //--- transform and quantization ---
     UInt absSum = 0;
+    int lastPos = -1;
     cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
 
     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, cu->getSlice()->getSPS()->getQpBDOffsetY(), 0);
     m_trQuant->selectLambda(TEXT_LUMA);
 
-    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, TEXT_LUMA, absPartIdx, useTransformSkip);
+    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, TEXT_LUMA, absPartIdx, &lastPos, useTransformSkip);
 
     //--- set coded block flag ---
     cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -667,6 +668,7 @@
         }
         //--- transform and quantization ---
         UInt absSum = 0;
+        int lastPos = -1;
 
         Int curChromaQpOffset;
         if (ttype == TEXT_CHROMA_U)
@@ -681,7 +683,7 @@
 
         m_trQuant->selectLambda(TEXT_CHROMA);
 
-        absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, ttype, absPartIdx, useTransformSkipChroma);
+        absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, width, height, ttype, absPartIdx, &lastPos, useTransformSkipChroma);
 
         //--- set coded block flag ---
         cu->setCbfSubParts((absSum ? 1 : 0) << origTrDepth, ttype, absPartIdx, cu->getDepth(0) + trDepth);
@@ -3289,6 +3291,7 @@
     UInt singleBits = 0;
     UInt singleDist = 0;
     UInt absSumY = 0, absSumU = 0, absSumV = 0;
+    int lastPosY = -1, lastPosU = -1, lastPosV = -1;
     UInt bestTransformMode[3] = { 0 };
 
     m_rdGoOnSbacCoder->store(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
@@ -3332,7 +3335,7 @@
         m_trQuant->selectLambda(TEXT_LUMA);
 
         absSumY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
-                                          trWidth, trHeight, TEXT_LUMA, absPartIdx);
+                                          trWidth, trHeight, TEXT_LUMA, absPartIdx, &lastPosY);
 
         cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
@@ -3349,12 +3352,12 @@
             m_trQuant->selectLambda(TEXT_CHROMA);
 
             absSumU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurU,
-                                              trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx);
+                                              trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx, &lastPosU);
 
             curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
             absSumV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
-                                              trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx);
+                                              trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx, &lastPosV);
 
             cu->setCbfSubParts(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdx, cu->getDepth(0) + trModeC);
             cu->setCbfSubParts(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdx, cu->getDepth(0) + trModeC);
@@ -3605,6 +3608,7 @@
         if (checkTransformSkipY)
         {
             UInt nonZeroDistY = 0, absSumTransformSkipY;
+            int lastPosTransformSkipY = -1;
             UInt64 singleCostY = MAX_INT64;
 
             Short *curResiY = m_qtTempTComYuv[qtlayer].getLumaAddr(absTUPartIdx);
@@ -3632,7 +3636,7 @@
 
             m_trQuant->selectLambda(TEXT_LUMA);
             absSumTransformSkipY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absTUPartIdx), resiYuv->m_width, coeffCurY,
-                                                           trWidth, trHeight, TEXT_LUMA, absPartIdx, true);
+                                                           trWidth, trHeight, TEXT_LUMA, absPartIdx, &lastPosTransformSkipY, true);
             cu->setCbfSubParts(absSumTransformSkipY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
 
             if (absSumTransformSkipY != 0)
@@ -3678,6 +3682,7 @@
         if (bCodeChroma && checkTransformSkipUV)
         {
             UInt nonZeroDistU = 0, nonZeroDistV = 0, absSumTransformSkipU, absSumTransformSkipV;
+            int lastPosTransformSkipU = -1, lastPosTransformSkipV = -1;
             UInt64 singleCostU = MAX_INT64;
             UInt64 singleCostV = MAX_INT64;
 
@@ -3711,11 +3716,11 @@
             m_trQuant->selectLambda(TEXT_CHROMA);
 
             absSumTransformSkipU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurU,
-                                                           trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx, true);
+                                                           trWidthC, trHeightC, TEXT_CHROMA_U, absPartIdx, &lastPosTransformSkipU, true);
             curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset);
             absSumTransformSkipV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absTUPartIdxC), resiYuv->m_cwidth, coeffCurV,
-                                                           trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx, true);
+                                                           trWidthC, trHeightC, TEXT_CHROMA_V, absPartIdx, &lastPosTransformSkipV, true);
 
             cu->setCbfSubParts(absSumTransformSkipU ? setCbf : 0, TEXT_CHROMA_U, absPartIdx, cu->getDepth(0) + trModeC);
             cu->setCbfSubParts(absSumTransformSkipV ? setCbf : 0, TEXT_CHROMA_V, absPartIdx, cu->getDepth(0) + trModeC);
diff -r 681ab201ea0c -r 4be95d676094 source/common/dct.cpp
--- a/source/common/dct.cpp	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/common/dct.cpp	Fri Aug 16 18:49:44 2013 +0800
@@ -772,7 +772,7 @@
     }
 }
 
-uint32_t quant_c(int* coef, int* quantCoeff, int* deltaU, int* qCoef, int qBits, int add, int numCoeff)
+uint32_t quant_c(int* coef, int* quantCoeff, int* deltaU, int* qCoef, int qBits, int add, int numCoeff, int* lastPos)
 {
     int qBits8 = qBits - 8;
     uint32_t acSum = 0;
@@ -786,6 +786,8 @@
 
         int tmplevel = abs(level) * quantCoeff[blockpos];
         level = ((tmplevel + add) >> qBits);
+        if (level)
+            *lastPos = blockpos;
         deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
         acSum += level;
         level *= sign;
diff -r 681ab201ea0c -r 4be95d676094 source/common/primitives.h
--- a/source/common/primitives.h	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/common/primitives.h	Fri Aug 16 18:49:44 2013 +0800
@@ -213,7 +213,7 @@
 typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, short *residual, int stride);
 typedef void (*calcrecon_t)(pixel* pred, short* residual, pixel* recon, short* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff, int* lastPos);
 typedef void (*dequant_t)(const int* src, int* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList,
                           unsigned int trSizeLog2, int *dequantCoef);
 
diff -r 681ab201ea0c -r 4be95d676094 source/common/vec/dct.inc
--- a/source/common/vec/dct.inc	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/common/vec/dct.inc	Fri Aug 16 18:49:44 2013 +0800
@@ -3907,16 +3907,21 @@
                int* qCoef,
                int  qBits,
                int  add,
-               int  numCoeff)
+               int  numCoeff,
+               int* lastPos)
 {
     int qBits8 = qBits - 8;
     uint32_t acSum = 0;
     int dstOffset = 0;
     __m128i acSum4 = _mm_setzero_si128();
     __m128i addVec = _mm_set1_epi32(add);
+    __m128i maskPos4 = _mm_setr_epi32(0, 1, 2, 3);
+    __m128i posNext4 = _mm_set1_epi32(4);
+    __m128i lastPos4 = _mm_set1_epi32(-1);
 
     for (int blockpos = 0; blockpos < numCoeff; blockpos += 8)
     {
+        __m128i maskZero;
         __m128i level1 = _mm_loadu_si128((__m128i*)(coef + blockpos));
 
         __m128i sign1 = _mm_cmplt_epi32(level1, _mm_setzero_si128());
@@ -3927,10 +3932,18 @@
         __m128i deltaU1 = _mm_srai_epi32(_mm_sub_epi32(tmplevel1, _mm_slli_epi32(level1, qBits)), qBits8);
         _mm_storeu_si128((__m128i*)(deltaU + blockpos), deltaU1);
         acSum4 = _mm_add_epi32(acSum4, level1);
+
+        maskZero = _mm_cmpeq_epi32(level1, _mm_setzero_si128());
+        //lastPos4 = _mm_or_si128(_mm_andnot_si128(maskZero, maskPos4), _mm_and_si128(maskZero, lastPos4));
+        //lastPos4 = _mm_blendv_epi8(maskPos4, lastPos4, maskZero);
+        lastPos4 = _mm_max_epi32(lastPos4, _mm_or_si128(maskZero, maskPos4));
+        maskPos4 = _mm_add_epi32(maskPos4, posNext4);
+
         level1 = _mm_sub_epi32(_mm_xor_si128(level1, sign1), sign1);
         level1 = _mm_cvtepi16_epi32(_mm_packs_epi32(level1, level1));
         _mm_storeu_si128((__m128i*)(qCoef + dstOffset), level1);
 
+
         __m128i level2 = _mm_loadu_si128((__m128i*)(coef + blockpos + 4));
         __m128i sign2 = _mm_cmplt_epi32(level2, _mm_setzero_si128());
 
@@ -3940,6 +3953,13 @@
         __m128i deltaU2 = _mm_srai_epi32(_mm_sub_epi32(tmplevel2, _mm_slli_epi32(level2, qBits)), qBits8);
         _mm_storeu_si128((__m128i*)(deltaU + blockpos + 4), deltaU2);
         acSum4 = _mm_add_epi32(acSum4, level2);
+
+        maskZero = _mm_cmpeq_epi32(level2, _mm_setzero_si128());
+        //lastPos4 = _mm_or_si128(_mm_andnot_si128(maskZero, maskPos4), _mm_and_si128(maskZero, lastPos4));
+        //lastPos4 = _mm_blendv_epi8(maskPos4, lastPos4, maskZero);
+        lastPos4 = _mm_max_epi32(lastPos4, _mm_or_si128(maskZero, maskPos4));
+        maskPos4 = _mm_add_epi32(maskPos4, posNext4);
+
         level2 = _mm_sub_epi32(_mm_xor_si128(level2, sign2), sign2);
         level2 = _mm_cvtepi16_epi32(_mm_packs_epi32(level2, level2));
         _mm_storeu_si128((__m128i*)(qCoef + dstOffset + 4), level2);
@@ -3950,6 +3970,11 @@
     acSum4 = _mm_hadd_epi32(acSum4, acSum4);
     acSum  = _mm_cvtsi128_si32(acSum4);
 
+    lastPos4 = _mm_max_epi32(lastPos4, _mm_shuffle_epi32(lastPos4, 0x0E));
+    lastPos4 = _mm_max_epi32(lastPos4, _mm_shuffle_epi32(lastPos4, 0x01));
+    int tmp = _mm_cvtsi128_si32(lastPos4);
+    *lastPos = tmp;
+
     return acSum;
 }
 }
diff -r 681ab201ea0c -r 4be95d676094 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Fri Aug 16 18:49:19 2013 +0800
+++ b/source/test/mbdstharness.cpp	Fri Aug 16 18:49:44 2013 +0800
@@ -238,6 +238,13 @@
 {
     int j = 0;
 
+    // Refill both input buffers (coef and quantCoeff) with random values masked by PIXEL_MAX so quant is not fed invalid Q values
+    for (int i = 0; i < mb_t_size; i++)
+    {
+        mintbuf1[i] = rand() & PIXEL_MAX;
+        mintbuf2[i] = rand() & PIXEL_MAX;
+    }
+
     for (int i = 0; i <= 5; i++)
     {
         int width = (rand() % 4 + 1) * 4;
@@ -255,9 +262,10 @@
         int valueToAdd = rand() % (32 * 1024);
         int cmp_size = sizeof(int) * height * width;
         int numCoeff = height * width;
+        int optLastPos = -1, refLastPos = -1;
 
-        refReturnValue = ref(mintbuf1 + j, mintbuf2 + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
-        optReturnValue = opt(mintbuf1 + j, mintbuf2 + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
+        refReturnValue = ref(mintbuf1 + j, mintbuf2 + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff, &refLastPos);
+        optReturnValue = opt(mintbuf1 + j, mintbuf2 + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff, &optLastPos);
 
         if (memcmp(mintbuf3, mintbuf5, cmp_size))
             return false;
@@ -268,6 +276,9 @@
         if (optReturnValue != refReturnValue)
             return false;
 
+        if (optLastPos != refLastPos)
+            return false;
+
         j += 16;
 
 #if _DEBUG
@@ -357,6 +368,7 @@
     if (opt.quant)
     {
         printf("quant\t\t");
-        REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
+        int dummy = -1;
+        REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
     }
 }