[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

Mon Nov 27 16:01:48 CET 2017

Please ignore this patch; I messed up an update. I will resend it soon. Thanks.

On Mon, Nov 27, 2017 at 5:11 PM, <praveen at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <praveen at multicorewareinc.com>
> # Date 1511167656 -19800
> #      Mon Nov 20 14:17:36 2017 +0530
> # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566
> # Parent  b24454f3ff6de650aab6835e291837fc4e2a4466
> quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
>
> This particular section of code appears to be a bottleneck in many profiles,
> as it
> involves 64-bit multiplication operations. For SIMD optimization we need
> to convert
> a few buffers/variables to double.
>
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp
> --- a/source/common/dct.cpp     Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/dct.cpp     Mon Nov 20 14:17:36 2017 +0530
> @@ -984,6 +984,32 @@
>      return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
>  }
>
> +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double*
> costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t
> psyScale, uint32_t blkPos, uint32_t log2TrSize)
> +{
> +    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
> log2TrSize; /* Represents scaling through forward transform */
> +    const int scaleBits = SCALE_BITS - 2 * transformShift;
> +    const uint32_t trSize = 1 << log2TrSize;
> +    int max = X265_MAX(0, (2 * transformShift + 1));
> +
> +    for (int y = 0; y < MLS_CG_SIZE; y++)
> +    {
> +        for (int x = 0; x < MLS_CG_SIZE; x++)
> +        {
> +            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /*
> pre-quantization DCT coeff */
> +            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] -
> signCoef; /* predicted DCT = source DCT - residual DCT*/
> +
> +            costUncoded[blkPos + x] = static_cast<double>((signCoef *
> signCoef) << scaleBits);
> +
> +            /* when no residual coefficient is coded, predicted coef ==
> recon coef */
> +            costUncoded[blkPos + x] -= static_cast<double>((psyScale *
> (predictedCoef)) >> max);
> +
> +            *totalUncodedCost += costUncoded[blkPos + x];
> +            *totalRdCost += costUncoded[blkPos + x];
> +        }
> +        blkPos += trSize;
> +    }
> +}
> +
>  namespace X265_NS {
>  // x265 private namespace
>
> @@ -993,6 +1019,7 @@
>      p.dequant_normal = dequant_normal_c;
>      p.quant = quant_c;
>      p.nquant = nquant_c;
> +    p.rdoQuant = rdoQuant_c;
>      p.dst4x4 = dst4_c;
>      p.cu[BLOCK_4x4].dct   = dct4_c;
>      p.cu[BLOCK_8x8].dct   = dct8_c;
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h
> --- a/source/common/primitives.h        Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/primitives.h        Mon Nov 20 14:17:36 2017 +0530
> @@ -216,6 +216,7 @@
>
>  typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
>  typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
> +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t*
> m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double*
> totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
>
>  /* Function pointers to optimized encoder primitives. Each pointer can
> reference
>   * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
> @@ -304,6 +305,7 @@
>
>      quant_t               quant;
>      nquant_t              nquant;
> +    rdoQuant_t            rdoQuant;
>      dequant_scaling_t     dequant_scaling;
>      dequant_normal_t      dequant_normal;
>      denoiseDct_t          denoiseDct;
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp
> --- a/source/common/quant.cpp   Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/quant.cpp   Mon Nov 20 14:17:36 2017 +0530
> @@ -663,7 +663,7 @@
>  #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 *
> transformShift + 1)))
>
>      int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
> -    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
> +    double costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
>      int64_t costSig[trSize * trSize];     /* lambda * bits       */
>
>      int rateIncUp[trSize * trSize];      /* signal overhead of increasing
> level */
> @@ -677,12 +677,12 @@
>      bool bIsLuma = ttype == TEXT_LUMA;
>
>      /* total rate distortion cost of transform block, as CBF=0 */
> -    int64_t totalUncodedCost = 0;
> +    double totalUncodedCost = 0;
>
>      /* Total rate distortion cost of this transform block, counting te
> distortion of uncoded blocks,
>       * the distortion and signal cost of coded blocks, and the coding
> cost of significant
>       * coefficient and coefficient group bitmaps */
> -    int64_t totalRdCost = 0;
> +    double totalRdCost = 0;
>
>      TUEntropyCodingParameters codeParams;
>      cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize,
> bIsLuma);
> @@ -729,24 +729,9 @@
>              uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
>              uint32_t blkPos      = codeParams.scan[scanPosBase];
>
> -            // TODO: we can't SIMD optimize because PSYVALUE need 64-bits
> multiplication, convert to Double can work faster by FMA
> -            for (int y = 0; y < MLS_CG_SIZE; y++)
> -            {
> -                for (int x = 0; x < MLS_CG_SIZE; x++)
> -                {
> -                    int signCoef         = m_resiDctCoeff[blkPos + x];
>         /* pre-quantization DCT coeff */
> -                    int predictedCoef    = m_fencDctCoeff[blkPos + x] -
> signCoef; /* predicted DCT = source DCT - residual DCT*/
> +            // PSYVALUE need 64-bits multiplication, we have converted
> few buffers/variables to double, expected to work faster by SIMD
> +            primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff,
> costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
>
> -                    costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> -
> -                    /* when no residual coefficient is coded, predicted
> coef == recon coef */
> -                    costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
> -
> -                    totalUncodedCost += costUncoded[blkPos + x];
> -                    totalRdCost += costUncoded[blkPos + x];
> -                }
> -                blkPos += trSize;
> -            }
>          }
>      }
>      else
> @@ -764,7 +749,7 @@
>                  for (int x = 0; x < MLS_CG_SIZE; x++)
>                  {
>                      int signCoef = m_resiDctCoeff[blkPos + x];
> /* pre-quantization DCT coeff */
> -                    costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> +                    costUncoded[blkPos + x] =
> static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
>
>                      totalUncodedCost += costUncoded[blkPos + x];
>                      totalRdCost += costUncoded[blkPos + x];
> @@ -844,7 +829,7 @@
>                          int signCoef         = m_resiDctCoeff[blkPos +
> x];            /* pre-quantization DCT coeff */
>                          int predictedCoef    = m_fencDctCoeff[blkPos + x]
> - signCoef; /* predicted DCT = source DCT - residual DCT*/
>
> -                        costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> +                        costUncoded[blkPos + x] =
> static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
>
>                          /* when no residual coefficient is coded,
> predicted coef == recon coef */
>                          costUncoded[blkPos + x] -=
> PSYVALUE(predictedCoef);
> @@ -858,7 +843,7 @@
>                          X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx,
> log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma,
> codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
>
>                          costSig[scanPosBase + scanPosOffset] =
> SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
> -                        costCoeff[scanPosBase + scanPosOffset] =
> costUncoded[blkPos + x];
> +                        costCoeff[scanPosBase + scanPosOffset] =
> static_cast<int64_t>(costUncoded[blkPos + x]);
>                          sigRateDelta[blkPos + x] =
> estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0]
> [ctxSig];
>                      }
>                      blkPos += trSize;
> @@ -872,7 +857,7 @@
>                      for (int x = 0; x < MLS_CG_SIZE; x++)
>                      {
>                          int signCoef = m_resiDctCoeff[blkPos + x];
>     /* pre-quantization DCT coeff */
> -                        costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> +                        costUncoded[blkPos + x] =
> static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
>
>                          totalUncodedCost += costUncoded[blkPos + x];
>                          totalRdCost += costUncoded[blkPos + x];
> @@ -883,7 +868,7 @@
>                          X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx,
> log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma,
> codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
>
>                          costSig[scanPosBase + scanPosOffset] =
> SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
> -                        costCoeff[scanPosBase + scanPosOffset] =
> costUncoded[blkPos + x];
> +                        costCoeff[scanPosBase + scanPosOffset] =
> static_cast<int64_t>(costUncoded[blkPos + x]);
>                          sigRateDelta[blkPos + x] =
> estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0]
> [ctxSig];
>                      }
>                      blkPos += trSize;
> @@ -922,7 +907,7 @@
>               * FIX15 nature of the CABAC cost tables minus the forward
> transform scale */
>
>              /* cost of not coding this coefficient (all distortion, no
> signal bits) */
> -            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) <<
> scaleBits;
> +            costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef
> * signCoef) << scaleBits);
>              X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0
> && scanPos!=0)\n");
>              if (usePsyMask & scanPos)
>                  /* when no residual coefficient is coded, predicted coef
> == recon coef */
> @@ -956,7 +941,7 @@
>                  // fast zero coeff path
>                  /* set default costs to uncoded costs */
>                  costSig[scanPos] = SIGCOST(estBitsSbac.
> significantBits[0][ctxSig]);
> -                costCoeff[scanPos] = costUncoded[blkPos] +
> costSig[scanPos];
> +                costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos]
> + costSig[scanPos]);
>                  sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig]
> - estBitsSbac.significantBits[0][ctxSig];
>                  totalRdCost += costCoeff[scanPos];
>                  rateIncUp[blkPos] = greaterOneBits[0];
> @@ -991,7 +976,7 @@
>                      {
>                          /* set default costs to uncoded costs */
>                          costSig[scanPos] = SIGCOST(estBitsSbac.
> significantBits[0][ctxSig]);
> -                        costCoeff[scanPos] = costUncoded[blkPos] +
> costSig[scanPos];
> +                        costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos]
> + costSig[scanPos]);
>                      }
>                      sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig]
> - estBitsSbac.significantBits[0][ctxSig];
>                      sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
> @@ -1138,7 +1123,7 @@
>                  {
>                      sigCoeffGroupFlag64 |= cgBlkPosMask;
>                      cgRdStats.codedLevelAndDist += costCoeff[scanPos] -
> costSig[scanPos];
> -                    cgRdStats.uncodedDist += costUncoded[blkPos];
> +                    cgRdStats.uncodedDist += static_cast<int64_t>(
> costUncoded[blkPos]);
>                      cgRdStats.nnzBeforePos0 += scanPosinCG;
>                  }
>              }
> @@ -1174,7 +1159,7 @@
>
>              uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64,
> cgPosX, cgPosY, cgBlkPos, cgStride);
>
> -            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.
> significantCoeffGroupBits[sigCtx][0]);
> +            int64_t costZeroCG = static_cast<int64_t>(totalRdCost +
> SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]));
>              costZeroCG += cgRdStats.uncodedDist;       /* add distortion
> for resetting non-zero levels to zero levels */
>              costZeroCG -= cgRdStats.codedLevelAndDist; /* remove
> distortion and level cost of coded coefficients */
>              costZeroCG -= cgRdStats.sigCost;           /* remove
> signaling cost of significant coeff bitmap */
> @@ -1185,7 +1170,7 @@
>              if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
>              {
>                  sigCoeffGroupFlag64 &= ~cgBlkPosMask;
> -                totalRdCost = costZeroCG;
> +                totalRdCost = static_cast<double>(costZeroCG);
>                  costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.
> significantCoeffGroupBits[sigCtx][0]);
>
>                  /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
> @@ -1212,14 +1197,14 @@
>      int64_t bestCost;
>      if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
>      {
> -        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.
> blockRootCbpBits[0]);
> -        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
> +        bestCost = static_cast<int64_t>(totalUncodedCost +
> SIGCOST(estBitsSbac.blockRootCbpBits[0]));
> +        totalRdCost += static_cast<double>((SIGCOST(
> estBitsSbac.blockRootCbpBits[1])));
>      }
>      else
>      {
>          int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
> -        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.
> blockCbpBits[ctx][0]);
> -        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
> +        bestCost = static_cast<int64_t>(totalUncodedCost +
> SIGCOST(estBitsSbac.blockCbpBits[ctx][0]));
> +        totalRdCost += static_cast<double>(SIGCOST(
> estBitsSbac.blockCbpBits[ctx][1]));
>      }
>
>      /* This loop starts with the last non-zero found in the first loop
> and then refines this last
> @@ -1277,7 +1262,7 @@
>                      bitsLastNZ += IEP_RATE * suffixLen;
>                  }
>
> -                int64_t costAsLast = totalRdCost - costSig[scanPos] +
> SIGCOST(bitsLastNZ);
> +                int64_t costAsLast = static_cast<int64_t>(totalRdCost -
> costSig[scanPos] + SIGCOST(bitsLastNZ));
>
>                  if (costAsLast < bestCost)
>                  {
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20171127/80fac4ea/attachment-0001.html>