[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
Praveen Tiwari
praveen at multicorewareinc.com
Mon Nov 27 16:01:48 CET 2017
Please ignore this patch; I messed up an update. I will resend it soon. Thanks.
On Mon, Nov 27, 2017 at 5:11 PM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Praveen Tiwari <praveen at multicorewareinc.com>
> # Date 1511167656 -19800
> # Mon Nov 20 14:17:36 2017 +0530
> # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566
> # Parent b24454f3ff6de650aab6835e291837fc4e2a4466
> quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
>
> This particular section of code appears to be a bottleneck in many profiles,
> as it
> involves 64-bit multiplication operations. For SIMD optimization we need
> to convert
> a few buffers/variables to double.
>
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp
> --- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
> @@ -984,6 +984,32 @@
> return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
> }
>
> +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double*
> costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t
> psyScale, uint32_t blkPos, uint32_t log2TrSize)
> +{
> + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
> log2TrSize; /* Represents scaling through forward transform */
> + const int scaleBits = SCALE_BITS - 2 * transformShift;
> + const uint32_t trSize = 1 << log2TrSize;
> + int max = X265_MAX(0, (2 * transformShift + 1));
> +
> + for (int y = 0; y < MLS_CG_SIZE; y++)
> + {
> + for (int x = 0; x < MLS_CG_SIZE; x++)
> + {
> + int64_t signCoef = m_resiDctCoeff[blkPos + x]; /*
> pre-quantization DCT coeff */
> + int64_t predictedCoef = m_fencDctCoeff[blkPos + x] -
> signCoef; /* predicted DCT = source DCT - residual DCT*/
> +
> + costUncoded[blkPos + x] = static_cast<double>((signCoef *
> signCoef) << scaleBits);
> +
> + /* when no residual coefficient is coded, predicted coef ==
> recon coef */
> + costUncoded[blkPos + x] -= static_cast<double>((psyScale *
> (predictedCoef)) >> max);
> +
> + *totalUncodedCost += costUncoded[blkPos + x];
> + *totalRdCost += costUncoded[blkPos + x];
> + }
> + blkPos += trSize;
> + }
> +}
> +
> namespace X265_NS {
> // x265 private namespace
>
> @@ -993,6 +1019,7 @@
> p.dequant_normal = dequant_normal_c;
> p.quant = quant_c;
> p.nquant = nquant_c;
> + p.rdoQuant = rdoQuant_c;
> p.dst4x4 = dst4_c;
> p.cu[BLOCK_4x4].dct = dct4_c;
> p.cu[BLOCK_8x8].dct = dct8_c;
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h
> --- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530
> @@ -216,6 +216,7 @@
>
> typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
> typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
> +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t*
> m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double*
> totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
>
> /* Function pointers to optimized encoder primitives. Each pointer can
> reference
> * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
> @@ -304,6 +305,7 @@
>
> quant_t quant;
> nquant_t nquant;
> + rdoQuant_t rdoQuant;
> dequant_scaling_t dequant_scaling;
> dequant_normal_t dequant_normal;
> denoiseDct_t denoiseDct;
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp
> --- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530
> @@ -663,7 +663,7 @@
> #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 *
> transformShift + 1)))
>
> int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */
> - int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */
> + double costUncoded[trSize * trSize]; /* d*d + lambda * 0 */
> int64_t costSig[trSize * trSize]; /* lambda * bits */
>
> int rateIncUp[trSize * trSize]; /* signal overhead of increasing
> level */
> @@ -677,12 +677,12 @@
> bool bIsLuma = ttype == TEXT_LUMA;
>
> /* total rate distortion cost of transform block, as CBF=0 */
> - int64_t totalUncodedCost = 0;
> + double totalUncodedCost = 0;
>
> /* Total rate distortion cost of this transform block, counting te
> distortion of uncoded blocks,
> * the distortion and signal cost of coded blocks, and the coding
> cost of significant
> * coefficient and coefficient group bitmaps */
> - int64_t totalRdCost = 0;
> + double totalRdCost = 0;
>
> TUEntropyCodingParameters codeParams;
> cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize,
> bIsLuma);
> @@ -729,24 +729,9 @@
> uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
> uint32_t blkPos = codeParams.scan[scanPosBase];
>
> - // TODO: we can't SIMD optimize because PSYVALUE need 64-bits
> multiplication, convert to Double can work faster by FMA
> - for (int y = 0; y < MLS_CG_SIZE; y++)
> - {
> - for (int x = 0; x < MLS_CG_SIZE; x++)
> - {
> - int signCoef = m_resiDctCoeff[blkPos + x];
> /* pre-quantization DCT coeff */
> - int predictedCoef = m_fencDctCoeff[blkPos + x] -
> signCoef; /* predicted DCT = source DCT - residual DCT*/
> + // PSYVALUE need 64-bits multiplication, we have converted
> few buffers/variables to double, expected to work faster by SIMD
> + primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff,
> costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
>
> - costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> -
> - /* when no residual coefficient is coded, predicted
> coef == recon coef */
> - costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
> -
> - totalUncodedCost += costUncoded[blkPos + x];
> - totalRdCost += costUncoded[blkPos + x];
> - }
> - blkPos += trSize;
> - }
> }
> }
> else
> @@ -764,7 +749,7 @@
> for (int x = 0; x < MLS_CG_SIZE; x++)
> {
> int signCoef = m_resiDctCoeff[blkPos + x];
> /* pre-quantization DCT coeff */
> - costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> + costUncoded[blkPos + x] =
> static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
>
> totalUncodedCost += costUncoded[blkPos + x];
> totalRdCost += costUncoded[blkPos + x];
> @@ -844,7 +829,7 @@
> int signCoef = m_resiDctCoeff[blkPos +
> x]; /* pre-quantization DCT coeff */
> int predictedCoef = m_fencDctCoeff[blkPos + x]
> - signCoef; /* predicted DCT = source DCT - residual DCT*/
>
> - costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> + costUncoded[blkPos + x] =
> static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
>
> /* when no residual coefficient is coded,
> predicted coef == recon coef */
> costUncoded[blkPos + x] -=
> PSYVALUE(predictedCoef);
> @@ -858,7 +843,7 @@
> X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx,
> log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma,
> codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
>
> costSig[scanPosBase + scanPosOffset] =
> SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
> - costCoeff[scanPosBase + scanPosOffset] =
> costUncoded[blkPos + x];
> + costCoeff[scanPosBase + scanPosOffset] =
> static_cast<int64_t>(costUncoded[blkPos + x]);
> sigRateDelta[blkPos + x] =
> estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0]
> [ctxSig];
> }
> blkPos += trSize;
> @@ -872,7 +857,7 @@
> for (int x = 0; x < MLS_CG_SIZE; x++)
> {
> int signCoef = m_resiDctCoeff[blkPos + x];
> /* pre-quantization DCT coeff */
> - costUncoded[blkPos + x] = ((int64_t)signCoef *
> signCoef) << scaleBits;
> + costUncoded[blkPos + x] =
> static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
>
> totalUncodedCost += costUncoded[blkPos + x];
> totalRdCost += costUncoded[blkPos + x];
> @@ -883,7 +868,7 @@
> X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx,
> log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma,
> codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
>
> costSig[scanPosBase + scanPosOffset] =
> SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
> - costCoeff[scanPosBase + scanPosOffset] =
> costUncoded[blkPos + x];
> + costCoeff[scanPosBase + scanPosOffset] =
> static_cast<int64_t>(costUncoded[blkPos + x]);
> sigRateDelta[blkPos + x] =
> estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0]
> [ctxSig];
> }
> blkPos += trSize;
> @@ -922,7 +907,7 @@
> * FIX15 nature of the CABAC cost tables minus the forward
> transform scale */
>
> /* cost of not coding this coefficient (all distortion, no
> signal bits) */
> - costUncoded[blkPos] = ((int64_t)signCoef * signCoef) <<
> scaleBits;
> + costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef
> * signCoef) << scaleBits);
> X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0
> && scanPos!=0)\n");
> if (usePsyMask & scanPos)
> /* when no residual coefficient is coded, predicted coef
> == recon coef */
> @@ -956,7 +941,7 @@
> // fast zero coeff path
> /* set default costs to uncoded costs */
> costSig[scanPos] = SIGCOST(estBitsSbac.
> significantBits[0][ctxSig]);
> - costCoeff[scanPos] = costUncoded[blkPos] +
> costSig[scanPos];
> + costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos]
> + costSig[scanPos]);
> sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig]
> - estBitsSbac.significantBits[0][ctxSig];
> totalRdCost += costCoeff[scanPos];
> rateIncUp[blkPos] = greaterOneBits[0];
> @@ -991,7 +976,7 @@
> {
> /* set default costs to uncoded costs */
> costSig[scanPos] = SIGCOST(estBitsSbac.
> significantBits[0][ctxSig]);
> - costCoeff[scanPos] = costUncoded[blkPos] +
> costSig[scanPos];
> + costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos]
> + costSig[scanPos]);
> }
> sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig]
> - estBitsSbac.significantBits[0][ctxSig];
> sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
> @@ -1138,7 +1123,7 @@
> {
> sigCoeffGroupFlag64 |= cgBlkPosMask;
> cgRdStats.codedLevelAndDist += costCoeff[scanPos] -
> costSig[scanPos];
> - cgRdStats.uncodedDist += costUncoded[blkPos];
> + cgRdStats.uncodedDist += static_cast<int64_t>(
> costUncoded[blkPos]);
> cgRdStats.nnzBeforePos0 += scanPosinCG;
> }
> }
> @@ -1174,7 +1159,7 @@
>
> uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64,
> cgPosX, cgPosY, cgBlkPos, cgStride);
>
> - int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.
> significantCoeffGroupBits[sigCtx][0]);
> + int64_t costZeroCG = static_cast<int64_t>(totalRdCost +
> SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]));
> costZeroCG += cgRdStats.uncodedDist; /* add distortion
> for resetting non-zero levels to zero levels */
> costZeroCG -= cgRdStats.codedLevelAndDist; /* remove
> distortion and level cost of coded coefficients */
> costZeroCG -= cgRdStats.sigCost; /* remove
> signaling cost of significant coeff bitmap */
> @@ -1185,7 +1170,7 @@
> if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
> {
> sigCoeffGroupFlag64 &= ~cgBlkPosMask;
> - totalRdCost = costZeroCG;
> + totalRdCost = static_cast<double>(costZeroCG);
> costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.
> significantCoeffGroupBits[sigCtx][0]);
>
> /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
> @@ -1212,14 +1197,14 @@
> int64_t bestCost;
> if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
> {
> - bestCost = totalUncodedCost + SIGCOST(estBitsSbac.
> blockRootCbpBits[0]);
> - totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
> + bestCost = static_cast<int64_t>(totalUncodedCost +
> SIGCOST(estBitsSbac.blockRootCbpBits[0]));
> + totalRdCost += static_cast<double>((SIGCOST(
> estBitsSbac.blockRootCbpBits[1])));
> }
> else
> {
> int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
> - bestCost = totalUncodedCost + SIGCOST(estBitsSbac.
> blockCbpBits[ctx][0]);
> - totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
> + bestCost = static_cast<int64_t>(totalUncodedCost +
> SIGCOST(estBitsSbac.blockCbpBits[ctx][0]));
> + totalRdCost += static_cast<double>(SIGCOST(
> estBitsSbac.blockCbpBits[ctx][1]));
> }
>
> /* This loop starts with the last non-zero found in the first loop
> and then refines this last
> @@ -1277,7 +1262,7 @@
> bitsLastNZ += IEP_RATE * suffixLen;
> }
>
> - int64_t costAsLast = totalRdCost - costSig[scanPos] +
> SIGCOST(bitsLastNZ);
> + int64_t costAsLast = static_cast<int64_t>(totalRdCost -
> costSig[scanPos] + SIGCOST(bitsLastNZ));
>
> if (costAsLast < bestCost)
> {
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20171127/80fac4ea/attachment-0001.html>
More information about the x265-devel
mailing list