[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Mon Nov 27 12:41:41 CET 2017
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1511167656 -19800
# Mon Nov 20 14:17:36 2017 +0530
# Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566
# Parent b24454f3ff6de650aab6835e291837fc4e2a4466
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
This particular section of code appears to be a bottleneck in many profiles, as it
involves 64-bit multiplication operations. For SIMD optimization we need to convert
a few buffers/variables to double.
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp
--- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
@@ -984,6 +984,32 @@
return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
}
+void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize)
+{ /* For the MLS_CG_SIZE x MLS_CG_SIZE group starting at blkPos: stores each coefficient's uncoded cost (scaled squared residual minus the psy term) in costUncoded and adds it to both *totalUncodedCost and *totalRdCost */
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+ const int scaleBits = SCALE_BITS - 2 * transformShift; /* left shift applied to the squared residual coefficient */
+ const uint32_t trSize = 1 << log2TrSize; /* row stride used to index the coefficient and cost buffers */
+ int max = X265_MAX(0, (2 * transformShift + 1)); /* right shift for the psy term, clamped so it is never negative */
+
+ for (int y = 0; y < MLS_CG_SIZE; y++) /* y only counts rows; blkPos is advanced at the end of each row */
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+
+ costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits); /* square and shift are done in int64_t, then converted to double */
+
+ /* when no residual coefficient is coded, predicted coef == recon coef */
+ costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max); /* same expression form as the PSYVALUE macro in quant.cpp */
+
+ *totalUncodedCost += costUncoded[blkPos + x]; /* both totals receive the same per-coefficient cost */
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize; /* move to the same column of the next row */
+ }
+}
+
namespace X265_NS {
// x265 private namespace
@@ -993,6 +1019,7 @@
p.dequant_normal = dequant_normal_c;
p.quant = quant_c;
p.nquant = nquant_c;
+ p.rdoQuant = rdoQuant_c;
p.dst4x4 = dst4_c;
p.cu[BLOCK_4x4].dct = dct4_c;
p.cu[BLOCK_8x8].dct = dct8_c;
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h
--- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530
@@ -216,6 +216,7 @@
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -304,6 +305,7 @@
quant_t quant;
nquant_t nquant;
+ rdoQuant_t rdoQuant;
dequant_scaling_t dequant_scaling;
dequant_normal_t dequant_normal;
denoiseDct_t denoiseDct;
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp
--- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530
@@ -663,7 +663,7 @@
#define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */
- int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */
+ double costUncoded[trSize * trSize]; /* d*d + lambda * 0 */
int64_t costSig[trSize * trSize]; /* lambda * bits */
int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */
@@ -677,12 +677,12 @@
bool bIsLuma = ttype == TEXT_LUMA;
/* total rate distortion cost of transform block, as CBF=0 */
- int64_t totalUncodedCost = 0;
+ double totalUncodedCost = 0;
/* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
* the distortion and signal cost of coded blocks, and the coding cost of significant
* coefficient and coefficient group bitmaps */
- int64_t totalRdCost = 0;
+ double totalRdCost = 0;
TUEntropyCodingParameters codeParams;
cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
@@ -729,24 +729,9 @@
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
- for (int y = 0; y < MLS_CG_SIZE; y++)
- {
- for (int x = 0; x < MLS_CG_SIZE; x++)
- {
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+ // PSYVALUE need 64-bits multiplication, we have converted few buffers/variables to double, expected to work faster by SIMD
+ primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
- /* when no residual coefficient is coded, predicted coef == recon coef */
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
- }
- blkPos += trSize;
- }
}
}
else
@@ -764,7 +749,7 @@
for (int x = 0; x < MLS_CG_SIZE; x++)
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
totalUncodedCost += costUncoded[blkPos + x];
totalRdCost += costUncoded[blkPos + x];
@@ -844,7 +829,7 @@
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
/* when no residual coefficient is coded, predicted coef == recon coef */
costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
@@ -858,7 +843,7 @@
X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+ costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
}
blkPos += trSize;
@@ -872,7 +857,7 @@
for (int x = 0; x < MLS_CG_SIZE; x++)
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
totalUncodedCost += costUncoded[blkPos + x];
totalRdCost += costUncoded[blkPos + x];
@@ -883,7 +868,7 @@
X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+ costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
}
blkPos += trSize;
@@ -922,7 +907,7 @@
* FIX15 nature of the CABAC cost tables minus the forward transform scale */
/* cost of not coding this coefficient (all distortion, no signal bits) */
- costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
+ costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
if (usePsyMask & scanPos)
/* when no residual coefficient is coded, predicted coef == recon coef */
@@ -956,7 +941,7 @@
// fast zero coeff path
/* set default costs to uncoded costs */
costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+ costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
totalRdCost += costCoeff[scanPos];
rateIncUp[blkPos] = greaterOneBits[0];
@@ -991,7 +976,7 @@
{
/* set default costs to uncoded costs */
costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+ costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
}
sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
@@ -1138,7 +1123,7 @@
{
sigCoeffGroupFlag64 |= cgBlkPosMask;
cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
- cgRdStats.uncodedDist += costUncoded[blkPos];
+ cgRdStats.uncodedDist += static_cast<int64_t>(costUncoded[blkPos]);
cgRdStats.nnzBeforePos0 += scanPosinCG;
}
}
@@ -1174,7 +1159,7 @@
uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
- int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
+ int64_t costZeroCG = static_cast<int64_t>(totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]));
costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */
costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */
@@ -1185,7 +1170,7 @@
if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
{
sigCoeffGroupFlag64 &= ~cgBlkPosMask;
- totalRdCost = costZeroCG;
+ totalRdCost = static_cast<double>(costZeroCG);
costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
/* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
@@ -1212,14 +1197,14 @@
int64_t bestCost;
if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
{
- bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
- totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
+ bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]));
+ totalRdCost += static_cast<double>((SIGCOST(estBitsSbac.blockRootCbpBits[1])));
}
else
{
int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
- bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
- totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
+ bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]));
+ totalRdCost += static_cast<double>(SIGCOST(estBitsSbac.blockCbpBits[ctx][1]));
}
/* This loop starts with the last non-zero found in the first loop and then refines this last
@@ -1277,7 +1262,7 @@
bitsLastNZ += IEP_RATE * suffixLen;
}
- int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
+ int64_t costAsLast = static_cast<int64_t>(totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ));
if (costAsLast < bestCost)
{
More information about the x265-devel
mailing list