[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Nov 28 06:46:11 CET 2017
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1511167656 -19800
# Mon Nov 20 14:17:36 2017 +0530
# Node ID dfd4951a93744f3d732cb4645abd2fd87eded750
# Parent 17bb240012fe990635be621ac261bfd7c9b2d0ba
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
This particular section of code appears to be a bottleneck in many profiles, as it
involves 64-bit multiplication operations. To enable SIMD optimization, we need to convert
a few buffers/variables to double.
diff -r 17bb240012fe -r dfd4951a9374 source/common/dct.cpp
--- a/source/common/dct.cpp Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
@@ -984,15 +984,41 @@
return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
}
+static void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize) /* Fills costUncoded[] for one MLS_CG_SIZE x MLS_CG_SIZE group of coefficients starting at blkPos and adds every entry to both *totalUncodedCost and *totalRdCost */
+{
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+ const int scaleBits = SCALE_BITS - 2 * transformShift; /* left shift applied to the squared coefficient below */
+ const uint32_t trSize = 1 << log2TrSize; /* row stride: blkPos advances by this after each row */
+ int max = X265_MAX(0, (2 * transformShift + 1)); /* right shift applied to the psyScale product; clamped so it is never negative */
+
+ for (int y = 0; y < MLS_CG_SIZE; y++) /* y only counts rows; the row offset itself is carried in blkPos */
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+
+ costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits); /* square and shift in int64_t, then convert the result to double */
+
+ /* when no residual coefficient is coded, predicted coef == recon coef */
+ costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max); /* psy term: 64-bit product, shifted right by max, converted to double, then subtracted */
+
+ *totalUncodedCost += costUncoded[blkPos + x]; /* both totals are accumulated, never reset here, so the caller supplies the starting values */
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize; /* step to the same columns in the next row */
+ }
+}
+
namespace X265_NS {
// x265 private namespace
-
void setupDCTPrimitives_c(EncoderPrimitives& p)
{
p.dequant_scaling = dequant_scaling_c;
p.dequant_normal = dequant_normal_c;
p.quant = quant_c;
p.nquant = nquant_c;
+ p.rdoQuant = rdoQuant_c;
p.dst4x4 = dst4_c;
p.cu[BLOCK_4x4].dct = dct4_c;
p.cu[BLOCK_8x8].dct = dct8_c;
diff -r 17bb240012fe -r dfd4951a9374 source/common/primitives.h
--- a/source/common/primitives.h Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530
@@ -213,10 +213,9 @@
typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
-
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
-
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); /* signature of the primitive stored in EncoderPrimitives::rdoQuant; the C version is rdoQuant_c */
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -301,9 +300,9 @@
* the CU arrays */
dct_t dst4x4;
idct_t idst4x4;
-
quant_t quant;
nquant_t nquant;
+ rdoQuant_t rdoQuant;
dequant_scaling_t dequant_scaling;
dequant_normal_t dequant_normal;
denoiseDct_t denoiseDct;
diff -r 17bb240012fe -r dfd4951a9374 source/common/quant.cpp
--- a/source/common/quant.cpp Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530
@@ -661,11 +661,9 @@
#define SIGCOST(bits) ((lambda2 * (bits)) >> 8)
#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
#define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
-
int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */
- int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */
+ double costUncoded[trSize * trSize]; /* d*d + lambda * 0 */
int64_t costSig[trSize * trSize]; /* lambda * bits */
-
int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */
int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */
int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */
@@ -675,15 +673,12 @@
const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
bool bIsLuma = ttype == TEXT_LUMA;
-
/* total rate distortion cost of transform block, as CBF=0 */
- int64_t totalUncodedCost = 0;
-
+ double totalUncodedCost = 0;
/* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
* the distortion and signal cost of coded blocks, and the coding cost of significant
* coefficient and coefficient group bitmaps */
- int64_t totalRdCost = 0;
-
+ double totalRdCost = 0;
TUEntropyCodingParameters codeParams;
cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
const uint32_t log2TrSizeCG = log2TrSize - 2;
@@ -728,25 +723,8 @@
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
-
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
- for (int y = 0; y < MLS_CG_SIZE; y++)
- {
- for (int x = 0; x < MLS_CG_SIZE; x++)
- {
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
- /* when no residual coefficient is coded, predicted coef == recon coef */
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
- }
- blkPos += trSize;
- }
+ // PSYVALUE need 64-bits multiplication, we have converted few buffers/variables to double, expected to work faster by SIMD
+ primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
}
}
else
@@ -764,8 +742,7 @@
for (int x = 0; x < MLS_CG_SIZE; x++)
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
totalUncodedCost += costUncoded[blkPos + x];
totalRdCost += costUncoded[blkPos + x];
}
@@ -843,9 +820,7 @@
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
/* when no residual coefficient is coded, predicted coef == recon coef */
costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
@@ -856,9 +831,8 @@
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
X265_CHECK(trSize > 4, "trSize check failure\n");
X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
-
costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+ costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
}
blkPos += trSize;
@@ -872,8 +846,7 @@
for (int x = 0; x < MLS_CG_SIZE; x++)
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
totalUncodedCost += costUncoded[blkPos + x];
totalRdCost += costUncoded[blkPos + x];
@@ -881,9 +854,8 @@
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
X265_CHECK(trSize > 4, "trSize check failure\n");
X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
-
costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+ costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
}
blkPos += trSize;
@@ -920,9 +892,8 @@
/* RDOQ measures distortion as the squared difference between the unquantized coded level
* and the original DCT coefficient. The result is shifted scaleBits to account for the
* FIX15 nature of the CABAC cost tables minus the forward transform scale */
-
/* cost of not coding this coefficient (all distortion, no signal bits) */
- costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
+ costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
if (usePsyMask & scanPos)
/* when no residual coefficient is coded, predicted coef == recon coef */
@@ -956,7 +927,7 @@
// fast zero coeff path
/* set default costs to uncoded costs */
costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+ costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
totalRdCost += costCoeff[scanPos];
rateIncUp[blkPos] = greaterOneBits[0];
@@ -991,7 +962,7 @@
{
/* set default costs to uncoded costs */
costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
- costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+ costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
}
sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
@@ -1138,7 +1109,7 @@
{
sigCoeffGroupFlag64 |= cgBlkPosMask;
cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
- cgRdStats.uncodedDist += costUncoded[blkPos];
+ cgRdStats.uncodedDist += static_cast<int64_t>(costUncoded[blkPos]);
cgRdStats.nnzBeforePos0 += scanPosinCG;
}
}
@@ -1171,10 +1142,8 @@
/* there are coded coefficients in this group, but now we include the signaling cost
* of the significant coefficient group flag and evaluate whether the RD cost of the
* coded group is more than the RD cost of the uncoded group */
-
uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
-
- int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
+ int64_t costZeroCG = static_cast<int64_t>(totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]));
costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */
costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */
@@ -1185,9 +1154,8 @@
if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
{
sigCoeffGroupFlag64 &= ~cgBlkPosMask;
- totalRdCost = costZeroCG;
+ totalRdCost = static_cast<double>(costZeroCG);
costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
-
/* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
@@ -1212,16 +1180,15 @@
int64_t bestCost;
if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
{
- bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
- totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
+ bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]));
+ totalRdCost += static_cast<double>((SIGCOST(estBitsSbac.blockRootCbpBits[1])));
}
else
{
int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
- bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
- totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
+ bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]));
+ totalRdCost += static_cast<double>(SIGCOST(estBitsSbac.blockCbpBits[ctx][1]));
}
-
/* This loop starts with the last non-zero found in the first loop and then refines this last
* non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
* at all previous coefficients until a coefficient greater than 1 is encountered or we run out
@@ -1276,9 +1243,7 @@
bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
bitsLastNZ += IEP_RATE * suffixLen;
}
-
- int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
-
+ int64_t costAsLast = static_cast<int64_t>(totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ));
if (costAsLast < bestCost)
{
bestLastIdx = scanPos + 1;
More information about the x265-devel
mailing list