<div dir="ltr"><div class="gmail_default" style="font-family:georgia,serif;color:#000000">Please ignore this patch; I messed up an update. I will resend it soon. Thanks.</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Nov 27, 2017 at 5:11 PM, <span dir="ltr"><<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Praveen Tiwari <<a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a>><br>
# Date 1511167656 -19800<br>
# Mon Nov 20 14:17:36 2017 +0530<br>
# Node ID dffb056e5ad0e2298b0dd65d048f4f<wbr>16d8508566<br>
# Parent b24454f3ff6de650aab6835e291837<wbr>fc4e2a4466<br>
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization<br>
<br>
This particular section of code appears to be a bottleneck in many profiles, as it<br>
involves 64-bit multiplication operations. For SIMD optimization we need to convert<br>
a few buffers/variables to double.<br>
<br>
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp<br>
--- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530<br>
+++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530<br>
@@ -984,6 +984,32 @@<br>
return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);<br>
}<br>
<br>
+void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize)<br>
+{<br>
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */<br>
+ const int scaleBits = SCALE_BITS - 2 * transformShift;<br>
+ const uint32_t trSize = 1 << log2TrSize;<br>
+ int max = X265_MAX(0, (2 * transformShift + 1));<br>
+<br>
+ for (int y = 0; y < MLS_CG_SIZE; y++)<br>
+ {<br>
+ for (int x = 0; x < MLS_CG_SIZE; x++)<br>
+ {<br>
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */<br>
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/<br>
+<br>
+ costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits);<br>
+<br>
+ /* when no residual coefficient is coded, predicted coef == recon coef */<br>
+ costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max);<br>
+<br>
+ *totalUncodedCost += costUncoded[blkPos + x];<br>
+ *totalRdCost += costUncoded[blkPos + x];<br>
+ }<br>
+ blkPos += trSize;<br>
+ }<br>
+}<br>
+<br>
namespace X265_NS {<br>
// x265 private namespace<br>
<br>
@@ -993,6 +1019,7 @@<br>
p.dequant_normal = dequant_normal_c;<br>
p.quant = quant_c;<br>
p.nquant = nquant_c;<br>
+ p.rdoQuant = rdoQuant_c;<br>
p.dst4x4 = dst4_c;<br>
<a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_4x4].dct = dct4_c;<br>
<a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_8x8].dct = dct8_c;<br>
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h<br>
--- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530<br>
+++ b/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530<br>
@@ -216,6 +216,7 @@<br>
<br>
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);<br>
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);<br>
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);<br>
<br>
/* Function pointers to optimized encoder primitives. Each pointer can reference<br>
* either an assembly routine, a SIMD intrinsic primitive, or a C function */<br>
@@ -304,6 +305,7 @@<br>
<br>
quant_t quant;<br>
nquant_t nquant;<br>
+ rdoQuant_t rdoQuant;<br>
dequant_scaling_t dequant_scaling;<br>
dequant_normal_t dequant_normal;<br>
denoiseDct_t denoiseDct;<br>
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp<br>
--- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530<br>
+++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530<br>
@@ -663,7 +663,7 @@<br>
#define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))<br>
<br>
int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */<br>
- int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */<br>
+ double costUncoded[trSize * trSize]; /* d*d + lambda * 0 */<br>
int64_t costSig[trSize * trSize]; /* lambda * bits */<br>
<br>
int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */<br>
@@ -677,12 +677,12 @@<br>
bool bIsLuma = ttype == TEXT_LUMA;<br>
<br>
/* total rate distortion cost of transform block, as CBF=0 */<br>
- int64_t totalUncodedCost = 0;<br>
+ double totalUncodedCost = 0;<br>
<br>
/* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,<br>
* the distortion and signal cost of coded blocks, and the coding cost of significant<br>
* coefficient and coefficient group bitmaps */<br>
- int64_t totalRdCost = 0;<br>
+ double totalRdCost = 0;<br>
<br>
TUEntropyCodingParameters codeParams;<br>
cu.<wbr>getTUEntropyCodingParameters(<wbr>codeParams, absPartIdx, log2TrSize, bIsLuma);<br>
@@ -729,24 +729,9 @@<br>
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);<br>
uint32_t blkPos = codeParams.scan[scanPosBase];<br>
<br>
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA<br>
- for (int y = 0; y < MLS_CG_SIZE; y++)<br>
- {<br>
- for (int x = 0; x < MLS_CG_SIZE; x++)<br>
- {<br>
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */<br>
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/<br>
+ // PSYVALUE needs 64-bit multiplication; a few buffers/variables have been converted to double, which is expected to run faster with SIMD<br>
+ primitives.rdoQuant(m_<wbr>resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);<br>
<br>
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
-<br>
- /* when no residual coefficient is coded, predicted coef == recon coef */<br>
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);<br>
-<br>
- totalUncodedCost += costUncoded[blkPos + x];<br>
- totalRdCost += costUncoded[blkPos + x];<br>
- }<br>
- blkPos += trSize;<br>
- }<br>
}<br>
}<br>
else<br>
@@ -764,7 +749,7 @@<br>
for (int x = 0; x < MLS_CG_SIZE; x++)<br>
{<br>
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */<br>
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
<br>
totalUncodedCost += costUncoded[blkPos + x];<br>
totalRdCost += costUncoded[blkPos + x];<br>
@@ -844,7 +829,7 @@<br>
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */<br>
int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/<br>
<br>
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
<br>
/* when no residual coefficient is coded, predicted coef == recon coef */<br>
costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);<br>
@@ -858,7 +843,7 @@<br>
X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.<wbr>firstSignificanceMapContext), "sigCtx check failure\n");<br>
<br>
costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
- costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];<br>
+ costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(<wbr>costUncoded[blkPos + x]);<br>
sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
}<br>
blkPos += trSize;<br>
@@ -872,7 +857,7 @@<br>
for (int x = 0; x < MLS_CG_SIZE; x++)<br>
{<br>
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */<br>
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
<br>
totalUncodedCost += costUncoded[blkPos + x];<br>
totalRdCost += costUncoded[blkPos + x];<br>
@@ -883,7 +868,7 @@<br>
X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.<wbr>firstSignificanceMapContext), "sigCtx check failure\n");<br>
<br>
costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
- costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];<br>
+ costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(<wbr>costUncoded[blkPos + x]);<br>
sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
}<br>
blkPos += trSize;<br>
@@ -922,7 +907,7 @@<br>
* FIX15 nature of the CABAC cost tables minus the forward transform scale */<br>
<br>
/* cost of not coding this coefficient (all distortion, no signal bits) */<br>
- costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+ costUncoded[blkPos] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");<br>
if (usePsyMask & scanPos)<br>
/* when no residual coefficient is coded, predicted coef == recon coef */<br>
@@ -956,7 +941,7 @@<br>
// fast zero coeff path<br>
/* set default costs to uncoded costs */<br>
costSig[scanPos] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
- costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];<br>
+ costCoeff[scanPos] = static_cast<int64_t>(<wbr>costUncoded[blkPos] + costSig[scanPos]);<br>
sigRateDelta[blkPos] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
totalRdCost += costCoeff[scanPos];<br>
rateIncUp[blkPos] = greaterOneBits[0];<br>
@@ -991,7 +976,7 @@<br>
{<br>
/* set default costs to uncoded costs */<br>
costSig[scanPos] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
- costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];<br>
+ costCoeff[scanPos] = static_cast<int64_t>(<wbr>costUncoded[blkPos] + costSig[scanPos]);<br>
}<br>
sigRateDelta[blkPos] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
sigCoefBits = estBitsSbac.significantBits[1]<wbr>[ctxSig];<br>
@@ -1138,7 +1123,7 @@<br>
{<br>
sigCoeffGroupFlag64 |= cgBlkPosMask;<br>
cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];<br>
- cgRdStats.uncodedDist += costUncoded[blkPos];<br>
+ cgRdStats.uncodedDist += static_cast<int64_t>(<wbr>costUncoded[blkPos]);<br>
cgRdStats.nnzBeforePos0 += scanPosinCG;<br>
}<br>
}<br>
@@ -1174,7 +1159,7 @@<br>
<br>
uint32_t sigCtx = getSigCoeffGroupCtxInc(<wbr>sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);<br>
<br>
- int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.<wbr>significantCoeffGroupBits[<wbr>sigCtx][0]);<br>
+ int64_t costZeroCG = static_cast<int64_t>(<wbr>totalRdCost + SIGCOST(estBitsSbac.<wbr>significantCoeffGroupBits[<wbr>sigCtx][0]));<br>
costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */<br>
costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */<br>
costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */<br>
@@ -1185,7 +1170,7 @@<br>
if (costZeroCG < totalRdCost && m_rdoqLevel > 1)<br>
{<br>
sigCoeffGroupFlag64 &= ~cgBlkPosMask;<br>
- totalRdCost = costZeroCG;<br>
+ totalRdCost = static_cast<double>(<wbr>costZeroCG);<br>
costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.<wbr>significantCoeffGroupBits[<wbr>sigCtx][0]);<br>
<br>
/* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */<br>
@@ -1212,14 +1197,14 @@<br>
int64_t bestCost;<br>
if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])<br>
{<br>
- bestCost = totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockRootCbpBits[0]);<br>
- totalRdCost += SIGCOST(estBitsSbac.<wbr>blockRootCbpBits[1]);<br>
+ bestCost = static_cast<int64_t>(<wbr>totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockRootCbpBits[0]));<br>
+ totalRdCost += static_cast<double>((SIGCOST(<wbr>estBitsSbac.blockRootCbpBits[<wbr>1])));<br>
}<br>
else<br>
{<br>
int ctx = ctxCbf[ttype][cu.m_tuDepth[<wbr>absPartIdx]];<br>
- bestCost = totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockCbpBits[ctx][0]);<br>
- totalRdCost += SIGCOST(estBitsSbac.<wbr>blockCbpBits[ctx][1]);<br>
+ bestCost = static_cast<int64_t>(<wbr>totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockCbpBits[ctx][0]));<br>
+ totalRdCost += static_cast<double>(SIGCOST(<wbr>estBitsSbac.blockCbpBits[ctx][<wbr>1]));<br>
}<br>
<br>
/* This loop starts with the last non-zero found in the first loop and then refines this last<br>
@@ -1277,7 +1262,7 @@<br>
bitsLastNZ += IEP_RATE * suffixLen;<br>
}<br>
<br>
- int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);<br>
+ int64_t costAsLast = static_cast<int64_t>(<wbr>totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ));<br>
<br>
if (costAsLast < bestCost)<br>
{<br>
</blockquote></div><br></div>