<div dir="ltr"><div class="gmail_default" style="font-family:georgia,serif;color:#000000">Please ignore this patch; I messed up an update. I will resend it soon. Thanks.</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Nov 27, 2017 at 5:11 PM,  <span dir="ltr"><<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Praveen Tiwari <<a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a>><br>
# Date 1511167656 -19800<br>
#      Mon Nov 20 14:17:36 2017 +0530<br>
# Node ID dffb056e5ad0e2298b0dd65d048f4f<wbr>16d8508566<br>
# Parent  b24454f3ff6de650aab6835e291837<wbr>fc4e2a4466<br>
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization<br>
<br>
This particular section of code appears to be a bottleneck in many profiles, as it<br>
involves 64-bit multiplication operations. For SIMD optimization we need to convert<br>
a few buffers/variables to double.<br>
<br>
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp<br>
--- a/source/common/dct.cpp     Wed Nov 22 22:00:48 2017 +0530<br>
+++ b/source/common/dct.cpp     Mon Nov 20 14:17:36 2017 +0530<br>
@@ -984,6 +984,32 @@<br>
     return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);<br>
 }<br>
<br>
+void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize)<br>
+{<br>
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */<br>
+    const int scaleBits = SCALE_BITS - 2 * transformShift;<br>
+    const uint32_t trSize = 1 << log2TrSize;<br>
+    int max = X265_MAX(0, (2 * transformShift + 1));<br>
+<br>
+    for (int y = 0; y < MLS_CG_SIZE; y++)<br>
+    {<br>
+        for (int x = 0; x < MLS_CG_SIZE; x++)<br>
+        {<br>
+            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */<br>
+            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/<br>
+<br>
+            costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits);<br>
+<br>
+            /* when no residual coefficient is coded, predicted coef == recon coef */<br>
+            costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max);<br>
+<br>
+            *totalUncodedCost += costUncoded[blkPos + x];<br>
+            *totalRdCost += costUncoded[blkPos + x];<br>
+        }<br>
+        blkPos += trSize;<br>
+    }<br>
+}<br>
+<br>
 namespace X265_NS {<br>
 // x265 private namespace<br>
<br>
@@ -993,6 +1019,7 @@<br>
     p.dequant_normal = dequant_normal_c;<br>
     p.quant = quant_c;<br>
     p.nquant = nquant_c;<br>
+    p.rdoQuant = rdoQuant_c;<br>
     p.dst4x4 = dst4_c;<br>
     <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_4x4].dct   = dct4_c;<br>
     <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_8x8].dct   = dct8_c;<br>
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h<br>
--- a/source/common/primitives.h        Wed Nov 22 22:00:48 2017 +0530<br>
+++ b/source/common/primitives.h        Mon Nov 20 14:17:36 2017 +0530<br>
@@ -216,6 +216,7 @@<br>
<br>
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);<br>
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);<br>
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);<br>
<br>
 /* Function pointers to optimized encoder primitives. Each pointer can reference<br>
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */<br>
@@ -304,6 +305,7 @@<br>
<br>
     quant_t               quant;<br>
     nquant_t              nquant;<br>
+    rdoQuant_t            rdoQuant;<br>
     dequant_scaling_t     dequant_scaling;<br>
     dequant_normal_t      dequant_normal;<br>
     denoiseDct_t          denoiseDct;<br>
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp<br>
--- a/source/common/quant.cpp   Wed Nov 22 22:00:48 2017 +0530<br>
+++ b/source/common/quant.cpp   Mon Nov 20 14:17:36 2017 +0530<br>
@@ -663,7 +663,7 @@<br>
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))<br>
<br>
     int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */<br>
-    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */<br>
+    double costUncoded[trSize * trSize]; /* d*d + lambda * 0    */<br>
     int64_t costSig[trSize * trSize];     /* lambda * bits       */<br>
<br>
     int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */<br>
@@ -677,12 +677,12 @@<br>
     bool bIsLuma = ttype == TEXT_LUMA;<br>
<br>
     /* total rate distortion cost of transform block, as CBF=0 */<br>
-    int64_t totalUncodedCost = 0;<br>
+    double totalUncodedCost = 0;<br>
<br>
     /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,<br>
      * the distortion and signal cost of coded blocks, and the coding cost of significant<br>
      * coefficient and coefficient group bitmaps */<br>
-    int64_t totalRdCost = 0;<br>
+    double totalRdCost = 0;<br>
<br>
     TUEntropyCodingParameters codeParams;<br>
     cu.<wbr>getTUEntropyCodingParameters(<wbr>codeParams, absPartIdx, log2TrSize, bIsLuma);<br>
@@ -729,24 +729,9 @@<br>
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);<br>
             uint32_t blkPos      = codeParams.scan[scanPosBase];<br>
<br>
-            // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA<br>
-            for (int y = 0; y < MLS_CG_SIZE; y++)<br>
-            {<br>
-                for (int x = 0; x < MLS_CG_SIZE; x++)<br>
-                {<br>
-                    int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */<br>
-                    int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/<br>
+            // PSYVALUE needs 64-bit multiplication; a few buffers/variables were converted to double so this is expected to run faster with SIMD<br>
+            primitives.rdoQuant(m_<wbr>resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);<br>
<br>
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
-<br>
-                    /* when no residual coefficient is coded, predicted coef == recon coef */<br>
-                    costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);<br>
-<br>
-                    totalUncodedCost += costUncoded[blkPos + x];<br>
-                    totalRdCost += costUncoded[blkPos + x];<br>
-                }<br>
-                blkPos += trSize;<br>
-            }<br>
         }<br>
     }<br>
     else<br>
@@ -764,7 +749,7 @@<br>
                 for (int x = 0; x < MLS_CG_SIZE; x++)<br>
                 {<br>
                     int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */<br>
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+                    costUncoded[blkPos + x] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
<br>
                     totalUncodedCost += costUncoded[blkPos + x];<br>
                     totalRdCost += costUncoded[blkPos + x];<br>
@@ -844,7 +829,7 @@<br>
                         int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */<br>
                         int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/<br>
<br>
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+                        costUncoded[blkPos + x] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
<br>
                         /* when no residual coefficient is coded, predicted coef == recon coef */<br>
                         costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);<br>
@@ -858,7 +843,7 @@<br>
                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.<wbr>firstSignificanceMapContext), "sigCtx check failure\n");<br>
<br>
                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
-                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];<br>
+                        costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(<wbr>costUncoded[blkPos + x]);<br>
                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
                     }<br>
                     blkPos += trSize;<br>
@@ -872,7 +857,7 @@<br>
                     for (int x = 0; x < MLS_CG_SIZE; x++)<br>
                     {<br>
                         int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */<br>
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+                        costUncoded[blkPos + x] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
<br>
                         totalUncodedCost += costUncoded[blkPos + x];<br>
                         totalRdCost += costUncoded[blkPos + x];<br>
@@ -883,7 +868,7 @@<br>
                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.<wbr>firstSignificanceMapContext), "sigCtx check failure\n");<br>
<br>
                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
-                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];<br>
+                        costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(<wbr>costUncoded[blkPos + x]);<br>
                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
                     }<br>
                     blkPos += trSize;<br>
@@ -922,7 +907,7 @@<br>
              * FIX15 nature of the CABAC cost tables minus the forward transform scale */<br>
<br>
             /* cost of not coding this coefficient (all distortion, no signal bits) */<br>
-            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;<br>
+            costUncoded[blkPos] = static_cast<double>(((int64_t)<wbr>signCoef * signCoef) << scaleBits);<br>
             X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");<br>
             if (usePsyMask & scanPos)<br>
                 /* when no residual coefficient is coded, predicted coef == recon coef */<br>
@@ -956,7 +941,7 @@<br>
                 // fast zero coeff path<br>
                 /* set default costs to uncoded costs */<br>
                 costSig[scanPos] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
-                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];<br>
+                costCoeff[scanPos] = static_cast<int64_t>(<wbr>costUncoded[blkPos] + costSig[scanPos]);<br>
                 sigRateDelta[blkPos] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
                 totalRdCost += costCoeff[scanPos];<br>
                 rateIncUp[blkPos] = greaterOneBits[0];<br>
@@ -991,7 +976,7 @@<br>
                     {<br>
                         /* set default costs to uncoded costs */<br>
                         costSig[scanPos] = SIGCOST(estBitsSbac.<wbr>significantBits[0][ctxSig]);<br>
-                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];<br>
+                        costCoeff[scanPos] = static_cast<int64_t>(<wbr>costUncoded[blkPos] + costSig[scanPos]);<br>
                     }<br>
                     sigRateDelta[blkPos] = estBitsSbac.significantBits[1]<wbr>[ctxSig] - estBitsSbac.significantBits[0]<wbr>[ctxSig];<br>
                     sigCoefBits = estBitsSbac.significantBits[1]<wbr>[ctxSig];<br>
@@ -1138,7 +1123,7 @@<br>
                 {<br>
                     sigCoeffGroupFlag64 |= cgBlkPosMask;<br>
                     cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];<br>
-                    cgRdStats.uncodedDist += costUncoded[blkPos];<br>
+                    cgRdStats.uncodedDist += static_cast<int64_t>(<wbr>costUncoded[blkPos]);<br>
                     cgRdStats.nnzBeforePos0 += scanPosinCG;<br>
                 }<br>
             }<br>
@@ -1174,7 +1159,7 @@<br>
<br>
             uint32_t sigCtx = getSigCoeffGroupCtxInc(<wbr>sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);<br>
<br>
-            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.<wbr>significantCoeffGroupBits[<wbr>sigCtx][0]);<br>
+            int64_t costZeroCG = static_cast<int64_t>(<wbr>totalRdCost + SIGCOST(estBitsSbac.<wbr>significantCoeffGroupBits[<wbr>sigCtx][0]));<br>
             costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */<br>
             costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */<br>
             costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */<br>
@@ -1185,7 +1170,7 @@<br>
             if (costZeroCG < totalRdCost && m_rdoqLevel > 1)<br>
             {<br>
                 sigCoeffGroupFlag64 &= ~cgBlkPosMask;<br>
-                totalRdCost = costZeroCG;<br>
+                totalRdCost = static_cast<double>(<wbr>costZeroCG);<br>
                 costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.<wbr>significantCoeffGroupBits[<wbr>sigCtx][0]);<br>
<br>
                 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */<br>
@@ -1212,14 +1197,14 @@<br>
     int64_t bestCost;<br>
     if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])<br>
     {<br>
-        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockRootCbpBits[0]);<br>
-        totalRdCost += SIGCOST(estBitsSbac.<wbr>blockRootCbpBits[1]);<br>
+        bestCost = static_cast<int64_t>(<wbr>totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockRootCbpBits[0]));<br>
+        totalRdCost += static_cast<double>((SIGCOST(<wbr>estBitsSbac.blockRootCbpBits[<wbr>1])));<br>
     }<br>
     else<br>
     {<br>
         int ctx = ctxCbf[ttype][cu.m_tuDepth[<wbr>absPartIdx]];<br>
-        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockCbpBits[ctx][0]);<br>
-        totalRdCost += SIGCOST(estBitsSbac.<wbr>blockCbpBits[ctx][1]);<br>
+        bestCost = static_cast<int64_t>(<wbr>totalUncodedCost + SIGCOST(estBitsSbac.<wbr>blockCbpBits[ctx][0]));<br>
+        totalRdCost += static_cast<double>(SIGCOST(<wbr>estBitsSbac.blockCbpBits[ctx][<wbr>1]));<br>
     }<br>
<br>
     /* This loop starts with the last non-zero found in the first loop and then refines this last<br>
@@ -1277,7 +1262,7 @@<br>
                     bitsLastNZ += IEP_RATE * suffixLen;<br>
                 }<br>
<br>
-                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);<br>
+                int64_t costAsLast = static_cast<int64_t>(<wbr>totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ));<br>
<br>
                 if (costAsLast < bestCost)<br>
                 {<br>
</blockquote></div><br></div>