[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

Mon Nov 27 12:41:41 CET 2017

# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1511167656 -19800
#      Mon Nov 20 14:17:36 2017 +0530
# Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566
# Parent  b24454f3ff6de650aab6835e291837fc4e2a4466
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

This section of code shows up as a bottleneck in many profiles because it
involves 64-bit multiplication operations. To enable SIMD optimization, we need to
convert a few buffers/variables to double.

diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp

--- a/source/common/dct.cpp	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/dct.cpp	Mon Nov 20 14:17:36 2017 +0530
@@ -984,6 +984,32 @@
     return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
 }
 
+void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize)
+{ /* Fills costUncoded[] for an MLS_CG_SIZE x MLS_CG_SIZE group of coefficients starting at blkPos and adds each cost to *totalUncodedCost and *totalRdCost */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift; /* left shift applied to the squared coefficient below */
+    const uint32_t trSize = 1 << log2TrSize; /* blkPos step between successive y iterations */
+    int max = X265_MAX(0, (2 * transformShift + 1)); /* right shift applied to the psy term, clamped to be non-negative */
+    /* walk the group: x indexes within the current blkPos offset, y advances blkPos by trSize */
+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        for (int x = 0; x < MLS_CG_SIZE; x++)
+        {
+            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
+            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT */
+
+            costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits); /* squared coefficient, scaled by scaleBits */
+
+            /* when no residual coefficient is coded, predicted coef == recon coef */
+            costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max); /* subtract the psy term computed in 64-bit integer math */
+
+            *totalUncodedCost += costUncoded[blkPos + x]; /* both running totals receive the same per-coefficient cost */
+            *totalRdCost += costUncoded[blkPos + x];
+        }
+        blkPos += trSize; /* local copy only: blkPos is passed by value */
+    }
+}
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -993,6 +1019,7 @@
     p.dequant_normal = dequant_normal_c;
     p.quant = quant_c;
     p.nquant = nquant_c;
+    p.rdoQuant = rdoQuant_c;
     p.dst4x4 = dst4_c;
     p.cu[BLOCK_4x4].dct   = dct4_c;
     p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h
--- a/source/common/primitives.h	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/primitives.h	Mon Nov 20 14:17:36 2017 +0530
@@ -216,6 +216,7 @@
 
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -304,6 +305,7 @@
 
     quant_t               quant;
     nquant_t              nquant;
+    rdoQuant_t            rdoQuant;
     dequant_scaling_t     dequant_scaling;
     dequant_normal_t      dequant_normal;
     denoiseDct_t          denoiseDct;
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/quant.cpp	Mon Nov 20 14:17:36 2017 +0530
@@ -663,7 +663,7 @@
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
 
     int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
-    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
+    double costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
     int64_t costSig[trSize * trSize];     /* lambda * bits       */
 
     int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
@@ -677,12 +677,12 @@
     bool bIsLuma = ttype == TEXT_LUMA;
 
     /* total rate distortion cost of transform block, as CBF=0 */
-    int64_t totalUncodedCost = 0;
+    double totalUncodedCost = 0;
 
     /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
      * the distortion and signal cost of coded blocks, and the coding cost of significant
      * coefficient and coefficient group bitmaps */
-    int64_t totalRdCost = 0;
+    double totalRdCost = 0;
 
     TUEntropyCodingParameters codeParams;
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
@@ -729,24 +729,9 @@
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
             uint32_t blkPos      = codeParams.scan[scanPosBase];
 
-            // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
-            for (int y = 0; y < MLS_CG_SIZE; y++)
-            {
-                for (int x = 0; x < MLS_CG_SIZE; x++)
-                {
-                    int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                    int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+            // PSYVALUE needs 64-bit multiplication; a few buffers/variables have been converted to double so that this step can be SIMD-optimized
+            primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
 
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
-                    /* when no residual coefficient is coded, predicted coef == recon coef */
-                    costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
-                    totalUncodedCost += costUncoded[blkPos + x];
-                    totalRdCost += costUncoded[blkPos + x];
-                }
-                blkPos += trSize;
-            }
         }
     }
     else
@@ -764,7 +749,7 @@
                 for (int x = 0; x < MLS_CG_SIZE; x++)
                 {
                     int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
+                    costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
 
                     totalUncodedCost += costUncoded[blkPos + x];
                     totalRdCost += costUncoded[blkPos + x];
@@ -844,7 +829,7 @@
                         int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
                         int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
 
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
+                        costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
 
                         /* when no residual coefficient is coded, predicted coef == recon coef */
                         costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
@@ -858,7 +843,7 @@
                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
 
                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+                        costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                     }
                     blkPos += trSize;
@@ -872,7 +857,7 @@
                     for (int x = 0; x < MLS_CG_SIZE; x++)
                     {
                         int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
+                        costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
 
                         totalUncodedCost += costUncoded[blkPos + x];
                         totalRdCost += costUncoded[blkPos + x];
@@ -883,7 +868,7 @@
                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
 
                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+                        costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                     }
                     blkPos += trSize;
@@ -922,7 +907,7 @@
              * FIX15 nature of the CABAC cost tables minus the forward transform scale */
 
             /* cost of not coding this coefficient (all distortion, no signal bits) */
-            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
+            costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
             X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
             if (usePsyMask & scanPos)
                 /* when no residual coefficient is coded, predicted coef == recon coef */
@@ -956,7 +941,7 @@
                 // fast zero coeff path
                 /* set default costs to uncoded costs */
                 costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+                costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
                 sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                 totalRdCost += costCoeff[scanPos];
                 rateIncUp[blkPos] = greaterOneBits[0];
@@ -991,7 +976,7 @@
                     {
                         /* set default costs to uncoded costs */
                         costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+                        costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
                     }
                     sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                     sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
@@ -1138,7 +1123,7 @@
                 {
                     sigCoeffGroupFlag64 |= cgBlkPosMask;
                     cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
-                    cgRdStats.uncodedDist += costUncoded[blkPos];
+                    cgRdStats.uncodedDist += static_cast<int64_t>(costUncoded[blkPos]);
                     cgRdStats.nnzBeforePos0 += scanPosinCG;
                 }
             }
@@ -1174,7 +1159,7 @@
 
             uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
 
-            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
+            int64_t costZeroCG = static_cast<int64_t>(totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]));
             costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
             costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
             costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
@@ -1185,7 +1170,7 @@
             if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
             {
                 sigCoeffGroupFlag64 &= ~cgBlkPosMask;
-                totalRdCost = costZeroCG;
+                totalRdCost = static_cast<double>(costZeroCG);
                 costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
 
                 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
@@ -1212,14 +1197,14 @@
     int64_t bestCost;
     if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
     {
-        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
-        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
+        bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]));
+        totalRdCost += static_cast<double>((SIGCOST(estBitsSbac.blockRootCbpBits[1])));
     }
     else
     {
         int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
-        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
-        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
+        bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]));
+        totalRdCost += static_cast<double>(SIGCOST(estBitsSbac.blockCbpBits[ctx][1]));
     }
 
     /* This loop starts with the last non-zero found in the first loop and then refines this last
@@ -1277,7 +1262,7 @@
                     bitsLastNZ += IEP_RATE * suffixLen;
                 }
 
-                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
+                int64_t costAsLast = static_cast<int64_t>(totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ));
 
                 if (costAsLast < bestCost)
                 {