[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

Tue Nov 28 06:46:11 CET 2017

# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1511167656 -19800
#      Mon Nov 20 14:17:36 2017 +0530
# Node ID dfd4951a93744f3d732cb4645abd2fd87eded750
# Parent  17bb240012fe990635be621ac261bfd7c9b2d0ba
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

This particular section of code appears to be a bottleneck in many profiles, as it
involves 64-bit multiplication operations. For SIMD optimization we need to convert
a few buffers/variables to double.

diff -r 17bb240012fe -r dfd4951a9374 source/common/dct.cpp

--- a/source/common/dct.cpp	Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/dct.cpp	Mon Nov 20 14:17:36 2017 +0530
@@ -984,15 +984,41 @@
     return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
 }
 
+static void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize)
+{ /* C implementation assigned to the rdoQuant primitive: for each coefficient of one MLS_CG_SIZE x MLS_CG_SIZE group starting at blkPos, stores (squared coefficient << scaleBits) minus the psy term in costUncoded[] and adds that value to *totalUncodedCost and *totalRdCost */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift; /* left shift applied to the squared coefficient below */
+    const uint32_t trSize = 1 << log2TrSize; /* distance in elements between consecutive rows of the buffers (blkPos advances by this per row) */
+    int max = X265_MAX(0, (2 * transformShift + 1)); /* right shift applied to the psy term, clamped so it is never negative */
+    /* Walk the group row by row; x indexes within a row and blkPos is the index of the row's first element. */
+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        for (int x = 0; x < MLS_CG_SIZE; x++)
+        {
+            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
+            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT */
+            /* Cost of leaving this coefficient uncoded: signCoef squared, scaled in 64-bit integer math, then stored as double. */
+            costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits);
+
+            /* Subtract the psy term (psyScale * predictedCoef) >> max; when no residual coefficient is coded, predicted coef == recon coef */
+            costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max);
+            /* Both running totals receive the same per-coefficient uncoded cost. */
+            *totalUncodedCost += costUncoded[blkPos + x];
+            *totalRdCost += costUncoded[blkPos + x];
+        }
+        blkPos += trSize; /* step to the next row of the group */
+    }
+}
+
 namespace X265_NS {
 // x265 private namespace
-
 void setupDCTPrimitives_c(EncoderPrimitives& p)
 {
     p.dequant_scaling = dequant_scaling_c;
     p.dequant_normal = dequant_normal_c;
     p.quant = quant_c;
     p.nquant = nquant_c;
+    p.rdoQuant = rdoQuant_c;
     p.dst4x4 = dst4_c;
     p.cu[BLOCK_4x4].dct   = dct4_c;
     p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r 17bb240012fe -r dfd4951a9374 source/common/primitives.h
--- a/source/common/primitives.h	Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/primitives.h	Mon Nov 20 14:17:36 2017 +0530
@@ -213,10 +213,9 @@
 
 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
 typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
-
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
-
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -301,9 +300,9 @@
      * the CU arrays */
     dct_t                 dst4x4;
     idct_t                idst4x4;
-
     quant_t               quant;
     nquant_t              nquant;
+    rdoQuant_t            rdoQuant;
     dequant_scaling_t     dequant_scaling;
     dequant_normal_t      dequant_normal;
     denoiseDct_t          denoiseDct;
diff -r 17bb240012fe -r dfd4951a9374 source/common/quant.cpp
--- a/source/common/quant.cpp	Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/quant.cpp	Mon Nov 20 14:17:36 2017 +0530
@@ -661,11 +661,9 @@
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
-
     int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
-    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
+    double costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
     int64_t costSig[trSize * trSize];     /* lambda * bits       */
-
     int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
     int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
     int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
@@ -675,15 +673,12 @@
 
     const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
     bool bIsLuma = ttype == TEXT_LUMA;
-
     /* total rate distortion cost of transform block, as CBF=0 */
-    int64_t totalUncodedCost = 0;
-
+    double totalUncodedCost = 0;
     /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
      * the distortion and signal cost of coded blocks, and the coding cost of significant
      * coefficient and coefficient group bitmaps */
-    int64_t totalRdCost = 0;
-
+    double totalRdCost = 0;
     TUEntropyCodingParameters codeParams;
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
     const uint32_t log2TrSizeCG = log2TrSize - 2;
@@ -728,25 +723,8 @@
 
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
             uint32_t blkPos      = codeParams.scan[scanPosBase];
-
-            // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
-            for (int y = 0; y < MLS_CG_SIZE; y++)
-            {
-                for (int x = 0; x < MLS_CG_SIZE; x++)
-                {
-                    int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                    int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
-                    /* when no residual coefficient is coded, predicted coef == recon coef */
-                    costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
-                    totalUncodedCost += costUncoded[blkPos + x];
-                    totalRdCost += costUncoded[blkPos + x];
-                }
-                blkPos += trSize;
-            }
+            // PSYVALUE needs a 64-bit multiplication; a few buffers/variables have been converted to double so that this step is expected to run faster with SIMD
+            primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
         }
     }
     else
@@ -764,8 +742,7 @@
                 for (int x = 0; x < MLS_CG_SIZE; x++)
                 {
                     int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+                    costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
                     totalUncodedCost += costUncoded[blkPos + x];
                     totalRdCost += costUncoded[blkPos + x];
                 }
@@ -843,9 +820,7 @@
                     {
                         int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
                         int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+                        costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
                         /* when no residual coefficient is coded, predicted coef == recon coef */
                         costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
 
@@ -856,9 +831,8 @@
                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
                         X265_CHECK(trSize > 4, "trSize check failure\n");
                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
-
                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+                        costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                     }
                     blkPos += trSize;
@@ -872,8 +846,7 @@
                     for (int x = 0; x < MLS_CG_SIZE; x++)
                     {
                         int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+                        costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
                         totalUncodedCost += costUncoded[blkPos + x];
                         totalRdCost += costUncoded[blkPos + x];
 
@@ -881,9 +854,8 @@
                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
                         X265_CHECK(trSize > 4, "trSize check failure\n");
                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
-
                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+                        costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]);
                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                     }
                     blkPos += trSize;
@@ -920,9 +892,8 @@
             /* RDOQ measures distortion as the squared difference between the unquantized coded level
              * and the original DCT coefficient. The result is shifted scaleBits to account for the
              * FIX15 nature of the CABAC cost tables minus the forward transform scale */
-
             /* cost of not coding this coefficient (all distortion, no signal bits) */
-            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
+            costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
             X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
             if (usePsyMask & scanPos)
                 /* when no residual coefficient is coded, predicted coef == recon coef */
@@ -956,7 +927,7 @@
                 // fast zero coeff path
                 /* set default costs to uncoded costs */
                 costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+                costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
                 sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                 totalRdCost += costCoeff[scanPos];
                 rateIncUp[blkPos] = greaterOneBits[0];
@@ -991,7 +962,7 @@
                     {
                         /* set default costs to uncoded costs */
                         costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
-                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
+                        costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]);
                     }
                     sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
                     sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
@@ -1138,7 +1109,7 @@
                 {
                     sigCoeffGroupFlag64 |= cgBlkPosMask;
                     cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
-                    cgRdStats.uncodedDist += costUncoded[blkPos];
+                    cgRdStats.uncodedDist += static_cast<int64_t>(costUncoded[blkPos]);
                     cgRdStats.nnzBeforePos0 += scanPosinCG;
                 }
             }
@@ -1171,10 +1142,8 @@
             /* there are coded coefficients in this group, but now we include the signaling cost
              * of the significant coefficient group flag and evaluate whether the RD cost of the
              * coded group is more than the RD cost of the uncoded group */
-
             uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
-
-            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
+            int64_t costZeroCG = static_cast<int64_t>(totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]));
             costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
             costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
             costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
@@ -1185,9 +1154,8 @@
             if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
             {
                 sigCoeffGroupFlag64 &= ~cgBlkPosMask;
-                totalRdCost = costZeroCG;
+                totalRdCost = static_cast<double>(costZeroCG);
                 costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
-
                 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
                 const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
                 memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
@@ -1212,16 +1180,15 @@
     int64_t bestCost;
     if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
     {
-        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
-        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
+        bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]));
+        totalRdCost += static_cast<double>((SIGCOST(estBitsSbac.blockRootCbpBits[1])));
     }
     else
     {
         int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
-        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
-        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
+        bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]));
+        totalRdCost += static_cast<double>(SIGCOST(estBitsSbac.blockCbpBits[ctx][1]));
     }
-
     /* This loop starts with the last non-zero found in the first loop and then refines this last
      * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
      * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
@@ -1276,9 +1243,7 @@
                     bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
                     bitsLastNZ += IEP_RATE * suffixLen;
                 }
-
-                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
-
+                int64_t costAsLast = static_cast<int64_t>(totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ));
                 if (costAsLast < bestCost)
                 {
                     bestLastIdx = scanPos + 1;