[x265] [PATCH] quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization

Tue Nov 28 08:41:23 CET 2017

# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1511849580 -19800
#      Tue Nov 28 11:43:00 2017 +0530
# Node ID 4d242c555d14ca8214d9da89cef41c4418af4dca
# Parent  dfd4951a93744f3d732cb4645abd2fd87eded750
quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization

This particular section of code appears to be a bottleneck in many profiles, as it
involves 64-bit multiplication operations. For SIMD optimization we need to convert
a few buffers/variables to double.

diff -r dfd4951a9374 -r 4d242c555d14 source/common/dct.cpp

--- a/source/common/dct.cpp	Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/dct.cpp	Tue Nov 28 11:43:00 2017 +0530
@@ -1010,6 +1010,26 @@
     }
 }
 
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, double *costUncoded, double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t log2TrSize)
+{ /* C reference primitive: walks an MLS_CG_SIZE x MLS_CG_SIZE group of coefficients starting at blkPos, writes each scaled squared coefficient to costUncoded and adds it to both running totals */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift; /* left-shift amount applied to every squared coefficient below */
+    const uint32_t trSize = 1 << log2TrSize; /* amount blkPos advances after each inner (x) pass, i.e. one row of the block */

+    for (int y = 0; y < MLS_CG_SIZE; y++)
+    {
+        for (int x = 0; x < MLS_CG_SIZE; x++)
+        {
+            int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
+            costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); /* square is widened to 64 bits before the shift, then stored as double */

+            *totalUncodedCost += costUncoded[blkPos + x]; /* both totals are caller-owned accumulators updated in place */
+            *totalRdCost += costUncoded[blkPos + x]; /* receives the same per-coefficient value as totalUncodedCost */
+        }
+        blkPos += trSize; /* move to the same column range in the next row */
+    }
+}
+
 namespace X265_NS {
 // x265 private namespace
 void setupDCTPrimitives_c(EncoderPrimitives& p)
@@ -1019,6 +1039,7 @@
     p.quant = quant_c;
     p.nquant = nquant_c;
     p.rdoQuant = rdoQuant_c;
+    p.nonPsyRdoQuant = nonPsyRdoQuant_c;
     p.dst4x4 = dst4_c;
     p.cu[BLOCK_4x4].dct   = dct4_c;
     p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r dfd4951a9374 -r 4d242c555d14 source/common/primitives.h
--- a/source/common/primitives.h	Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/primitives.h	Tue Nov 28 11:43:00 2017 +0530
@@ -216,6 +216,8 @@
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
 typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
+typedef void (*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, double *costUncoded, double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t log2TrSize);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -303,6 +305,7 @@
     quant_t               quant;
     nquant_t              nquant;
     rdoQuant_t            rdoQuant;
+    nonPsyRdoQuant_t      nonPsyRdoQuant;
     dequant_scaling_t     dequant_scaling;
     dequant_normal_t      dequant_normal;
     denoiseDct_t          denoiseDct;
diff -r dfd4951a9374 -r 4d242c555d14 source/common/quant.cpp
--- a/source/common/quant.cpp	Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/quant.cpp	Tue Nov 28 11:43:00 2017 +0530
@@ -737,17 +737,7 @@
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
             uint32_t blkPos      = codeParams.scan[scanPosBase];
 
-            for (int y = 0; y < MLS_CG_SIZE; y++)
-            {
-                for (int x = 0; x < MLS_CG_SIZE; x++)
-                {
-                    int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                    costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
-                    totalUncodedCost += costUncoded[blkPos + x];
-                    totalRdCost += costUncoded[blkPos + x];
-                }
-                blkPos += trSize;
-            }
+            primitives.nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos, log2TrSize);
         }
     }