[x265] [PATCH] quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Nov 28 08:41:23 CET 2017
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1511849580 -19800
# Tue Nov 28 11:43:00 2017 +0530
# Node ID 4d242c555d14ca8214d9da89cef41c4418af4dca
# Parent dfd4951a93744f3d732cb4645abd2fd87eded750
quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization
This particular section of code appears to be a bottleneck in many profiles, as it
involves 64-bit multiplication operations. For SIMD optimization we need to convert
a few buffers/variables to double.
diff -r dfd4951a9374 -r 4d242c555d14 source/common/dct.cpp
--- a/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/dct.cpp Tue Nov 28 11:43:00 2017 +0530
@@ -1010,6 +1010,26 @@
}
}
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, double *costUncoded, double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t log2TrSize)
+{ /* For each visited coefficient: store its scaled square in costUncoded[] and add that value to *totalUncodedCost and *totalRdCost */
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+ const int scaleBits = SCALE_BITS - 2 * transformShift; /* left-shift applied to every squared coefficient below */
+ const uint32_t trSize = 1 << log2TrSize; /* amount blkPos advances after each outer-loop iteration */
+ /* Visit MLS_CG_SIZE x MLS_CG_SIZE coefficients, starting at index blkPos */
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); /* square in 64-bit, scale, then convert to double */
+ /* The per-coefficient cost just stored is accumulated into both caller-owned totals */
+ *totalUncodedCost += costUncoded[blkPos + x];
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize; /* presumably steps to the next row of a row-major, trSize-wide block -- confirm against caller */
+ }
+}
+
namespace X265_NS {
// x265 private namespace
void setupDCTPrimitives_c(EncoderPrimitives& p)
@@ -1019,6 +1039,7 @@
p.quant = quant_c;
p.nquant = nquant_c;
p.rdoQuant = rdoQuant_c;
+ p.nonPsyRdoQuant = nonPsyRdoQuant_c;
p.dst4x4 = dst4_c;
p.cu[BLOCK_4x4].dct = dct4_c;
p.cu[BLOCK_8x8].dct = dct8_c;
diff -r dfd4951a9374 -r 4d242c555d14 source/common/primitives.h
--- a/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/primitives.h Tue Nov 28 11:43:00 2017 +0530
@@ -216,6 +216,8 @@
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
+typedef void (*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, double *costUncoded, double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t log2TrSize); /* same parameter list as rdoQuant_t without m_fencDctCoeff and psyScale */
+
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -303,6 +305,7 @@
quant_t quant;
nquant_t nquant;
rdoQuant_t rdoQuant;
+ nonPsyRdoQuant_t nonPsyRdoQuant;
dequant_scaling_t dequant_scaling;
dequant_normal_t dequant_normal;
denoiseDct_t denoiseDct;
diff -r dfd4951a9374 -r 4d242c555d14 source/common/quant.cpp
--- a/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/quant.cpp Tue Nov 28 11:43:00 2017 +0530
@@ -737,17 +737,7 @@
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
- for (int y = 0; y < MLS_CG_SIZE; y++)
- {
- for (int x = 0; x < MLS_CG_SIZE; x++)
- {
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
- }
- blkPos += trSize;
- }
+ primitives.nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos, log2TrSize);
}
}
More information about the x265-devel
mailing list