[x265] [PATCH 1 of 4] optimize: rewrite TComTrQuant::xGetICRate
Min Chen
chenm003 at 163.com
Sat Mar 15 02:10:59 CET 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1394845741 25200
# Node ID 27c40f54ac64752f5dea816535ac02b62ba9a019
# Parent ed48f84e541b2916313e067ad04696c4f8514a47
optimize: rewrite TComTrQuant::xGetICRate
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.cpp Fri Mar 14 18:09:01 2014 -0700
@@ -437,9 +437,9 @@
const uint32_t g_groupIdx[32] = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9 };
// Rice parameters for absolute transform levels
-const uint32_t g_goRiceRange[5] = { 7, 14, 26, 46, 78 };
+const uint8_t g_goRiceRange[5] = { 7, 14, 26, 46, 78 };
-const uint32_t g_goRicePrefixLen[5] = { 8, 7, 6, 5, 4 };
+//const uint8_t g_goRicePrefixLen[5] = { 8, 7, 6, 5, 4 };
int g_quantTSDefault4x4[16] =
{
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h Fri Mar 14 18:09:01 2014 -0700
@@ -131,8 +131,8 @@
extern const uint32_t g_groupIdx[32];
extern const uint32_t g_minInGroup[10];
-extern const uint32_t g_goRiceRange[5]; //!< maximum value coded with Rice codes
-extern const uint32_t g_goRicePrefixLen[5]; //!< prefix length for each maximum value
+extern const uint8_t g_goRiceRange[5]; //!< maximum value coded with Rice codes
+//extern const uint8_t g_goRicePrefixLen[5]; //!< prefix length for each maximum value
// ====================================================================================================================
// Bit-depth
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Fri Mar 14 18:09:01 2014 -0700
@@ -60,6 +60,11 @@
#define RDOQ_CHROMA 1 ///< use of RDOQ in chroma
+inline static int x265_min_fast(int x, int y)
+{
+ return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // branchless min(x, y): the shift produces an all-ones mask when x < y, else 0 (relies on arithmetic >> of negative ints and on x - y not overflowing)
+}
+
// ====================================================================================================================
// TComTrQuant class member functions
// ====================================================================================================================
@@ -568,7 +573,6 @@
uint32_t c1Idx = 0;
uint32_t c2Idx = 0;
int cgLastScanPos = -1;
- int baseLevel;
uint32_t cgNum = 1 << codingParameters.log2TrSizeCG * 2;
int scanPos;
@@ -609,6 +613,13 @@
if (lastScanPos >= 0)
{
+ const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
+ const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3; // {1, 2, 1, 3}
+ assert(C2FLAG_NUMBER == 1);
+ assert(!!(c1Idx < C1FLAG_NUMBER) == ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)));
+ assert(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1);
+ assert(baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1));
+
rateIncUp[blkPos] = 0;
rateIncDown[blkPos] = 0;
deltaU[blkPos] = 0;
@@ -636,9 +647,9 @@
deltaU[blkPos] = (levelDouble - ((int)level << qbits)) >> (qbits - 8);
if (level > 0)
{
- int rateNow = xGetICRate(level, oneCtx, absCtx, goRiceParam, c1Idx, c2Idx);
- rateIncUp[blkPos] = xGetICRate(level + 1, oneCtx, absCtx, goRiceParam, c1Idx, c2Idx) - rateNow;
- rateIncDown[blkPos] = xGetICRate(level - 1, oneCtx, absCtx, goRiceParam, c1Idx, c2Idx) - rateNow;
+ int rateNow = xGetICRate(level, level - baseLevel, oneCtx, absCtx, goRiceParam, c1c2Idx);
+ rateIncUp[blkPos] = xGetICRate(level + 1, level + 1 - baseLevel, oneCtx, absCtx, goRiceParam, c1c2Idx) - rateNow;
+ rateIncDown[blkPos] = xGetICRate(level - 1, level - 1 - baseLevel, oneCtx, absCtx, goRiceParam, c1c2Idx) - rateNow;
}
else // level == 0
{
@@ -647,7 +658,6 @@
dstCoeff[blkPos] = level;
baseCost += costCoeff[scanPos];
- baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
if (level >= baseLevel)
{
if (goRiceParam < 4 && level > (3 << goRiceParam))
@@ -1229,65 +1239,75 @@
}
inline int TComTrQuant::xGetICRate(uint32_t absLevel,
- uint16_t ctxNumOne,
- uint16_t ctxNumAbs,
- uint16_t absGoRice,
- uint32_t c1Idx,
- uint32_t c2Idx) const
+ int32_t diffLevel,
+ uint32_t ctxNumOne,
+ uint32_t ctxNumAbs,
+ uint32_t absGoRice,
+ uint32_t c1c2Idx) const
{
+ assert(c1c2Idx <= 3);
+ assert(absGoRice <= 4);
+ if (absLevel == 0)
+ {
+ assert(diffLevel < 0);
+ return 0;
+ }
int rate = 0;
- uint32_t baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
+ const int *greaterOneBits = m_estBitsSbac->greaterOneBits[ctxNumOne];
+ const int *levelAbsBits = m_estBitsSbac->levelAbsBits[ctxNumAbs];
- if (absLevel >= baseLevel)
+ if (diffLevel < 0)
{
- uint32_t symbol = absLevel - baseLevel;
- uint32_t maxVlc = g_goRiceRange[absGoRice];
+ assert(absLevel >= 0 && absLevel <= 2);
+ rate += greaterOneBits[(absLevel == 2)];
+
+ if (absLevel == 2)
+ {
+ rate += levelAbsBits[0];
+ }
+ }
+ else
+ {
+ uint32_t symbol = diffLevel;
+ const uint32_t maxVlc = g_goRiceRange[absGoRice];
bool expGolomb = (symbol > maxVlc);
if (expGolomb)
{
absLevel = symbol - maxVlc;
- int egs = 1;
- for (uint32_t max = 2; absLevel >= max; max <<= 1, egs += 2)
- {
- }
+
+ // NOTE: CLZ32 stores the bit index of the highest set bit of absLevel in size (maps to the x86 BSR instruction)
+ unsigned long size;
+ CLZ32(size, absLevel);
+ int egs = size * 2 + 1;
+ //int egs = 1;
+ //for (uint32_t max = 2; absLevel >= max; max <<= 1, egs += 2)
+ //{
+ //}
+ //assert(egs == size * 2 + 1);
rate += egs << 15;
- symbol = std::min<uint32_t>(symbol, (maxVlc + 1));
+
+ // NOTE: on this path expGolomb is true, i.e. symbol >= maxVlc + 1, so min(symbol, maxVlc + 1) is always maxVlc + 1
+ assert(x265_min_fast(symbol, (maxVlc + 1)) == maxVlc + 1);
+ symbol = maxVlc + 1;
}
- uint16_t prefLen = uint16_t(symbol >> absGoRice) + 1;
- uint16_t numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
+ uint32_t prefLen = (symbol >> absGoRice) + 1;
+ uint32_t numBins = x265_min_fast(prefLen + absGoRice, 8/*g_goRicePrefixLen[absGoRice] + absGoRice*/);
rate += numBins << 15;
- if (c1Idx < C1FLAG_NUMBER)
+ if (c1c2Idx & 1)
{
- rate += m_estBitsSbac->greaterOneBits[ctxNumOne][1];
+ rate += greaterOneBits[1];
+ }
- if (c2Idx < C2FLAG_NUMBER)
- {
- rate += m_estBitsSbac->levelAbsBits[ctxNumAbs][1];
- }
+ if (c1c2Idx == 3)
+ {
+ rate += levelAbsBits[1];
}
}
- else if (absLevel == 0)
- {
- return 0;
- }
- else if (absLevel == 1)
- {
- rate += m_estBitsSbac->greaterOneBits[ctxNumOne][0];
- }
- else if (absLevel == 2)
- {
- rate += m_estBitsSbac->greaterOneBits[ctxNumOne][1];
- rate += m_estBitsSbac->levelAbsBits[ctxNumAbs][0];
- }
- else
- {
- assert(0);
- }
return rate;
}
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.h Fri Mar 14 18:09:01 2014 -0700
@@ -199,7 +199,7 @@
inline double xGetICRateCost(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
- inline int xGetICRate(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+ inline int xGetICRate(uint32_t absLevel, int32_t diffLevel, uint32_t ctxNumOne, uint32_t ctxNumAbs, uint32_t absGoRice, uint32_t c1c2Idx) const;
inline double xGetRateLast(uint32_t posx, uint32_t posy) const;
diff -r ed48f84e541b -r 27c40f54ac64 source/common/threading.h
--- a/source/common/threading.h Fri Mar 14 14:21:34 2014 +0530
+++ b/source/common/threading.h Fri Mar 14 18:09:01 2014 -0700
@@ -48,6 +48,7 @@
#include <sys/time.h>
#include <unistd.h>
+#define CLZ32(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 // despite the name, yields the bit index of the highest set bit (31 - clz); result is undefined for x == 0
#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
#define ATOMIC_OR(ptr, mask) __sync_or_and_fetch(ptr, mask)
#define ATOMIC_CAS(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
@@ -121,6 +122,7 @@
#define ATOMIC_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, mask)
#endif // if _WIN32_WINNT <= _WIN32_WINNT_WINXP
+#define CLZ32(id, x) _BitScanReverse(&id, x) // despite the name, stores the bit index of the highest set bit in id; id is not meaningful when x == 0
#define CTZ64(id, x) _BitScanForward64(&id, x)
#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
#define ATOMIC_CAS32(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
More information about the x265-devel
mailing list