[x265] [PATCH 1 of 4] optimize: rewrite TComTrQuant::xGetICRate

Min Chen chenm003 at 163.com
Sat Mar 15 02:10:59 CET 2014


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1394845741 25200
# Node ID 27c40f54ac64752f5dea816535ac02b62ba9a019
# Parent  ed48f84e541b2916313e067ad04696c4f8514a47
optimize: rewrite TComTrQuant::xGetICRate

diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp	Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.cpp	Fri Mar 14 18:09:01 2014 -0700
@@ -437,9 +437,9 @@
 const uint32_t g_groupIdx[32]   = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9 };
 
 // Rice parameters for absolute transform levels
-const uint32_t g_goRiceRange[5] = { 7, 14, 26, 46, 78 };
+const uint8_t g_goRiceRange[5] = { 7, 14, 26, 46, 78 }; // every entry is below 256, so one byte per entry suffices
 
-const uint32_t g_goRicePrefixLen[5] = { 8, 7, 6, 5, 4 };
+//const uint8_t g_goRicePrefixLen[5] = { 8, 7, 6, 5, 4 }; // retired: entry[i] + i is 8 for every i, which xGetICRate now hard-codes
 
 int g_quantTSDefault4x4[16] =
 {
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h	Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h	Fri Mar 14 18:09:01 2014 -0700
@@ -131,8 +131,8 @@
 extern const uint32_t g_groupIdx[32];
 extern const uint32_t g_minInGroup[10];
 
-extern const uint32_t g_goRiceRange[5];      //!< maximum value coded with Rice codes
-extern const uint32_t g_goRicePrefixLen[5];  //!< prefix length for each maximum value
+extern const uint8_t g_goRiceRange[5];      //!< maximum value coded with Rice codes
+//extern const uint8_t g_goRicePrefixLen[5];  //!< prefix length for each maximum value (retired: prefix length + Rice parameter is always 8)
 
 // ====================================================================================================================
 // Bit-depth
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Mar 14 18:09:01 2014 -0700
@@ -60,6 +60,11 @@
 
 #define RDOQ_CHROMA 1  ///< use of RDOQ in chroma
 
+inline static int x265_min_fast(int x, int y)
+{
+    return (x < y) ? x : y; // min(x, y); a plain compare has no signed-overflow UB in (x - y) and is typically lowered to a conditional move
+}
+
 // ====================================================================================================================
 // TComTrQuant class member functions
 // ====================================================================================================================
@@ -568,7 +573,6 @@
     uint32_t   c1Idx     = 0;
     uint32_t   c2Idx     = 0;
     int    cgLastScanPos = -1;
-    int    baseLevel;
     uint32_t cgNum = 1 << codingParameters.log2TrSizeCG * 2;
 
     int scanPos;
@@ -609,6 +613,13 @@
 
             if (lastScanPos >= 0)
             {
+                const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
+                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3;  // {1, 2, 1, 3}
+                assert(C2FLAG_NUMBER == 1);
+                assert(!!(c1Idx < C1FLAG_NUMBER) == ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)));
+                assert(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1);
+                assert(baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1));
+
                 rateIncUp[blkPos] = 0;
                 rateIncDown[blkPos] = 0;
                 deltaU[blkPos] = 0;
@@ -636,9 +647,9 @@
                 deltaU[blkPos] = (levelDouble - ((int)level << qbits)) >> (qbits - 8);
                 if (level > 0)
                 {
-                    int rateNow = xGetICRate(level, oneCtx, absCtx, goRiceParam, c1Idx, c2Idx);
-                    rateIncUp[blkPos] = xGetICRate(level + 1, oneCtx, absCtx, goRiceParam, c1Idx, c2Idx) - rateNow;
-                    rateIncDown[blkPos] = xGetICRate(level - 1, oneCtx, absCtx, goRiceParam, c1Idx, c2Idx) - rateNow;
+                    int rateNow = xGetICRate(level, level - baseLevel, oneCtx, absCtx, goRiceParam, c1c2Idx);
+                    rateIncUp[blkPos] = xGetICRate(level + 1, level + 1 - baseLevel, oneCtx, absCtx, goRiceParam, c1c2Idx) - rateNow;
+                    rateIncDown[blkPos] = xGetICRate(level - 1, level - 1 - baseLevel, oneCtx, absCtx, goRiceParam, c1c2Idx) - rateNow;
                 }
                 else // level == 0
                 {
@@ -647,7 +658,6 @@
                 dstCoeff[blkPos] = level;
                 baseCost           += costCoeff[scanPos];
 
-                baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
                 if (level >= baseLevel)
                 {
                     if (goRiceParam < 4 && level > (3 << goRiceParam))
@@ -1229,65 +1239,75 @@
 }
 
 inline int TComTrQuant::xGetICRate(uint32_t absLevel,
-                                   uint16_t ctxNumOne,
-                                   uint16_t ctxNumAbs,
-                                   uint16_t absGoRice,
-                                   uint32_t c1Idx,
-                                   uint32_t c2Idx) const
+                                    int32_t diffLevel,
+                                   uint32_t ctxNumOne,
+                                   uint32_t ctxNumAbs,
+                                   uint32_t absGoRice,
+                                   uint32_t c1c2Idx) const
 {
+    // Estimated rate of coding absLevel in the given contexts. diffLevel is absLevel - baseLevel and
+    // c1c2Idx is (c1Idx < C1FLAG_NUMBER) + 2 * (c2Idx == 0); the caller computes both once per
+    // coefficient instead of this function re-deriving them on every call.
+    assert(c1c2Idx <= 3);
+    assert(absGoRice <= 4);
+    if (absLevel == 0)
+    {
+        assert(diffLevel < 0);
+        return 0;
+    }
     int rate = 0;
-    uint32_t baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
+    const int *greaterOneBits = m_estBitsSbac->greaterOneBits[ctxNumOne];
+    const int *levelAbsBits = m_estBitsSbac->levelAbsBits[ctxNumAbs];
 
-    if (absLevel >= baseLevel)
+    if (diffLevel < 0)
     {
-        uint32_t symbol   = absLevel - baseLevel;
-        uint32_t maxVlc   = g_goRiceRange[absGoRice];
+        assert(absLevel == 1 || absLevel == 2); // zero returned above; diffLevel < 0 keeps absLevel below baseLevel (at most 3)
+        rate += greaterOneBits[(absLevel == 2)];
+
+        if (absLevel == 2)
+        {
+            rate += levelAbsBits[0];
+        }
+    }
+    else
+    {
+        uint32_t symbol   = diffLevel;
+        const uint32_t maxVlc   = g_goRiceRange[absGoRice];
         bool expGolomb = (symbol > maxVlc);
 
         if (expGolomb)
         {
             absLevel = symbol - maxVlc;
-            int egs = 1;
-            for (uint32_t max = 2; absLevel >= max; max <<= 1, egs += 2)
-            {
-            }
+
+            // CLZ32 yields the index of the highest set bit (x86 BSR), i.e. floor(log2(absLevel)).
+            // The retired loop added 2 to egs per doubling of absLevel, hence egs = 2 * index + 1.
+            // absLevel >= 1 here because symbol > maxVlc, so the scan never sees zero.
+            unsigned long size;
+            CLZ32(size, absLevel);
+            int egs = (int)size * 2 + 1;
 
             rate   += egs << 15;
-            symbol = std::min<uint32_t>(symbol, (maxVlc + 1));
+
+            // expGolomb means symbol >= maxVlc + 1, so the old min(symbol, maxVlc + 1) is always maxVlc + 1
+            assert(symbol >= maxVlc + 1);
+            symbol = maxVlc + 1;
         }
 
-        uint16_t prefLen = uint16_t(symbol >> absGoRice) + 1;
-        uint16_t numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
+        uint32_t prefLen = (symbol >> absGoRice) + 1;
+        uint32_t numBins = x265_min_fast(prefLen + absGoRice, 8/*g_goRicePrefixLen[absGoRice] + absGoRice*/);
 
         rate += numBins << 15;
 
-        if (c1Idx < C1FLAG_NUMBER)
+        if (c1c2Idx & 1) // c1Idx < C1FLAG_NUMBER
         {
-            rate += m_estBitsSbac->greaterOneBits[ctxNumOne][1];
+            rate += greaterOneBits[1];
+        }
 
-            if (c2Idx < C2FLAG_NUMBER)
-            {
-                rate += m_estBitsSbac->levelAbsBits[ctxNumAbs][1];
-            }
+        if (c1c2Idx == 3) // c1Idx < C1FLAG_NUMBER and c2Idx == 0
+        {
+            rate += levelAbsBits[1];
         }
     }
-    else if (absLevel == 0)
-    {
-        return 0;
-    }
-    else if (absLevel == 1)
-    {
-        rate += m_estBitsSbac->greaterOneBits[ctxNumOne][0];
-    }
-    else if (absLevel == 2)
-    {
-        rate += m_estBitsSbac->greaterOneBits[ctxNumOne][1];
-        rate += m_estBitsSbac->levelAbsBits[ctxNumAbs][0];
-    }
-    else
-    {
-        assert(0);
-    }
     return rate;
 }
 
diff -r ed48f84e541b -r 27c40f54ac64 source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h	Fri Mar 14 14:21:34 2014 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.h	Fri Mar 14 18:09:01 2014 -0700
@@ -199,7 +199,7 @@
 
     inline double xGetICRateCost(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
 
-    inline int    xGetICRate(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+    inline int    xGetICRate(uint32_t absLevel, int32_t diffLevel, uint32_t ctxNumOne, uint32_t ctxNumAbs, uint32_t absGoRice, uint32_t c1c2Idx) const;
 
     inline double xGetRateLast(uint32_t posx, uint32_t posy) const;
 
diff -r ed48f84e541b -r 27c40f54ac64 source/common/threading.h
--- a/source/common/threading.h	Fri Mar 14 14:21:34 2014 +0530
+++ b/source/common/threading.h	Fri Mar 14 18:09:01 2014 -0700
@@ -48,6 +48,7 @@
 #include <sys/time.h>
 #include <unistd.h>
 
+#define CLZ32(id, x)                        id = (unsigned long)__builtin_clz(x) ^ 31 // despite the name, stores the index of the highest set bit of x (31 - clz(x)); x must be non-zero
 #define CTZ64(id, x)                        id = (unsigned long)__builtin_ctzll(x)
 #define ATOMIC_OR(ptr, mask)                __sync_or_and_fetch(ptr, mask)
 #define ATOMIC_CAS(ptr, oldval, newval)     __sync_val_compare_and_swap(ptr, oldval, newval)
@@ -121,6 +122,7 @@
 #define ATOMIC_OR(ptr, mask)            InterlockedOr64((volatile LONG64*)ptr, mask)
 #endif // if _WIN32_WINNT <= _WIN32_WINNT_WINXP
 
+#define CLZ32(id, x)                        _BitScanReverse(&id, x) // despite the name, stores the index of the highest set bit of x in id; id is not meaningful when x == 0
 #define CTZ64(id, x)                        _BitScanForward64(&id, x)
 #define ATOMIC_CAS(ptr, oldval, newval)     (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
 #define ATOMIC_CAS32(ptr, oldval, newval)   (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)



More information about the x265-devel mailing list