[x265-commits] [x265] weightp: non-trivial constructors and destructors should not be in headers

Steve Borho steve at borho.org
Fri Feb 7 02:49:39 CET 2014


details:   http://hg.videolan.org/x265/rev/d87b6e92c996
branches:  
changeset: 6038:d87b6e92c996
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 18:35:28 2014 -0600
description:
weightp: non-trivial constructors and destructors should not be in headers

Subject: [x265] weightp: do not blindly assume 4:2:0 chroma dimensions

details:   http://hg.videolan.org/x265/rev/9bc4b7b1454e
branches:  
changeset: 6039:9bc4b7b1454e
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 18:36:02 2014 -0600
description:
weightp: do not blindly assume 4:2:0 chroma dimensions

Subject: [x265] weightp: don't use m_ prefix for non member variable

details:   http://hg.videolan.org/x265/rev/8f025ee0a506
branches:  
changeset: 6040:8f025ee0a506
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 18:47:57 2014 -0600
description:
weightp: don't use m_ prefix for non member variable

Subject: [x265] nit

details:   http://hg.videolan.org/x265/rev/1776b9a58585
branches:  
changeset: 6041:1776b9a58585
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 18:48:08 2014 -0600
description:
nit

Subject: [x265] Merge

details:   http://hg.videolan.org/x265/rev/21d808d834c4
branches:  
changeset: 6042:21d808d834c4
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 19:24:41 2014 -0600
description:
Merge

Subject: [x265] weightp: remove useless m_dstStride variable

details:   http://hg.videolan.org/x265/rev/c54271b906da
branches:  
changeset: 6043:c54271b906da
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 18:59:19 2014 -0600
description:
weightp: remove useless m_dstStride variable

diffstat:

 source/Lib/TLibCommon/TComPicYuv.h   |     2 +-
 source/Lib/TLibEncoder/TEncCu.cpp    |     4 -
 source/common/common.cpp             |     2 +-
 source/common/threadpool.cpp         |   135 +-
 source/common/vec/intra-ssse3.cpp    |  1245 -------------
 source/common/x86/asm-primitives.cpp |    61 +-
 source/common/x86/intrapred.h        |    14 +
 source/common/x86/intrapred8.asm     |  3130 ++++++++++++++++++++++++++++++++-
 source/common/x86/pixel-a.asm        |    70 +-
 source/common/x86/sad16-a.asm        |    14 +-
 source/encoder/compress.cpp          |     2 -
 source/encoder/slicetype.cpp         |    42 +-
 source/encoder/weightPrediction.cpp  |    54 +-
 source/encoder/weightPrediction.h    |    38 +-
 source/test/pixelharness.cpp         |   252 +-
 source/test/pixelharness.h           |     6 +-
 16 files changed, 3484 insertions(+), 1587 deletions(-)

diffs (truncated from 5720 to 300 lines):

diff -r fc90c9b265fd -r c54271b906da source/Lib/TLibCommon/TComPicYuv.h
--- a/source/Lib/TLibCommon/TComPicYuv.h	Wed Feb 05 18:20:41 2014 -0600
+++ b/source/Lib/TLibCommon/TComPicYuv.h	Thu Feb 06 18:59:19 2014 -0600
@@ -166,7 +166,7 @@ public:
     void  copyFromPicture(const x265_picture&, int32_t *pad);
 }; // END CLASS DEFINITION TComPicYuv
 
-void updateChecksum(const Pel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, uint32_t stride, int row, uint32_t cu_Height);
+void updateChecksum(const Pel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, uint32_t stride, int row, uint32_t cuHeight);
 void updateCRC(const Pel* plane, uint32_t& crcVal, uint32_t height, uint32_t width, uint32_t stride);
 void crcFinish(uint32_t & crc, UChar digest[16]);
 void checksumFinish(uint32_t & checksum, UChar digest[16]);
diff -r fc90c9b265fd -r c54271b906da source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Wed Feb 05 18:20:41 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Thu Feb 06 18:59:19 2014 -0600
@@ -1395,8 +1395,6 @@ void TEncCu::xCheckRDCostIntra(TComDataC
 
     m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC, true);
 
-    m_tmpRecoYuv[depth]->copyToPicLuma(outTempCU->getPic()->getPicYuvRec(), outTempCU->getAddr(), outTempCU->getZorderIdxInCU());
-
     m_search->estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC);
 
     m_entropyCoder->resetBits();
@@ -1444,8 +1442,6 @@ void TEncCu::xCheckRDCostIntraInInter(TC
     m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth],
                              preCalcDistC, bSeparateLumaChroma);
 
-    m_tmpRecoYuv[depth]->copyToPicLuma(outTempCU->getPic()->getPicYuvRec(), outTempCU->getAddr(), outTempCU->getZorderIdxInCU());
-
     m_search->estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC);
 
     m_entropyCoder->resetBits();
diff -r fc90c9b265fd -r c54271b906da source/common/common.cpp
--- a/source/common/common.cpp	Wed Feb 05 18:20:41 2014 -0600
+++ b/source/common/common.cpp	Thu Feb 06 18:59:19 2014 -0600
@@ -532,7 +532,7 @@ int x265_set_globals(x265_param *param)
 
     static int once /* = 0 */;
 
-    if (ATOMIC_CAS(&once, 0, 1) == 1)
+    if (ATOMIC_CAS32(&once, 0, 1) == 1)
     {
         if (param->maxCUSize != g_maxCUWidth)
         {
diff -r fc90c9b265fd -r c54271b906da source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Wed Feb 05 18:20:41 2014 -0600
+++ b/source/common/threadpool.cpp	Thu Feb 06 18:59:19 2014 -0600
@@ -78,12 +78,8 @@ public:
     virtual ~PoolThread() {}
 
     void threadMain();
-
-    static volatile uint64_t s_sleepMap;
 };
 
-volatile uint64_t PoolThread::s_sleepMap /* = 0 */;
-
 class ThreadPoolImpl : public ThreadPool
 {
 private:
@@ -91,7 +87,9 @@ private:
     bool         m_ok;
     int          m_referenceCount;
     int          m_numThreads;
+    int          m_numSleepMapWords;
     PoolThread  *m_threads;
+    volatile uint64_t *m_sleepMap;
 
     /* Lock for write access to the provider lists.  Threads are
      * always allowed to read m_firstProvider and follow the
@@ -119,6 +117,10 @@ public:
         return this;
     }
 
+    void markThreadAsleep(int id);
+
+    void waitForAllIdle();
+
     int getThreadCount() const { return m_numThreads; }
 
     void release();
@@ -166,8 +168,7 @@ void PoolThread::threadMain()
 
         if (cur == NULL)
         {
-            uint64_t bit = 1LL << m_id;
-            ATOMIC_OR(&s_sleepMap, bit);
+            m_pool.markThreadAsleep(m_id);
             m_wakeEvent.wait();
         }
     }
@@ -175,19 +176,34 @@ void PoolThread::threadMain()
     m_exited = true;
 }
 
+void ThreadPoolImpl::markThreadAsleep(int id)
+{
+    int word = id >> 6;
+    uint64_t bit = 1LL << (id & 63);
+    ATOMIC_OR(&m_sleepMap[word], bit);
+}
+
 void ThreadPoolImpl::pokeIdleThread()
 {
-    /* Find a bit in the sleeping thread bitmap and poke it awake */
-    uint64_t oldval = PoolThread::s_sleepMap;
+    /* Find a bit in the sleeping thread bitmap and poke it awake, do
+     * not give up until a thread is awakened or all of them are awake */
+    for (int i = 0; i < m_numSleepMapWords; i++)
+    {
+        uint64_t oldval = m_sleepMap[i];
+        while (oldval)
+        {
+            unsigned long id;
+            CTZ64(id, oldval);
 
-    if (oldval)
-    {
-        unsigned long id;
-        CTZ64(id, oldval);
+            uint64_t newval = oldval & ~(1LL << id);
+            if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval)
+            {
+                m_threads[(i << 6) | id].poke();
+                return;
+            }
 
-        uint64_t newval = oldval & ~(1LL << id);
-        if (ATOMIC_CAS(&PoolThread::s_sleepMap, oldval, newval) == oldval)
-            m_threads[id].poke();
+            oldval = m_sleepMap[i];
+        }
     }
 }
 
@@ -228,71 +244,80 @@ ThreadPoolImpl::ThreadPoolImpl(int numTh
 {
     if (numThreads == 0)
         numThreads = get_cpu_count();
-    numThreads = X265_MIN(64, numThreads); // do not overflow sleep map
+    m_numSleepMapWords = (numThreads + 63) >> 6;
+    m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
 
-    char *buffer = new char[sizeof(PoolThread) * numThreads];
+    char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
     m_threads = reinterpret_cast<PoolThread*>(buffer);
     m_numThreads = numThreads;
 
-    if (m_threads)
+    if (m_threads && m_sleepMap)
     {
-        uint64_t idlemap = 0;
+        for (int i = 0; i < m_numSleepMapWords; i++)
+            m_sleepMap[i] = 0;
 
         m_ok = true;
-        for (int i = 0; i < numThreads; i++)
+        int i;
+        for (i = 0; i < numThreads; i++)
         {
             new (buffer)PoolThread(*this, i);
             buffer += sizeof(PoolThread);
-            m_ok = m_ok && m_threads[i].start();
-            idlemap |= (1LL << i);
+            if (!m_threads[i].start())
+            {
+                m_ok = false;
+                break;
+            }
         }
 
-        // Wait for threads to spin up and idle
-        while (PoolThread::s_sleepMap != idlemap)
+        if (m_ok)
+        {
+            waitForAllIdle();
+        }
+        else
+        {
+            // stop threads that did start up
+            for (int j = 0; j < i; j++)
+            {
+                m_threads[j].poke();
+                m_threads[j].stop();
+            }
+        }
+    }
+}
+
+void ThreadPoolImpl::waitForAllIdle()
+{
+    if (!m_ok)
+        return;
+
+    int id = 0;
+    do
+    {
+        int word = id >> 6;
+        uint64_t bit = 1LL << (id & 63);
+        if (m_sleepMap[word] & bit)
+        {
+            id++;
+        }
+        else
         {
             GIVE_UP_TIME();
         }
     }
+    while (id < m_numThreads);
 }
 
 void ThreadPoolImpl::Stop()
 {
     if (m_ok)
     {
-        uint64_t idlemap = 0;
-        for (int i = 0; i < m_numThreads; i++)
-        {
-            idlemap |= (1LL << i);
-        }
-
-        // wait for all threads to idle
-        while (PoolThread::s_sleepMap != idlemap)
-        {
-            GIVE_UP_TIME();
-        }
+        waitForAllIdle();
 
         // set invalid flag, then wake them up so they exit their main func
         m_ok = false;
         for (int i = 0; i < m_numThreads; i++)
         {
-            pokeIdleThread();
-        }
-
-        int exited_count = 0;
-        do
-        {
-            GIVE_UP_TIME();
-            exited_count = 0;
-            for (int i = 0; i < m_numThreads; i++)
-            {
-                exited_count += m_threads[i].isExited() ? 1 : 0;
-            }
-        }
-        while (exited_count < m_numThreads);
-
-        // join each thread to cleanup resources
-        for (int i = 0; i < m_numThreads; i++)
-        {
+            m_threads[i].poke();
             m_threads[i].stop();
         }
     }
@@ -300,6 +325,8 @@ void ThreadPoolImpl::Stop()
 
 ThreadPoolImpl::~ThreadPoolImpl()
 {
+    X265_FREE((void*)m_sleepMap);
+
     if (m_threads)
     {
         // cleanup thread handles
@@ -308,7 +335,7 @@ ThreadPoolImpl::~ThreadPoolImpl()
             m_threads[i].~PoolThread();
         }
 
-        delete[] reinterpret_cast<char*>(m_threads);
+        X265_FREE(reinterpret_cast<char*>(m_threads));
     }
 }
 
diff -r fc90c9b265fd -r c54271b906da source/common/vec/intra-ssse3.cpp
--- a/source/common/vec/intra-ssse3.cpp	Wed Feb 05 18:20:41 2014 -0600
+++ b/source/common/vec/intra-ssse3.cpp	Thu Feb 06 18:59:19 2014 -0600
@@ -557,1249 +557,6 @@ void intraPredAng16x16(pixel* dst, intpt
 #undef MB4
 #undef CALC_BLND_8ROWS
 
-//32x32
-#define PREDANG_CALCROW_VER(X) \
-    v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-    v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
-    row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
-    row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row11L); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11L = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row11H); \


More information about the x265-commits mailing list