[x265-commits] [x265] slicetype: fix 10bpp intra pixel preparations - found by ...

Steve Borho steve at borho.org
Thu Feb 6 06:26:02 CET 2014


details:   http://hg.videolan.org/x265/rev/fc90c9b265fd
branches:  
changeset: 6028:fc90c9b265fd
user:      Steve Borho <steve at borho.org>
date:      Wed Feb 05 18:20:41 2014 -0600
description:
slicetype: fix 10bpp intra pixel preparations - found by valgrind
Subject: [x265] threadpool: use aligned malloc to allocate sleep bitmap

details:   http://hg.videolan.org/x265/rev/53b8daed7df5
branches:  
changeset: 6029:53b8daed7df5
user:      Steve Borho <steve at borho.org>
date:      Wed Feb 05 18:48:32 2014 -0600
description:
threadpool: use aligned malloc to allocate sleep bitmap

This kills three birds with one stone - it removes a source of possible un-
alignment, it removes the restriction of max 64 threads, and it further
simplifies pool start and stop
Subject: [x265] common: use ATOMIC_CAS32 to update an int

details:   http://hg.videolan.org/x265/rev/634bc0b1c246
branches:  
changeset: 6030:634bc0b1c246
user:      Steve Borho <steve at borho.org>
date:      Wed Feb 05 23:10:22 2014 -0600
description:
common: use ATOMIC_CAS32 to update an int

diffstat:

 source/common/common.cpp     |    2 +-
 source/common/threadpool.cpp |  135 +++++++++++++++++++++++++-----------------
 source/encoder/slicetype.cpp |   10 +-
 3 files changed, 88 insertions(+), 59 deletions(-)

diffs (266 lines):

diff -r 8c9e1b3564e8 -r 634bc0b1c246 source/common/common.cpp
--- a/source/common/common.cpp	Wed Feb 05 13:39:02 2014 -0600
+++ b/source/common/common.cpp	Wed Feb 05 23:10:22 2014 -0600
@@ -532,7 +532,7 @@ int x265_set_globals(x265_param *param)
 
     static int once /* = 0 */;
 
-    if (ATOMIC_CAS(&once, 0, 1) == 1)
+    if (ATOMIC_CAS32(&once, 0, 1) == 1)
     {
         if (param->maxCUSize != g_maxCUWidth)
         {
diff -r 8c9e1b3564e8 -r 634bc0b1c246 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Wed Feb 05 13:39:02 2014 -0600
+++ b/source/common/threadpool.cpp	Wed Feb 05 23:10:22 2014 -0600
@@ -78,12 +78,8 @@ public:
     virtual ~PoolThread() {}
 
     void threadMain();
-
-    static volatile uint64_t s_sleepMap;
 };
 
-volatile uint64_t PoolThread::s_sleepMap /* = 0 */;
-
 class ThreadPoolImpl : public ThreadPool
 {
 private:
@@ -91,7 +87,9 @@ private:
     bool         m_ok;
     int          m_referenceCount;
     int          m_numThreads;
+    int          m_numSleepMapWords;
     PoolThread  *m_threads;
+    volatile uint64_t *m_sleepMap;
 
     /* Lock for write access to the provider lists.  Threads are
      * always allowed to read m_firstProvider and follow the
@@ -119,6 +117,10 @@ public:
         return this;
     }
 
+    void markThreadAsleep(int id);
+
+    void waitForAllIdle();
+
     int getThreadCount() const { return m_numThreads; }
 
     void release();
@@ -166,8 +168,7 @@ void PoolThread::threadMain()
 
         if (cur == NULL)
         {
-            uint64_t bit = 1LL << m_id;
-            ATOMIC_OR(&s_sleepMap, bit);
+            m_pool.markThreadAsleep(m_id);
             m_wakeEvent.wait();
         }
     }
@@ -175,19 +176,34 @@ void PoolThread::threadMain()
     m_exited = true;
 }
 
+void ThreadPoolImpl::markThreadAsleep(int id)
+{
+    int word = id >> 6;
+    uint64_t bit = 1LL << (id & 63);
+    ATOMIC_OR(&m_sleepMap[word], bit);
+}
+
 void ThreadPoolImpl::pokeIdleThread()
 {
-    /* Find a bit in the sleeping thread bitmap and poke it awake */
-    uint64_t oldval = PoolThread::s_sleepMap;
+    /* Find a bit in the sleeping thread bitmap and poke it awake, do
+     * not give up until a thread is awakened or all of them are awake */
+    for (int i = 0; i < m_numSleepMapWords; i++)
+    {
+        uint64_t oldval = m_sleepMap[i];
+        while (oldval)
+        {
+            unsigned long id;
+            CTZ64(id, oldval);
 
-    if (oldval)
-    {
-        unsigned long id;
-        CTZ64(id, oldval);
+            uint64_t newval = oldval & ~(1LL << id);
+            if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval)
+            {
+                m_threads[(i << 6) | id].poke();
+                return;
+            }
 
-        uint64_t newval = oldval & ~(1LL << id);
-        if (ATOMIC_CAS(&PoolThread::s_sleepMap, oldval, newval) == oldval)
-            m_threads[id].poke();
+            oldval = m_sleepMap[i];
+        }
     }
 }
 
@@ -228,71 +244,80 @@ ThreadPoolImpl::ThreadPoolImpl(int numTh
 {
     if (numThreads == 0)
         numThreads = get_cpu_count();
-    numThreads = X265_MIN(64, numThreads); // do not overflow sleep map
+    m_numSleepMapWords = (numThreads + 63) >> 6;
+    m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
 
-    char *buffer = new char[sizeof(PoolThread) * numThreads];
+    char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
     m_threads = reinterpret_cast<PoolThread*>(buffer);
     m_numThreads = numThreads;
 
-    if (m_threads)
+    if (m_threads && m_sleepMap)
     {
-        uint64_t idlemap = 0;
+        for (int i = 0; i < m_numSleepMapWords; i++)
+            m_sleepMap[i] = 0;
 
         m_ok = true;
-        for (int i = 0; i < numThreads; i++)
+        int i;
+        for (i = 0; i < numThreads; i++)
         {
             new (buffer)PoolThread(*this, i);
             buffer += sizeof(PoolThread);
-            m_ok = m_ok && m_threads[i].start();
-            idlemap |= (1LL << i);
+            if (!m_threads[i].start())
+            {
+                m_ok = false;
+                break;
+            }
         }
 
-        // Wait for threads to spin up and idle
-        while (PoolThread::s_sleepMap != idlemap)
+        if (m_ok)
+        {
+            waitForAllIdle();
+        }
+        else
+        {
+            // stop threads that did start up
+            for (int j = 0; j < i; j++)
+            {
+                m_threads[j].poke();
+                m_threads[j].stop();
+            }
+        }
+    }
+}
+
+void ThreadPoolImpl::waitForAllIdle()
+{
+    if (!m_ok)
+        return;
+
+    int id = 0;
+    do
+    {
+        int word = id >> 6;
+        uint64_t bit = 1LL << (id & 63);
+        if (m_sleepMap[word] & bit)
+        {
+            id++;
+        }
+        else
         {
             GIVE_UP_TIME();
         }
     }
+    while (id < m_numThreads);
 }
 
 void ThreadPoolImpl::Stop()
 {
     if (m_ok)
     {
-        uint64_t idlemap = 0;
-        for (int i = 0; i < m_numThreads; i++)
-        {
-            idlemap |= (1LL << i);
-        }
-
-        // wait for all threads to idle
-        while (PoolThread::s_sleepMap != idlemap)
-        {
-            GIVE_UP_TIME();
-        }
+        waitForAllIdle();
 
         // set invalid flag, then wake them up so they exit their main func
         m_ok = false;
         for (int i = 0; i < m_numThreads; i++)
         {
-            pokeIdleThread();
-        }
-
-        int exited_count = 0;
-        do
-        {
-            GIVE_UP_TIME();
-            exited_count = 0;
-            for (int i = 0; i < m_numThreads; i++)
-            {
-                exited_count += m_threads[i].isExited() ? 1 : 0;
-            }
-        }
-        while (exited_count < m_numThreads);
-
-        // join each thread to cleanup resources
-        for (int i = 0; i < m_numThreads; i++)
-        {
+            m_threads[i].poke();
             m_threads[i].stop();
         }
     }
@@ -300,6 +325,8 @@ void ThreadPoolImpl::Stop()
 
 ThreadPoolImpl::~ThreadPoolImpl()
 {
+    X265_FREE((void*)m_sleepMap);
+
     if (m_threads)
     {
         // cleanup thread handles
@@ -308,7 +335,7 @@ ThreadPoolImpl::~ThreadPoolImpl()
             m_threads[i].~PoolThread();
         }
 
-        delete[] reinterpret_cast<char*>(m_threads);
+        X265_FREE(reinterpret_cast<char*>(m_threads));
     }
 }
 
diff -r 8c9e1b3564e8 -r 634bc0b1c246 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Feb 05 13:39:02 2014 -0600
+++ b/source/encoder/slicetype.cpp	Wed Feb 05 23:10:22 2014 -0600
@@ -1437,16 +1437,18 @@ void EstimateRow::estimateCUCost(Lowres 
         pixel *pix_cur = fenc->lowresPlane[0] + pelOffset;
 
         // Copy Above
-        memcpy(above0, pix_cur - 1 - fenc->lumaStride, cuSize + 1);
+        memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
 
         // Copy Left
         for (int i = 0; i < cuSize + 1; i++)
         {
             left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
         }
-
-        memset(above0 + cuSize + 1, above0[cuSize], cuSize);
-        memset(left0 + cuSize + 1, left0[cuSize], cuSize);
+        for (int i = 0; i < cuSize; i++)
+        {
+            above0[cuSize + i + 1] = above0[cuSize];
+            left0[cuSize + i + 1] = left0[cuSize];
+        }
 
         // filtering with [1 2 1]
         // assume getUseStrongIntraSmoothing() is disabled


More information about the x265-commits mailing list