[x265] [PATCH 1 of 2] threadpool-fix: utilize all available NUMA nodes for threadpool distribution on Windows systems

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon May 23 12:19:49 CEST 2016


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1463997405 -19800
#      Mon May 23 15:26:45 2016 +0530
# Node ID 2f8a373347649f29953ca9f434eec329e1339aca
# Parent  4723933fdec920debefe606d50a9a312f7bc7f6b
threadpool-fix: utilize all available NUMA nodes for threadpool distribution on Windows systems;
make the Linux threadpool configuration info match Windows -> cleaner logic

diff -r 4723933fdec9 -r 2f8a37334764 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Fri May 13 09:32:11 2016 +0530
+++ b/source/common/threadpool.cpp	Mon May 23 15:26:45 2016 +0530
@@ -28,6 +28,10 @@
 
 #include <new>
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+#include <winnt.h>
+#endif
+
 #if X86_64
 
 #ifdef __GNUC__
@@ -64,6 +68,19 @@
 # define strcasecmp _stricmp
 #endif
 
+const uint64_t m1 = 0x5555555555555555; // binary 0101...: selects the low bit of every 2-bit field
+const uint64_t m2 = 0x3333333333333333; // binary 00110011...: selects the low 2 bits of every 4-bit field
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; // binary 00001111...: selects the low 4 bits of every byte
+const uint64_t h01 = 0x0101010101010101; // 256^0 + 256^1 + ... + 256^7, i.e. every byte is 0x01
+
+int popCount(uint64_t x) // returns the number of set bits in x (0..64), without branches or loops
+{
+    x -= (x >> 1) & m1;             // each 2-bit field now holds the count of its own set bits
+    x = (x & m2) + ((x >> 2) & m2); // add adjacent 2-bit counts -> per-4-bit counts
+    x = (x + (x >> 4)) & m3;        // add adjacent 4-bit counts -> per-byte counts (0..8)
+    return (x * h01) >> 56;         // multiply sums all eight bytes into the top byte; shift it down
+}
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -238,7 +255,6 @@
     memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
 
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
-    int cpuCount = getCpuCount();
     bool bNumaSupport = false;
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
@@ -248,26 +264,54 @@
 #endif
 
 
-    for (int i = 0; i < cpuCount; i++)
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+    PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+    for (int i = 0; i < numNumaNodes; i++)
     {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-        UCHAR node;
-        if (GetNumaProcessorNode((UCHAR)i, &node))
-            cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
-        else
+        GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+        cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
+    }
+    delete groupAffinityPointer;
 #elif HAVE_LIBNUMA
-        if (bNumaSupport >= 0)
-            cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
-        else
+    if (bNumaSupport >= 0)
+    {
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            struct bitmask* bitMask = numa_allocate_cpumask();
+            int ret = numa_node_to_cpus(i, bitMask);
+            if (!ret)
+                cpusPerNode[i] = numa_num_possible_cpus();
+            else
+                x265_log(p, X265_LOG_ERROR, "Failed to genrate CPU mask\n");
+            numa_free_cpumask(bitMask);
+        }
+    }
+#else // NUMA not supported
+    cpusPerNode[0] = getCpuCount();
 #endif
-            cpusPerNode[0]++;
-    }
 
     if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
-        for (int i = 0; i < numNumaNodes; i++)
-            x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
-
-    /* limit threads based on param->numaPools */
+    for (int i = 0; i < numNumaNodes; i++)
+        x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
+    /* limit threads based on param->numaPools.
+     * On Windows, threads can't be allocated to live across sockets, so the
+     * default behavior is changed to per-socket pools -- FIXME */
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+     if (!p->numaPools)
+     {
+         char poolString[50] = "";
+         for (int i = 0; i < numNumaNodes; i++)
+         {
+             char nextCount[10] = "";
+             if (i)
+                 sprintf(nextCount, ",%d", cpusPerNode[i]);
+             else
+                   sprintf(nextCount, "%d", cpusPerNode[i]);
+             strcat(poolString, nextCount);
+         }
+         x265_param_parse(p, "pools", poolString);
+     }
+#endif
     if (p->numaPools && *p->numaPools)
     {
         const char *nodeStr = p->numaPools;
@@ -389,16 +433,15 @@
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-    m_winCpuMask = 0x0;
-    GROUP_AFFINITY groupAffinity;
+    memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
     for (int i = 0; i < getNumaNodeCount(); i++)
     {
         int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
         if (numaNode != -1)
-            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
-                m_winCpuMask |= groupAffinity.Mask;
+        if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &m_groupAffinity))
+            break;
     }
-    m_numaMask = &m_winCpuMask;
+    m_numaMask = &m_groupAffinity.Mask;
 #elif HAVE_LIBNUMA
     if (numa_available() >= 0)
     {
@@ -480,11 +523,16 @@
     setThreadNodeAffinity(m_numaMask);
 }
 
-/* static */
 void ThreadPool::setThreadNodeAffinity(void *numaMask)
 {
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-    if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
+    UNREFERENCED_PARAMETER(numaMask);
+    GROUP_AFFINITY groupAffinity;
+    memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
+    groupAffinity.Group = m_groupAffinity.Group;
+    groupAffinity.Mask = m_groupAffinity.Mask;
+    const PGROUP_AFFINITY affinityPointer = &groupAffinity;
+    if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL))
         return;
     else
         x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
@@ -525,9 +573,17 @@
 int ThreadPool::getCpuCount()
 {
 #if _WIN32
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo(&sysinfo);
-    return sysinfo.dwNumberOfProcessors;
+    enum { MAX_NODE_NUM = 127 };
+    int cpus = 0;
+    int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
+    PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+    for (int i = 0; i < numNumaNodes; i++)
+    {
+        GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+        cpus += popCount(groupAffinityPointer->Mask);
+    }
+    delete groupAffinityPointer;
+    return cpus;
 #elif __unix__ && X265_ARCH_ARM
     /* Return the number of processors configured by OS. Because, most embedded linux distributions
      * uses only one processor as the scheduler doesn't have enough work to utilize all processors */
diff -r 4723933fdec9 -r 2f8a37334764 source/common/threadpool.h
--- a/source/common/threadpool.h	Fri May 13 09:32:11 2016 +0530
+++ b/source/common/threadpool.h	Mon May 23 15:26:45 2016 +0530
@@ -85,7 +85,7 @@
     int           m_numWorkers;
     void*         m_numaMask; // node mask in linux, cpu mask in windows
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-    DWORD_PTR     m_winCpuMask;
+    GROUP_AFFINITY m_groupAffinity;
 #endif
     bool          m_isActive;
 
@@ -99,6 +99,7 @@
     bool start();
     void stopWorkers();
     void setCurrentThreadAffinity();
+    void setThreadNodeAffinity(void *numaMask);
     int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
     int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
 
@@ -106,7 +107,6 @@
 
     static int  getCpuCount();
     static int  getNumaNodeCount();
-    static void setThreadNodeAffinity(void *numaMask);
 };
 
 /* Any worker thread may enlist the help of idle worker threads from the same


More information about the x265-devel mailing list