[x265] [PATCH 1 of 2] threadpool-fix: utilize all available NUMA nodes for threadpool distribution for windows system,
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Mon May 23 12:19:49 CEST 2016
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1463997405 -19800
# Mon May 23 15:26:45 2016 +0530
# Node ID 2f8a373347649f29953ca9f434eec329e1339aca
# Parent 4723933fdec920debefe606d50a9a312f7bc7f6b
threadpool-fix: utilize all available NUMA nodes for threadpool distribution for windows system,
linux threadpool configuration info, match with windows -> clean logic
diff -r 4723933fdec9 -r 2f8a37334764 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp Fri May 13 09:32:11 2016 +0530
+++ b/source/common/threadpool.cpp Mon May 23 15:26:45 2016 +0530
@@ -28,6 +28,10 @@
#include <new>
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+#include <winnt.h>
+#endif
+
#if X86_64
#ifdef __GNUC__
@@ -64,6 +68,19 @@
# define strcasecmp _stricmp
#endif
+const uint64_t m1 = 0x5555555555555555; //binary: 0101...
+const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ...
+const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
+
+int popCount(uint64_t x) // returns the number of set bits in x, using the m1/m2/m3/h01 masks above
+{
+ x -= (x >> 1) & m1; // each 2-bit field now holds the count of bits set in that field
+ x = (x & m2) + ((x >> 2) & m2); // add neighbouring 2-bit counts into 4-bit fields
+ x = (x + (x >> 4)) & m3; // add neighbouring 4-bit counts; each byte now holds its own bit count
+ return (x * h01) >> 56; // the multiply accumulates all byte counts into the top byte; the shift extracts it
+}
+
namespace X265_NS {
// x265 private namespace
@@ -238,7 +255,6 @@
memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
- int cpuCount = getCpuCount();
bool bNumaSupport = false;
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
@@ -248,26 +264,54 @@
#endif
- for (int i = 0; i < cpuCount; i++)
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+ PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+ for (int i = 0; i < numNumaNodes; i++)
{
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
- UCHAR node;
- if (GetNumaProcessorNode((UCHAR)i, &node))
- cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
- else
+ GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+ cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
+ }
+ delete groupAffinityPointer;
#elif HAVE_LIBNUMA
- if (bNumaSupport >= 0)
- cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
- else
+ if (bNumaSupport >= 0)
+ {
+ for (int i = 0; i < numNumaNodes; i++)
+ {
+ struct bitmask* bitMask = numa_allocate_cpumask();
+ int ret = numa_node_to_cpus(i, bitMask);
+ if (!ret)
+ cpusPerNode[i] = numa_num_possible_cpus();
+ else
+ x265_log(p, X265_LOG_ERROR, "Failed to genrate CPU mask\n");
+ numa_free_cpumask(bitMask);
+ }
+ }
+#else // NUMA not supported
+ cpusPerNode[0] = getCpuCount();
#endif
- cpusPerNode[0]++;
- }
if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
- for (int i = 0; i < numNumaNodes; i++)
- x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
-
- /* limit threads based on param->numaPools */
+ for (int i = 0; i < numNumaNodes; i++)
+ x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
+ /* limit threads based on param->numaPools
+ * For windows because threads can't be allocated to live across sockets
+ * changing the default behavior to be per-socket pools -- FIXME */
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+ if (!p->numaPools)
+ {
+ char poolString[50] = "";
+ for (int i = 0; i < numNumaNodes; i++)
+ {
+ char nextCount[10] = "";
+ if (i)
+ sprintf(nextCount, ",%d", cpusPerNode[i]);
+ else
+ sprintf(nextCount, "%d", cpusPerNode[i]);
+ strcat(poolString, nextCount);
+ }
+ x265_param_parse(p, "pools", poolString);
+ }
+#endif
if (p->numaPools && *p->numaPools)
{
const char *nodeStr = p->numaPools;
@@ -389,16 +433,15 @@
X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
- m_winCpuMask = 0x0;
- GROUP_AFFINITY groupAffinity;
+ memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
for (int i = 0; i < getNumaNodeCount(); i++)
{
int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
if (numaNode != -1)
- if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
- m_winCpuMask |= groupAffinity.Mask;
+ if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &m_groupAffinity))
+ break;
}
- m_numaMask = &m_winCpuMask;
+ m_numaMask = &m_groupAffinity.Mask;
#elif HAVE_LIBNUMA
if (numa_available() >= 0)
{
@@ -480,11 +523,16 @@
setThreadNodeAffinity(m_numaMask);
}
-/* static */
void ThreadPool::setThreadNodeAffinity(void *numaMask)
{
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
- if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
+ UNREFERENCED_PARAMETER(numaMask);
+ GROUP_AFFINITY groupAffinity;
+ memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
+ groupAffinity.Group = m_groupAffinity.Group;
+ groupAffinity.Mask = m_groupAffinity.Mask;
+ const PGROUP_AFFINITY affinityPointer = &groupAffinity;
+ if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL))
return;
else
x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
@@ -525,9 +573,17 @@
int ThreadPool::getCpuCount()
{
#if _WIN32
- SYSTEM_INFO sysinfo;
- GetSystemInfo(&sysinfo);
- return sysinfo.dwNumberOfProcessors;
+ enum { MAX_NODE_NUM = 127 };
+ int cpus = 0;
+ int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
+ PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+ for (int i = 0; i < numNumaNodes; i++)
+ {
+ GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+ cpus += popCount(groupAffinityPointer->Mask);
+ }
+ delete groupAffinityPointer;
+ return cpus;
#elif __unix__ && X265_ARCH_ARM
/* Return the number of processors configured by OS. Because, most embedded linux distributions
* uses only one processor as the scheduler doesn't have enough work to utilize all processors */
diff -r 4723933fdec9 -r 2f8a37334764 source/common/threadpool.h
--- a/source/common/threadpool.h Fri May 13 09:32:11 2016 +0530
+++ b/source/common/threadpool.h Mon May 23 15:26:45 2016 +0530
@@ -85,7 +85,7 @@
int m_numWorkers;
void* m_numaMask; // node mask in linux, cpu mask in windows
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
- DWORD_PTR m_winCpuMask;
+ GROUP_AFFINITY m_groupAffinity;
#endif
bool m_isActive;
@@ -99,6 +99,7 @@
bool start();
void stopWorkers();
void setCurrentThreadAffinity();
+ void setThreadNodeAffinity(void *numaMask);
int tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
int tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
@@ -106,7 +107,6 @@
static int getCpuCount();
static int getNumaNodeCount();
- static void setThreadNodeAffinity(void *numaMask);
};
/* Any worker thread may enlist the help of idle worker threads from the same
More information about the x265-devel
mailing list