[x265] [PATCH] Performance: Don't split threads into per-NUMA pools unless specified in cli

pradeep at multicorewareinc.com
Thu Sep 3 11:12:24 CEST 2015


# HG changeset patch
# User Pradeep Ramachandran <pradeep at multicorewareinc.com>
# Date 1441271466 -19800
#      Thu Sep 03 14:41:06 2015 +0530
# Node ID 2421fae1de247cb6758c0ad2dbb4bd5f45f23727
# Parent  86e9bd7dd19278fceef65fc93a06dc8746ec9daf
Performance: Don't split threads into per-NUMA pools unless specified in cli

Fixes the thread pools to store a NUMA node mask instead of a NUMA node id so
that libnuma functionality can be used, and changes the default thread pool
behavior to use one monolithic pool.

We see gains of 10%+ on Intel Xeon E5-2666 v3 and E5-2640 v2 machines for 4K
video encoding at an ABR of 15Mbps with the slower and veryslow presets.
Ultrafast shows a perf dip of ~8% on the same machines. So when running in
ultrafast mode, explicitly specify the number of threads with the --pools
option (for example, --pools 18,18 on the E5-2666 v3, which has two sockets
of 18 threads each).
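
For illustration, an ultrafast command line that pins one 18-thread pool per
socket on such a dual-socket machine could look like the following (the input
file and bitrate below are placeholders, not from the benchmark runs above):

    x265 --input input.y4m --preset ultrafast --pools "18,18" \
         --bitrate 15000 --output out.hevc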

diff -r 86e9bd7dd192 -r 2421fae1de24 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Tue Sep 01 17:06:05 2015 +0530
+++ b/doc/reST/cli.rst	Thu Sep 03 14:41:06 2015 +0530
@@ -202,15 +202,29 @@
 	"-"       - same as "none"
 	"10"      - allocate one pool, using up to 10 cores on node 0
 	"-,+"     - allocate one pool, using all cores on node 1
-	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
+	"+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
+	"-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
 	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
+	"8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on nodes 1, 2 and 3
 
-	The total number of threads will be determined by the number of threads
-	assigned to all nodes. The worker threads will each be given affinity for
-	their node, they will not be allowed to migrate between nodes, but they
-	will be allowed to move between CPU cores within their node.
+	A thread pool dedicated to a given NUMA node is created only when the
+	number of threads to be spawned on that NUMA node is explicitly given
+	in the corresponding position of the --pools option. Otherwise, all
+	threads are spawned from a single pool. The total number of threads is
+	determined by the number of threads assigned to the enabled NUMA nodes
+	for that pool. The worker threads are given affinity to all the enabled
+	NUMA nodes for that pool and may migrate between them, unless explicitly
+	restricted as described above.
+
+	If any thread pool has more than 64 threads, that pool may be broken
+	down into multiple pools of up to 64 threads each; on 32-bit machines,
+	this limit is 32. Each resulting pool is given affinity to the same
+	NUMA nodes as the original pool. For performance reasons, the last
+	thread pool is spawned only if it has more than 32 threads on 64-bit
+	machines, or 16 on 32-bit machines. If the total number of threads in
+	the system does not obey this constraint, fewer threads than cores may
+	be spawned, which has been empirically shown to perform better.
 
 	If the four pool features: :option:`--wpp`, :option:`--pmode`,
 	:option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -219,10 +233,6 @@
 	If "none" is specified, then all four of the thread pool features are
 	implicitly disabled.
 
-	Multiple thread pools will be allocated for any NUMA node with more than
-	64 logical CPU cores. But any given thread pool will always use at most
-	one NUMA node.
-
 	Frame encoders are distributed between the available thread pools,
 	and the encoder will never generate more thread pools than
 	:option:`--frame-threads`.  The pools are used for WPP and for
@@ -238,8 +248,12 @@
 	system, a POSIX build of libx265 without libnuma will be less work
 	efficient. See :ref:`thread pools <pools>` for more detail.
 
-	Default "", one thread is allocated per detected hardware thread
-	(logical CPU cores) and one thread pool per NUMA node.
+	Default "", one pool is created across all available NUMA nodes, with
+	one thread allocated per detected hardware thread (logical CPU core).
+	If the total number of threads exceeds the widest bitmask that atomic
+	operations can handle (32 bits for 32-bit compiles, 64 bits for 64-bit
+	compiles), multiple thread pools may be spawned, subject to the
+	performance constraint described above.
 
 	Note that the string value will need to be escaped or quoted to
 	protect against shell expansion on many platforms
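
To make the accounting described above concrete, here is a small standalone
C++ sketch (illustrative only, not part of this patch; numNumaNodes and
cpusPerNode stand in for the detected topology) showing how a single pool
accumulates a thread count and a node mask with one bit per enabled NUMA node:

    #include <cstdio>

    int main()
    {
        // Stand-in topology: two 18-core nodes enabled, a third disabled.
        const int numNumaNodes = 3;
        const int cpusPerNode[3] = { 18, 18, 0 };

        unsigned nodeMask = 0; // bit i set <=> pool may run on node i
        int threads = 0;
        for (int node = 0; node < numNumaNodes; node++)
        {
            if (cpusPerNode[node]) // node contributes threads to the pool
            {
                nodeMask |= (1U << node);
                threads += cpusPerNode[node];
            }
        }
        printf("one pool: %d threads, node mask 0x%x\n", threads, nodeMask);
        return 0;
    }
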
diff -r 86e9bd7dd192 -r 2421fae1de24 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Tue Sep 01 17:06:05 2015 +0530
+++ b/source/common/threadpool.cpp	Thu Sep 03 14:41:06 2015 +0530
@@ -226,8 +226,13 @@
 {
     enum { MAX_NODE_NUM = 127 };
     int cpusPerNode[MAX_NODE_NUM + 1];
+    int threadsPerPool[MAX_NODE_NUM + 2];
+    uint32_t nodeMaskPerPool[MAX_NODE_NUM + 2];
 
     memset(cpusPerNode, 0, sizeof(cpusPerNode));
+    memset(threadsPerPool, 0, sizeof(threadsPerPool));
+    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
+
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
     int cpuCount = getCpuCount();
     bool bNumaSupport = false;
@@ -258,7 +263,7 @@
         for (int i = 0; i < numNumaNodes; i++)
             x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
 
-    /* limit nodes based on param->numaPools */
+    /* limit threads based on param->numaPools */
     if (p->numaPools && *p->numaPools)
     {
         const char *nodeStr = p->numaPools;
@@ -266,19 +271,30 @@
         {
             if (!*nodeStr)
             {
-                cpusPerNode[i] = 0;
+                threadsPerPool[i] = 0;
                 continue;
             }
             else if (*nodeStr == '-')
-                cpusPerNode[i] = 0;
+                threadsPerPool[i] = 0;
             else if (*nodeStr == '*')
+            {
+                for (int j = i; j < numNumaNodes; j++)
+                {
+                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
+                    nodeMaskPerPool[numNumaNodes] |= (1U << j);
+                }
                 break;
+            }
             else if (*nodeStr == '+')
-                ;
+            {
+                threadsPerPool[numNumaNodes] += cpusPerNode[i];
+                nodeMaskPerPool[numNumaNodes] |= (1U << i);
+            }
             else
             {
                 int count = atoi(nodeStr);
-                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
+                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
+                nodeMaskPerPool[i] = (1U << i);
             }
 
             /* consume current node string, comma, and white-space */
@@ -288,24 +304,31 @@
                ++nodeStr;
         }
     }
-
-    // In the case that numa is disabled and we have more CPUs than 64,
-    // spawn the last pool only if the # threads in that pool is > 1/2 max (heuristic)
-    if ((numNumaNodes == 1) &&
-        (cpusPerNode[0] > MAX_POOL_THREADS) &&
-        (cpusPerNode[0] % MAX_POOL_THREADS < (MAX_POOL_THREADS / 2)))
+    else
     {
-        cpusPerNode[0] -= (cpusPerNode[0] % MAX_POOL_THREADS);
-        x265_log(p, X265_LOG_DEBUG, "Creating only %d worker threads to prevent asymmetry in pools; may not use all HW contexts\n", cpusPerNode[0]);
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
+            nodeMaskPerPool[numNumaNodes] |= (1U << i);
+        }
+    }
+
+    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
+    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
+        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
+    {
+        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] % MAX_POOL_THREADS);
+        x265_log(p, X265_LOG_DEBUG,
+                 "Creating only %d worker threads in the last pool, beyond any counts given with --pools, to prevent asymmetry in pools; may not use all HW contexts\n", threadsPerPool[numNumaNodes]);
     }
 
     numPools = 0;
-    for (int i = 0; i < numNumaNodes; i++)
+    for (int i = 0; i < numNumaNodes + 1; i++)
     {
         if (bNumaSupport)
             x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
-        if (cpusPerNode[i])
-            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
+        if (threadsPerPool[i])
+            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
     }
 
     if (!numPools)
@@ -324,20 +347,20 @@
         int node = 0;
         for (int i = 0; i < numPools; i++)
         {
-            while (!cpusPerNode[node])
+            while (!threadsPerPool[node])
                 node++;
-            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
-            if (!pools[i].create(cores, maxProviders, node))
+            int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
+            if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
             {
                 X265_FREE(pools);
                 numPools = 0;
                 return NULL;
             }
             if (numNumaNodes > 1)
-                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads with NUMA node mask %x\n", i, numThreads, nodeMaskPerPool[node]);
             else
-                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
-            cpusPerNode[node] -= cores;
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
+            threadsPerPool[node] -= numThreads;
         }
     }
     else
@@ -350,11 +373,27 @@
     memset(this, 0, sizeof(*this));
 }
 
-bool ThreadPool::create(int numThreads, int maxProviders, int node)
+bool ThreadPool::create(int numThreads, int maxProviders, uint32_t nodeMask)
 {
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
-    m_numaNode = node;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+    m_winNodemask = nodeMask & ((0x1UL << getNumaNodeCount()) - 1); /* keep only bits for nodes that exist */
+    m_numaNodeMask = &m_winNodemask;
+#elif HAVE_LIBNUMA
+    if (numa_available() >= 0)
+    {
+        struct bitmask* nodemask = numa_allocate_nodemask();
+        if (nodemask)
+        {
+            *(nodemask->maskp) = nodeMask;
+            m_numaNodeMask = nodemask;
+        }
+        else
+            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node mask for %x\n", nodeMask);
+    }
+#endif
+
     m_numWorkers = numThreads;
 
     m_workers = X265_MALLOC(WorkerThread, numThreads);
@@ -408,36 +447,37 @@
 
     X265_FREE(m_workers);
     X265_FREE(m_jpTable);
+
+#if HAVE_LIBNUMA
+    if (m_numaNodeMask)
+        numa_free_nodemask((struct bitmask*)m_numaNodeMask);
+#endif
 }
 
 void ThreadPool::setCurrentThreadAffinity()
 {
-    setThreadNodeAffinity(m_numaNode);
+    setThreadNodeAffinity(m_numaNodeMask);
 }
 
 /* static */
-void ThreadPool::setThreadNodeAffinity(int numaNode)
+void ThreadPool::setThreadNodeAffinity(void *numaNodeMask)
 {
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-    GROUP_AFFINITY groupAffinity;
-    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
-    {
-        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
-            return;
-    }
-    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
+    if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)(*((DWORD*)numaNodeMask))))
+        return;
+    else
+        x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
 #elif HAVE_LIBNUMA
     if (numa_available() >= 0)
     {
-        numa_run_on_node(numaNode);
-        numa_set_preferred(numaNode);
+        numa_run_on_node_mask((struct bitmask*)numaNodeMask);
+        numa_set_interleave_mask((struct bitmask*)numaNodeMask);
         numa_set_localalloc();
         return;
     }
-    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
-#else
-    (void)numaNode;
+    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
 #endif
+    return;
 }
 
 /* static */
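
For readers unfamiliar with the libnuma mask API used above, a minimal
standalone sketch of the same call pattern follows (the helper runOnNodes is
illustrative, not x265 code; it populates the mask with numa_bitmask_setbit()
rather than writing to bitmask::maskp directly):

    #include <numa.h>    /* link with -lnuma */
    #include <stdint.h>

    /* Restrict the calling thread to the NUMA nodes set in nodeMask. */
    static void runOnNodes(uint32_t nodeMask)
    {
        if (numa_available() < 0)
            return;                      /* libnuma is not usable here */

        struct bitmask* bm = numa_allocate_nodemask();
        if (!bm)
            return;

        for (unsigned node = 0; node < 32; node++)
            if (nodeMask & (1U << node))
                numa_bitmask_setbit(bm, node);

        numa_run_on_node_mask(bm);       /* execute only on these nodes */
        numa_set_localalloc();           /* allocate memory on the local node */
        numa_free_nodemask(bm);
    }
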
diff -r 86e9bd7dd192 -r 2421fae1de24 source/common/threadpool.h
--- a/source/common/threadpool.h	Tue Sep 01 17:06:05 2015 +0530
+++ b/source/common/threadpool.h	Thu Sep 03 14:41:06 2015 +0530
@@ -83,7 +83,10 @@
     sleepbitmap_t m_sleepBitmap;
     int           m_numProviders;
     int           m_numWorkers;
-    int           m_numaNode;
+    void*         m_numaNodeMask;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+    DWORD         m_winNodemask;
+#endif
     bool          m_isActive;
 
     JobProvider** m_jpTable;
@@ -92,7 +95,7 @@
     ThreadPool();
     ~ThreadPool();
 
-    bool create(int numThreads, int maxProviders, int node);
+    bool create(int numThreads, int maxProviders, uint32_t nodeMask);
     bool start();
     void stopWorkers();
     void setCurrentThreadAffinity();
@@ -103,7 +106,7 @@
 
     static int  getCpuCount();
     static int  getNumaNodeCount();
-    static void setThreadNodeAffinity(int node);
+    static void setThreadNodeAffinity(void *numaNodeMask);
 };
 
 /* Any worker thread may enlist the help of idle worker threads from the same

