[x265] [PATCH 6/6] perf(tme): optimize task scheduling

Shashank Pathipati shashank.pathipati at multicorewareinc.com
Fri Apr 10 09:23:33 UTC 2026


>From fe29efb47fa7327277cdd1e4ab4ed47966fbd0dd Mon Sep 17 00:00:00 2001
From: Shashank Pathipati <shashank.pathipati at multicorewareinc.com>
Date: Fri, 10 Apr 2026 14:36:06 +0530
Subject: [PATCH 6/6] perf(tme): optimize task scheduling

---
 source/encoder/threadedme.cpp | 60 ++++++++++++++++++++++++++++++++---
 source/encoder/threadedme.h   | 28 ++++++----------
 2 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp
index 1028beb92..856063c03 100644
--- a/source/encoder/threadedme.cpp
+++ b/source/encoder/threadedme.cpp
@@ -127,6 +127,47 @@ void ThreadedME::enqueueReadyRows(int row, int layer, FrameEncoder* frameEnc)
     }
 }

+/* Compare two CTU tasks. Returns true if 'a' has lower priority than 'b'.
+ *
+ * Priority ordering (most urgent first):
+ *   Primary key:   urgency — distance from WPP frontier (smaller / negative = more urgent)
+ *   Secondary key: diagonal (row + col) — prefer earlier WPP wavefront position
+ *   Tertiary key:  enqueue sequence — preserve FIFO among equal candidates
+ *
+ * The sorted queue places lowest-priority tasks at the front and highest-priority
+ * tasks at the back, so findJob() can pop_back() in O(1). */
+static bool isLowerPriority(const CTUTask& a, const CTUTask& b)
+{
+    int ua = ctuTaskUrgency(a);
+    int ub = ctuTaskUrgency(b);
+    if (ua != ub)
+        return ua > ub;
+
+    int da = a.row + a.col;
+    int db = b.row + b.col;
+    if (da != db)
+        return da > db;
+
+    return a.seq > b.seq;
+}
+
+/* Binary search for the position at which 'task' should be inserted into the
+ * already-sorted queue so that the ascending-priority order is preserved. */
+static int findInsertIndex(const std::vector<CTUTask>& queue, const CTUTask& task)
+{
+    int lo = 0;
+    int hi = (int)queue.size();
+    while (lo < hi)
+    {
+        int mid = lo + ((hi - lo) >> 1);
+        if (isLowerPriority(task, queue[mid]))
+            hi = mid;
+        else
+            lo = mid + 1;
+    }
+    return lo;
+}
+
 void ThreadedME::threadMain()
 {
     while (m_active)
@@ -143,8 +184,11 @@ void ThreadedME::threadMain()
                 CTUTask task = frameEnc->m_tmeTasks.front();
                 frameEnc->m_tmeTasks.pop();

+                /* Note: ctuTaskUrgency() reads live WPP progress, so the order is a
+                 * snapshot taken at insertion time and may drift as WPP advances. */
                 m_taskQueueLock.acquire();
-                m_taskQueue.push(task);
+                int pos = findInsertIndex(m_taskQueue, task);
+                m_taskQueue.insert(m_taskQueue.begin() + pos, task);
                 m_taskQueueLock.release();

                 newCTUsPushed++;
@@ -168,7 +212,7 @@ void ThreadedME::findJob(int workerThreadId)
         m_taskQueueLock.release();
         return;
     }
-
+
     m_helpWanted = true;
     int64_t stime = x265_mdate();

@@ -177,8 +221,10 @@ void ThreadedME::findJob(int workerThreadId)
     m_tld[workerThreadId].analysis.m_stats[m_jpId].countTmeTasks++;
 #endif

-    CTUTask task = m_taskQueue.top();
-    m_taskQueue.pop();
+    /* The queue is kept sorted by threadMain() in ascending priority order, so
+     * the highest-priority task sits at the back and can be popped in O(1). */
+    CTUTask task = m_taskQueue.back();
+    m_taskQueue.pop_back();
     m_taskQueueLock.release();

     int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
@@ -257,4 +303,10 @@ void initCTU(CUData& ctu, int row, int col, CTUTask& task)
     ctu.initCTU(frame, ctuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
 }

+int ctuTaskUrgency(const CTUTask& task)
+{
+    int wppProgress = (int)task.frameEnc->m_rows[task.row].completed;
+    return task.col - wppProgress;
+}
+
 }
\ No newline at end of file
diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h
index 5e5fc4878..71a9279c2 100644
--- a/source/encoder/threadedme.h
+++ b/source/encoder/threadedme.h
@@ -34,7 +34,6 @@
 #include "analysis.h"
 #include "mv.h"

-#include <queue>
 #include <vector>
 #include <fstream>

@@ -134,22 +133,6 @@ struct CTUTask
 };


-struct CompareCTUTask {
-    bool operator()(const CTUTask& a, const CTUTask& b) const {
-        if (a.frame->m_poc == b.frame->m_poc)
-        {
-            int a_pos = a.row + a.col;
-            int b_pos = b.row + b.col;
-            if (a_pos != b_pos) return a_pos > b_pos;
-        }
-
-        /* Compare by sequence number to preserve FIFO enqueue order.
-         * priority_queue in C++ is a max-heap, so return true when a.seq > b.seq
-         * to make smaller seq (earlier enqueue) the top() element. */
-        return a.seq > b.seq;
-    }
-};
-
 /**
  * @brief Threaded motion-estimation module that schedules CTU blocks across worker threads.
  *
@@ -163,7 +146,7 @@ public:
     x265_param*             m_param;
     Encoder&                m_enc;

-    std::priority_queue<CTUTask, std::vector<CTUTask>, CompareCTUTask>  m_taskQueue;
+    std::vector<CTUTask>    m_taskQueue;
     Lock                    m_taskQueueLock;
     Event                   m_taskEvent;

@@ -244,6 +227,15 @@ public:
  */
 void initCTU(CUData& ctu, int row, int col, CTUTask& task);

+/**
+ * @brief Compute scheduling urgency for a CTU task based on WPP progress.
+ *
+ * Returns how many CTUs WPP must process in this row before it reaches the
+ * task's first CTU.  Negative values mean WPP has already passed the task's
+ * column and is likely stalling on its result right now.
+ */
+int ctuTaskUrgency(const CTUTask& task);
+
 };

 #endif
--
2.52.0.windows.1



-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260410/155c87a5/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0006-perf-tme-optimize-task-scheduling.patch
Type: application/octet-stream
Size: 5693 bytes
Desc: 0006-perf-tme-optimize-task-scheduling.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260410/155c87a5/attachment-0001.obj>


More information about the x265-devel mailing list