[x265] [PATCH 6/6] perf(tme): optimize task scheduling
Shashank Pathipati
shashank.pathipati at multicorewareinc.com
Fri Apr 10 09:23:33 UTC 2026
>From fe29efb47fa7327277cdd1e4ab4ed47966fbd0dd Mon Sep 17 00:00:00 2001
From: Shashank Pathipati <shashank.pathipati at multicorewareinc.com>
Date: Fri, 10 Apr 2026 14:36:06 +0530
Subject: [PATCH 6/6] perf(tme): optimize task scheduling
---
source/encoder/threadedme.cpp | 60 ++++++++++++++++++++++++++++++++---
source/encoder/threadedme.h | 28 ++++++----------
2 files changed, 66 insertions(+), 22 deletions(-)
diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp
index 1028beb92..856063c03 100644
--- a/source/encoder/threadedme.cpp
+++ b/source/encoder/threadedme.cpp
@@ -127,6 +127,47 @@ void ThreadedME::enqueueReadyRows(int row, int layer, FrameEncoder* frameEnc)
}
}
+/* Compare two CTU tasks. Returns true if 'a' has lower priority than 'b'.
+ *
+ * Priority ordering (most urgent first):
+ * Primary key: urgency — distance from WPP frontier (smaller / negative = more urgent)
+ * Secondary key: diagonal (row + col) — prefer earlier WPP wavefront position
+ * Tertiary key: enqueue sequence — preserve FIFO among equal candidates
+ *
+ * The sorted queue places lowest-priority tasks at the front and highest-priority
+ * tasks at the back, so findJob() can pop_back() in O(1). */
+static bool isLowerPriority(const CTUTask& a, const CTUTask& b)
+{
+ int ua = ctuTaskUrgency(a);
+ int ub = ctuTaskUrgency(b);
+ if (ua != ub)
+ return ua > ub;
+
+ int da = a.row + a.col;
+ int db = b.row + b.col;
+ if (da != db)
+ return da > db;
+
+ return a.seq > b.seq;
+}
+
+/* Binary search for the position at which 'task' should be inserted into the
+ * already-sorted queue so that the ascending-priority order is preserved. */
+static int findInsertIndex(const std::vector<CTUTask>& queue, const CTUTask& task)
+{
+ int lo = 0;
+ int hi = (int)queue.size();
+ while (lo < hi)
+ {
+ int mid = lo + ((hi - lo) >> 1);
+ if (isLowerPriority(task, queue[mid]))
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+ return lo;
+}
+
void ThreadedME::threadMain()
{
while (m_active)
@@ -143,8 +184,11 @@ void ThreadedME::threadMain()
CTUTask task = frameEnc->m_tmeTasks.front();
frameEnc->m_tmeTasks.pop();
+ /* Note: ctuTaskUrgency() reads live WPP progress, so the order is a
+ * snapshot taken at insertion time and may drift as WPP advances. */
m_taskQueueLock.acquire();
- m_taskQueue.push(task);
+ int pos = findInsertIndex(m_taskQueue, task);
+ m_taskQueue.insert(m_taskQueue.begin() + pos, task);
m_taskQueueLock.release();
newCTUsPushed++;
@@ -168,7 +212,7 @@ void ThreadedME::findJob(int workerThreadId)
m_taskQueueLock.release();
return;
}
-
+
m_helpWanted = true;
int64_t stime = x265_mdate();
@@ -177,8 +221,10 @@ void ThreadedME::findJob(int workerThreadId)
m_tld[workerThreadId].analysis.m_stats[m_jpId].countTmeTasks++;
#endif
- CTUTask task = m_taskQueue.top();
- m_taskQueue.pop();
+ /* The queue is kept sorted by threadMain() in ascending priority order, so
+ * the highest-priority task sits at the back and can be popped in O(1). */
+ CTUTask task = m_taskQueue.back();
+ m_taskQueue.pop_back();
m_taskQueueLock.release();
int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
@@ -257,4 +303,10 @@ void initCTU(CUData& ctu, int row, int col, CTUTask& task)
ctu.initCTU(frame, ctuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
}
+int ctuTaskUrgency(const CTUTask& task)
+{
+ int wppProgress = (int)task.frameEnc->m_rows[task.row].completed;
+ return task.col - wppProgress;
+}
+
}
\ No newline at end of file
diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h
index 5e5fc4878..71a9279c2 100644
--- a/source/encoder/threadedme.h
+++ b/source/encoder/threadedme.h
@@ -34,7 +34,6 @@
#include "analysis.h"
#include "mv.h"
-#include <queue>
#include <vector>
#include <fstream>
@@ -134,22 +133,6 @@ struct CTUTask
};
-struct CompareCTUTask {
- bool operator()(const CTUTask& a, const CTUTask& b) const {
- if (a.frame->m_poc == b.frame->m_poc)
- {
- int a_pos = a.row + a.col;
- int b_pos = b.row + b.col;
- if (a_pos != b_pos) return a_pos > b_pos;
- }
-
- /* Compare by sequence number to preserve FIFO enqueue order.
- * priority_queue in C++ is a max-heap, so return true when a.seq > b.seq
- * to make smaller seq (earlier enqueue) the top() element. */
- return a.seq > b.seq;
- }
-};
-
/**
* @brief Threaded motion-estimation module that schedules CTU blocks across worker threads.
*
@@ -163,7 +146,7 @@ public:
x265_param* m_param;
Encoder& m_enc;
- std::priority_queue<CTUTask, std::vector<CTUTask>, CompareCTUTask> m_taskQueue;
+ std::vector<CTUTask> m_taskQueue;
Lock m_taskQueueLock;
Event m_taskEvent;
@@ -244,6 +227,15 @@ public:
*/
void initCTU(CUData& ctu, int row, int col, CTUTask& task);
+/**
+ * @brief Compute scheduling urgency for a CTU task based on WPP progress.
+ *
+ * Returns how many CTUs WPP must process in this row before it reaches the
+ * task's first CTU. Negative values mean WPP has already passed the task's
+ * column and is likely stalling on its result right now.
+ */
+int ctuTaskUrgency(const CTUTask& task);
+
};
#endif
--
2.52.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260410/155c87a5/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0006-perf-tme-optimize-task-scheduling.patch
Type: application/octet-stream
Size: 5693 bytes
Desc: 0006-perf-tme-optimize-task-scheduling.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260410/155c87a5/attachment-0001.obj>
More information about the x265-devel
mailing list