[x265] [PATCH] perf(tme): tune tme parameters for presets 0-5
Shashank Pathipati
shashank.pathipati at multicorewareinc.com
Wed Mar 4 15:37:22 UTC 2026
This patch tunes the performance of --threaded-me for the presets from ultrafast through medium; the remaining presets keep the default TME parameters.
>From b23a003f32ba8d6dca7c0afadc48dabf6c1d872e Mon Sep 17 00:00:00 2001
From: Shashank Pathipati <shashank.pathipati at multicorewareinc.com>
Date: Wed, 4 Mar 2026 19:06:12 +0530
Subject: [PATCH] perf(tme): tune tme parameters for presets 0-5
---
source/common/framedata.cpp | 18 ++------
source/common/param.cpp | 9 ++--
source/common/slice.h | 4 +-
source/common/threadpool.cpp | 84 ++++++++++++++++++++++++++++-------
source/common/threadpool.h | 2 +-
source/encoder/analysis.cpp | 2 +-
source/encoder/search.cpp | 11 ++---
source/encoder/threadedme.cpp | 15 -------
source/encoder/threadedme.h | 10 -----
source/x265.h | 3 ++
10 files changed, 85 insertions(+), 73 deletions(-)
diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp
index 9c7e007b8..aa317dfe8 100644
--- a/source/common/framedata.cpp
+++ b/source/common/framedata.cpp
@@ -39,12 +39,9 @@ bool FrameData::create(const x265_param& param, const SPS& sps, int csp)
m_slice = new Slice;
if (m_param->bThreadedME)
{
- uint32_t bufferSize = sps.numCuInWidth * sps.numCuInHeight;
- m_slice->m_ctuMV = (CTUMVInfo*)x265_malloc(sizeof(CTUMVInfo) * bufferSize);
- for (uint32_t i = 0; i < bufferSize; i++)
- {
- m_slice->m_ctuMV[i].m_meData = (MEData*)x265_malloc(sizeof(MEData) * MAX_NUM_PUS_PER_CTU);
- }
+ uint32_t numCUs = sps.numCuInWidth * sps.numCuInHeight;
+ uint32_t totalPUs = numCUs * MAX_NUM_PUS_PER_CTU;
+ m_slice->m_ctuMV = X265_MALLOC(MEData, totalPUs);
}
m_picCTU = new CUData[sps.numCUsInFrame];
@@ -105,14 +102,7 @@ void FrameData::destroy()
{
delete [] m_picCTU;
- if (m_slice->m_ctuMV)
- {
- uint32_t bufferSize = m_slice->m_sps->numCuInWidth * m_slice->m_sps->numCuInHeight;
- for (uint32_t i = 0; i < bufferSize; i++)
- x265_free(m_slice->m_ctuMV[i].m_meData);
-
- x265_free(m_slice->m_ctuMV);
- }
+ X265_FREE(m_slice->m_ctuMV);
delete m_slice;
delete m_saoParam;
diff --git a/source/common/param.cpp b/source/common/param.cpp
index ce6ce6765..862da5d14 100755
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -491,8 +491,6 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
param->rc.hevcAq = 0;
param->rc.qgSize = 32;
param->bEnableFastIntra = 1;
- param->tmeTaskBlockSize = 0; // Auto-detect
- param->tmeNumBufferRows = 20;
}
else if (!strcmp(preset, "superfast"))
{
@@ -514,8 +512,6 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
param->rc.qgSize = 32;
param->bEnableSAO = 0;
param->bEnableFastIntra = 1;
- param->tmeTaskBlockSize = 0; // Auto-detect
- param->tmeNumBufferRows = 20;
}
else if (!strcmp(preset, "veryfast"))
{
@@ -530,8 +526,6 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
param->maxNumReferences = 2;
param->rc.qgSize = 32;
param->bEnableFastIntra = 1;
- param->tmeTaskBlockSize = 0; // Auto-detect
- param->tmeNumBufferRows = 20;
}
else if (!strcmp(preset, "faster"))
{
@@ -2104,6 +2098,9 @@ void x265_print_params(x265_param* param)
x265_log(param, X265_LOG_INFO, "Coding QT: max CU size, min CU size : %d / %d\n", param->maxCUSize, param->minCUSize);
+ if (param->bThreadedME)
+ x265_log(param, X265_LOG_INFO, "ThreadedME: task block / buf rows : %d / %d\n", param->tmeTaskBlockSize, param->tmeNumBufferRows);
+
x265_log(param, X265_LOG_INFO, "Residual QT: max TU size, max depth : %d / %d inter / %d intra\n",
param->maxTUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth);
diff --git a/source/common/slice.h b/source/common/slice.h
index 8ede39898..076c439ea 100644
--- a/source/common/slice.h
+++ b/source/common/slice.h
@@ -36,7 +36,7 @@ class PicList;
class PicYuv;
class MotionReference;
-struct CTUMVInfo;
+struct MEData;
enum SliceType
{
@@ -381,7 +381,7 @@ public:
WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
MotionReference (*m_mref)[MAX_NUM_REF + 1];
RPS m_rps;
- CTUMVInfo* m_ctuMV;
+ MEData* m_ctuMV;
NalUnitType m_nalUnitType;
SliceType m_sliceType;
diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp
index a5a02daca..ae97d2637 100644
--- a/source/common/threadpool.cpp
+++ b/source/common/threadpool.cpp
@@ -388,17 +388,10 @@ ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isTh
if (p->bThreadedME)
{
- /**
- * TODO: The following thread split decision has only been tuned
- * for ultrafast and medium presets. Tuning for other presets
- * needs to be completed.
- */
- int targetTME = getTmeThreadCount(p, totalNumThreads);
- threadsFrameEnc = totalNumThreads - targetTME;
-
- if (targetTME < 1)
- targetTME = 1;
+ int targetTME = configureTmeThreadCount(p, totalNumThreads);
+ targetTME = (targetTME < 1) ? 1 : targetTME;
+ threadsFrameEnc = totalNumThreads - targetTME;
int defaultNumFT = getFrameThreadsCount(p, totalNumThreads);
if (threadsFrameEnc < defaultNumFT)
{
@@ -826,18 +819,75 @@ int ThreadPool::getFrameThreadsCount(x265_param* p, int cpuCount)
return 1;
}
-int ThreadPool::getTmeThreadCount(x265_param* param, int cpuCount)
+int ThreadPool::configureTmeThreadCount(x265_param* param, int cpuCount)
{
- bool isHighRes = (param->sourceWidth > 2000);
+ enum TmeResClass
+ {
+ TME_RES_LOW = 0,
+ TME_RES_MID,
+ TME_RES_HIGH,
+ TME_RES_COUNT
+ };
+
+ enum TmeRule
+ {
+ TME_RULE_FAST_MEDIUM_SLOW = 0,
+ TME_RULE_FASTER,
+ TME_RULE_VERYFAST,
+ TME_RULE_SUPERFAST,
+ TME_RULE_ULTRAFAST,
+ TME_RULE_COUNT
+ };
+
+ struct TmeRuleConfig
+ {
+ int taskBlockSize[TME_RES_COUNT];
+ int numBufferRows[TME_RES_COUNT];
+ int threadPercent[TME_RES_COUNT];
+ bool widthBasedTaskBlockSize;
+ };
+
+ static const TmeRuleConfig s_tmeRuleConfig[TME_RULE_COUNT] =
+ {
+ { { 1, 1, 1 }, { 10, 10, 10 }, { 90, 80, 70 }, false }, // fast / medium and slower presets
+ { { 1, 1, 1 }, { 10, 15, 10 }, { 90, 80, 70 }, false }, // faster preset and similar options
+ { { 1, 1, 1 }, { 10, 15, 20 }, { 90, 80, 70 }, false }, // veryfast preset and similar options
+ { { 2, 4, 4 }, { 10, 15, 20 }, { 90, 80, 60 }, false }, // superfast preset and similar options
+ { { 0, 0, 0 }, { 15, 20, 20 }, { 90, 80, 50 }, true } // ultrafast preset and similar options
+ };
+
+ const int resClass = (param->sourceHeight >= 1440) ? TME_RES_HIGH :
+ (param->sourceHeight <= 720) ? TME_RES_LOW : TME_RES_MID;
+
+ const bool ruleMatches[TME_RULE_COUNT] =
+ {
+ param->maxNumReferences >= 3 && param->subpelRefine >= 2,
+ param->maxNumReferences >= 2 && param->subpelRefine >= 2,
+ param->subpelRefine >= 1 && param->bframes > 3,
+ param->subpelRefine && param->maxCUSize < 64,
+ !param->subpelRefine || param->searchMethod == X265_DIA_SEARCH || param->minCUSize >= 16
+ };
+
+ int selectedRule = -1;
+ for (int i = 0; i < TME_RULE_COUNT; i++)
+ {
+ if (ruleMatches[i])
+ {
+ selectedRule = i;
+ break;
+ }
+ }
- // ultrafast preset or similar options
- if (!param->subpelRefine || param->minCUSize >= 16)
+ if (selectedRule >= 0)
{
- if (isHighRes) return cpuCount / 2;
+ const TmeRuleConfig& cfg = s_tmeRuleConfig[selectedRule];
+ param->tmeTaskBlockSize = cfg.widthBasedTaskBlockSize ? ((param->sourceWidth + 480 - 1) / 480) : cfg.taskBlockSize[resClass];
+ param->tmeNumBufferRows = cfg.numBufferRows[resClass];
+ return (cpuCount * cfg.threadPercent[resClass]) / 100;
}
- if (isHighRes) return (cpuCount * 7) / 10;
- else return (cpuCount * 4) / 5;
+ static const int s_defaultThreadPercent[TME_RES_COUNT] = { 80, 80, 70 };
+ return (cpuCount * s_defaultThreadPercent[resClass]) / 100;
}
} // end namespace X265_NS
diff --git a/source/common/threadpool.h b/source/common/threadpool.h
index 051f4cba4..f223fd010 100644
--- a/source/common/threadpool.h
+++ b/source/common/threadpool.h
@@ -107,7 +107,7 @@ public:
static int getCpuCount();
static int getNumaNodeCount();
static int getFrameThreadsCount(x265_param* p, int cpuCount);
- static int getTmeThreadCount(x265_param* p, int cpuCount);
+ static int configureTmeThreadCount(x265_param* p, int cpuCount);
};
/* Any worker thread may enlist the help of idle worker threads from the same
diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp
index 6960583f4..06f538588 100644
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -172,7 +172,7 @@ void Analysis::computeMVForPUs(CUData& ctu, const CUGeom& cuGeom, int qp, Frame&
uint32_t cuX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
uint32_t cuY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
- if (!(cuSize == m_param->maxCUSize))
+ if (cuSize != m_param->maxCUSize)
{
uint32_t subCUSize = m_param->maxCUSize / 2;
areaId = (cuX >= subCUSize) + 2 * (cuY >= subCUSize) + 1;
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index bf47e7a03..304911f96 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -285,7 +285,7 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
int neighIdx = neighborIdx[dir];
if (neighIdx >= 0)
{
- MEData& neighborData = slice->m_ctuMV[slotIdx].m_meData[neighIdx];
+ MEData& neighborData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + neighIdx];
for (int i = 0; i < 2; i++)
{
neighbours[dir].mv[i] = neighborData.mv[i];
@@ -310,8 +310,7 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
}
else if (slice->m_refFrameList[list][ref]->m_encData->m_slice->m_sliceType != I_SLICE)
{
- CTUMVInfo& ctuMV = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx];
- MEData meData = ctuMV.m_meData[pos];
+ MEData meData = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
bool bi = (meData.ref[0] >= 0 && meData.ref[1] >= 0);
bool uniL0 = (meData.ref[0] >= 0 && meData.ref[1] == REF_NOT_VALID);
@@ -504,8 +503,7 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
}
}
}
- CTUMVInfo & ctuInfo = slice->m_ctuMV[slotIdx];
- MEData& outME = ctuInfo.m_meData[pos];
+ MEData& outME = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
outME.ref[0] = REF_NOT_VALID;
outME.ref[1] = REF_NOT_VALID;
@@ -3024,8 +3022,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma
int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;
- CTUMVInfo& ctuInfo = slice->m_ctuMV[slotIdx];
- MEData meData = ctuInfo.m_meData[index];
+ MEData meData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index];
bestME[0].ref = meData.ref[0];
bestME[1].ref = meData.ref[1];
diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp
index 11fd16e9c..1028beb92 100644
--- a/source/encoder/threadedme.cpp
+++ b/source/encoder/threadedme.cpp
@@ -45,27 +45,12 @@ bool ThreadedME::create()
initPuStartIdx();
- configure();
-
/* start sequence at zero */
m_enqueueSeq = 0ULL;
return true;
}
-void ThreadedME::configure()
-{
- if (!m_param->tmeTaskBlockSize)
- {
- m_param->tmeTaskBlockSize = m_param->sourceWidth / 480;
- }
-
- if (!m_param->tmeNumBufferRows)
- {
- m_param->tmeNumBufferRows = 10;
- }
-}
-
void ThreadedME::initPuStartIdx()
{
int startIdx = 0;
diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h
index ee66116e3..5e5fc4878 100644
--- a/source/encoder/threadedme.h
+++ b/source/encoder/threadedme.h
@@ -118,11 +118,6 @@ struct MEData
uint32_t cost;
};
-struct CTUMVInfo
-{
- MEData* m_meData;
-};
-
struct CTUTask
{
uint64_t seq;
@@ -192,11 +187,6 @@ public:
*/
bool create();
- /**
- * @brief Configure ThreadedME parameters to match workload
- */
- void configure();
-
/**
* @brief Initialize lookup table used to index PU offsets for all valid CTU sizes.
*/
diff --git a/source/x265.h b/source/x265.h
index 6847febb5..3c5d3e3b6 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -2341,7 +2341,10 @@ typedef struct x265_param
int searchRangeForLayer2;
/* Threaded ME */
+ /* Number of CTUs processed at once when a worker thread picks up a task from ThreadedME. */
int tmeTaskBlockSize;
+
+ /* Number of rows up to which ThreadedME processes tasks ahead of WPP. */
int tmeNumBufferRows;
/*SBRC*/
--
2.52.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260304/55c189fd/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-perf-tme-tune-tme-parameters-for-presets-0-5.patch
Type: application/octet-stream
Size: 13916 bytes
Desc: 0001-perf-tme-tune-tme-parameters-for-presets-0-5.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260304/55c189fd/attachment-0001.obj>
More information about the x265-devel
mailing list