[x265] [PATCH] perf(tme): tune tme parameters for presets 0-5

Shashank Pathipati shashank.pathipati at multicorewareinc.com
Wed Mar 4 15:37:22 UTC 2026


This patch tunes the --threaded-me (TME) parameters for the presets from ultrafast through medium (presets 0-5); the remaining presets keep the default TME parameters.

>From b23a003f32ba8d6dca7c0afadc48dabf6c1d872e Mon Sep 17 00:00:00 2001
From: Shashank Pathipati <shashank.pathipati at multicorewareinc.com>
Date: Wed, 4 Mar 2026 19:06:12 +0530
Subject: [PATCH] perf(tme): tune tme parameters for presets 0-5

---
 source/common/framedata.cpp   | 18 ++------
 source/common/param.cpp       |  9 ++--
 source/common/slice.h         |  4 +-
 source/common/threadpool.cpp  | 84 ++++++++++++++++++++++++++++-------
 source/common/threadpool.h    |  2 +-
 source/encoder/analysis.cpp   |  2 +-
 source/encoder/search.cpp     | 11 ++---
 source/encoder/threadedme.cpp | 15 -------
 source/encoder/threadedme.h   | 10 -----
 source/x265.h                 |  3 ++
 10 files changed, 85 insertions(+), 73 deletions(-)

diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp
index 9c7e007b8..aa317dfe8 100644
--- a/source/common/framedata.cpp
+++ b/source/common/framedata.cpp
@@ -39,12 +39,9 @@ bool FrameData::create(const x265_param& param, const SPS& sps, int csp)
     m_slice  = new Slice;
     if (m_param->bThreadedME)
     {
-        uint32_t bufferSize = sps.numCuInWidth * sps.numCuInHeight;
-        m_slice->m_ctuMV = (CTUMVInfo*)x265_malloc(sizeof(CTUMVInfo) * bufferSize);
-        for (uint32_t i = 0; i < bufferSize; i++)
-        {
-            m_slice->m_ctuMV[i].m_meData = (MEData*)x265_malloc(sizeof(MEData) * MAX_NUM_PUS_PER_CTU);
-        }
+        uint32_t numCUs = sps.numCuInWidth * sps.numCuInHeight;
+        uint32_t totalPUs = numCUs * MAX_NUM_PUS_PER_CTU;
+        m_slice->m_ctuMV = X265_MALLOC(MEData, totalPUs);
     }

     m_picCTU = new CUData[sps.numCUsInFrame];
@@ -105,14 +102,7 @@ void FrameData::destroy()
 {
     delete [] m_picCTU;

-    if (m_slice->m_ctuMV)
-    {
-        uint32_t bufferSize = m_slice->m_sps->numCuInWidth * m_slice->m_sps->numCuInHeight;
-        for (uint32_t i = 0; i < bufferSize; i++)
-            x265_free(m_slice->m_ctuMV[i].m_meData);
-
-        x265_free(m_slice->m_ctuMV);
-    }
+    X265_FREE(m_slice->m_ctuMV);
     delete m_slice;
     delete m_saoParam;

diff --git a/source/common/param.cpp b/source/common/param.cpp
index ce6ce6765..862da5d14 100755
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -491,8 +491,6 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
             param->rc.hevcAq = 0;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
-            param->tmeTaskBlockSize = 0; // Auto-detect
-            param->tmeNumBufferRows = 20;
         }
         else if (!strcmp(preset, "superfast"))
         {
@@ -514,8 +512,6 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
             param->rc.qgSize = 32;
             param->bEnableSAO = 0;
             param->bEnableFastIntra = 1;
-            param->tmeTaskBlockSize = 0; // Auto-detect
-            param->tmeNumBufferRows = 20;
         }
         else if (!strcmp(preset, "veryfast"))
         {
@@ -530,8 +526,6 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char*
             param->maxNumReferences = 2;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
-            param->tmeTaskBlockSize = 0; // Auto-detect
-            param->tmeNumBufferRows = 20;
         }
         else if (!strcmp(preset, "faster"))
         {
@@ -2104,6 +2098,9 @@ void x265_print_params(x265_param* param)

     x265_log(param, X265_LOG_INFO, "Coding QT: max CU size, min CU size : %d / %d\n", param->maxCUSize, param->minCUSize);

+    if (param->bThreadedME)
+        x265_log(param, X265_LOG_INFO, "ThreadedME: task block / buf rows   : %d / %d\n", param->tmeTaskBlockSize, param->tmeNumBufferRows);
+
     x265_log(param, X265_LOG_INFO, "Residual QT: max TU size, max depth : %d / %d inter / %d intra\n",
              param->maxTUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth);

diff --git a/source/common/slice.h b/source/common/slice.h
index 8ede39898..076c439ea 100644
--- a/source/common/slice.h
+++ b/source/common/slice.h
@@ -36,7 +36,7 @@ class PicList;
 class PicYuv;
 class MotionReference;

-struct CTUMVInfo;
+struct MEData;

 enum SliceType
 {
@@ -381,7 +381,7 @@ public:
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
     RPS         m_rps;
-    CTUMVInfo*  m_ctuMV;
+    MEData*     m_ctuMV;

     NalUnitType m_nalUnitType;
     SliceType   m_sliceType;
diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp
index a5a02daca..ae97d2637 100644
--- a/source/common/threadpool.cpp
+++ b/source/common/threadpool.cpp
@@ -388,17 +388,10 @@ ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isTh

     if (p->bThreadedME)
     {
-        /**
-         * TODO: The following thread split decision has only been tuned
-         * for ultrafast and medium presets. Tuning for other presets
-         * needs to be completed.
-         */
-        int targetTME = getTmeThreadCount(p, totalNumThreads);
-        threadsFrameEnc = totalNumThreads - targetTME;
-
-        if (targetTME < 1)
-            targetTME = 1;
+        int targetTME = configureTmeThreadCount(p, totalNumThreads);
+        targetTME = (targetTME < 1) ? 1 : targetTME;

+        threadsFrameEnc = totalNumThreads - targetTME;
         int defaultNumFT = getFrameThreadsCount(p, totalNumThreads);
         if (threadsFrameEnc < defaultNumFT)
         {
@@ -826,18 +819,75 @@ int ThreadPool::getFrameThreadsCount(x265_param* p, int cpuCount)
         return 1;
 }

-int ThreadPool::getTmeThreadCount(x265_param* param, int cpuCount)
+int ThreadPool::configureTmeThreadCount(x265_param* param, int cpuCount)
 {
-    bool isHighRes = (param->sourceWidth > 2000);
+    enum TmeResClass
+    {
+        TME_RES_LOW = 0,
+        TME_RES_MID,
+        TME_RES_HIGH,
+        TME_RES_COUNT
+    };
+
+    enum TmeRule
+    {
+        TME_RULE_FAST_MEDIUM_SLOW = 0,
+        TME_RULE_FASTER,
+        TME_RULE_VERYFAST,
+        TME_RULE_SUPERFAST,
+        TME_RULE_ULTRAFAST,
+        TME_RULE_COUNT
+    };
+
+    struct TmeRuleConfig
+    {
+        int taskBlockSize[TME_RES_COUNT];
+        int numBufferRows[TME_RES_COUNT];
+        int threadPercent[TME_RES_COUNT];
+        bool widthBasedTaskBlockSize;
+    };
+
+    static const TmeRuleConfig s_tmeRuleConfig[TME_RULE_COUNT] =
+    {
+        { { 1, 1, 1 }, { 10, 10, 10 }, { 90, 80, 70 }, false }, // fast / medium and slower presets
+        { { 1, 1, 1 }, { 10, 15, 10 }, { 90, 80, 70 }, false }, // faster preset and similar options
+        { { 1, 1, 1 }, { 10, 15, 20 }, { 90, 80, 70 }, false }, // veryfast preset and similar options
+        { { 2, 4, 4 }, { 10, 15, 20 }, { 90, 80, 60 }, false }, // superfast preset and similar options
+        { { 0, 0, 0 }, { 15, 20, 20 }, { 90, 80, 50 }, true  }  // ultrafast preset and similar options
+    };
+
+    const int resClass = (param->sourceHeight >= 1440) ? TME_RES_HIGH :
+                         (param->sourceHeight <= 720) ? TME_RES_LOW : TME_RES_MID;
+
+    const bool ruleMatches[TME_RULE_COUNT] =
+    {
+        param->maxNumReferences >= 3 && param->subpelRefine >= 2,
+        param->maxNumReferences >= 2 && param->subpelRefine >= 2,
+        param->subpelRefine >= 1 && param->bframes > 3,
+        param->subpelRefine && param->maxCUSize < 64,
+        !param->subpelRefine || param->searchMethod == X265_DIA_SEARCH || param->minCUSize >= 16
+    };
+
+    int selectedRule = -1;
+    for (int i = 0; i < TME_RULE_COUNT; i++)
+    {
+        if (ruleMatches[i])
+        {
+            selectedRule = i;
+            break;
+        }
+    }

-    // ultrafast preset or similar options
-    if (!param->subpelRefine || param->minCUSize >= 16)
+    if (selectedRule >= 0)
     {
-        if (isHighRes) return cpuCount / 2;
+        const TmeRuleConfig& cfg = s_tmeRuleConfig[selectedRule];
+        param->tmeTaskBlockSize = cfg.widthBasedTaskBlockSize ? ((param->sourceWidth + 480 - 1) / 480) : cfg.taskBlockSize[resClass];
+        param->tmeNumBufferRows = cfg.numBufferRows[resClass];
+        return (cpuCount * cfg.threadPercent[resClass]) / 100;
     }

-    if (isHighRes) return (cpuCount * 7) / 10;
-    else return (cpuCount * 4) / 5;
+    static const int s_defaultThreadPercent[TME_RES_COUNT] = { 80, 80, 70 };
+    return (cpuCount * s_defaultThreadPercent[resClass]) / 100;
 }

 } // end namespace X265_NS
diff --git a/source/common/threadpool.h b/source/common/threadpool.h
index 051f4cba4..f223fd010 100644
--- a/source/common/threadpool.h
+++ b/source/common/threadpool.h
@@ -107,7 +107,7 @@ public:
     static int  getCpuCount();
     static int  getNumaNodeCount();
     static int  getFrameThreadsCount(x265_param* p, int cpuCount);
-    static int  getTmeThreadCount(x265_param* p, int cpuCount);
+    static int  configureTmeThreadCount(x265_param* p, int cpuCount);
 };

 /* Any worker thread may enlist the help of idle worker threads from the same
diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp
index 6960583f4..06f538588 100644
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -172,7 +172,7 @@ void Analysis::computeMVForPUs(CUData& ctu, const CUGeom& cuGeom, int qp, Frame&
     uint32_t cuX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
     uint32_t cuY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];

-    if (!(cuSize == m_param->maxCUSize))
+    if (cuSize != m_param->maxCUSize)
     {
         uint32_t subCUSize = m_param->maxCUSize / 2;
         areaId = (cuX >= subCUSize) + 2 * (cuY >= subCUSize) + 1;
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index bf47e7a03..304911f96 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -285,7 +285,7 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
                         int neighIdx = neighborIdx[dir];
                         if (neighIdx >= 0)
                         {
-                            MEData& neighborData = slice->m_ctuMV[slotIdx].m_meData[neighIdx];
+                            MEData& neighborData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + neighIdx];
                             for (int i = 0; i < 2; i++)
                             {
                                 neighbours[dir].mv[i] = neighborData.mv[i];
@@ -310,8 +310,7 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
                     }
                     else if (slice->m_refFrameList[list][ref]->m_encData->m_slice->m_sliceType != I_SLICE)
                     {
-                        CTUMVInfo& ctuMV = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx];
-                        MEData meData = ctuMV.m_meData[pos];
+                        MEData meData = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];

                         bool bi = (meData.ref[0] >= 0 && meData.ref[1] >= 0);
                         bool uniL0 = (meData.ref[0] >= 0 && meData.ref[1] == REF_NOT_VALID);
@@ -504,8 +503,7 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
                 }
             }
         }
-        CTUMVInfo & ctuInfo = slice->m_ctuMV[slotIdx];
-        MEData& outME = ctuInfo.m_meData[pos];
+        MEData& outME = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];

         outME.ref[0] = REF_NOT_VALID;
         outME.ref[1] = REF_NOT_VALID;
@@ -3024,8 +3022,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma

             int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;

-            CTUMVInfo& ctuInfo = slice->m_ctuMV[slotIdx];
-            MEData meData = ctuInfo.m_meData[index];
+            MEData meData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index];

             bestME[0].ref = meData.ref[0];
             bestME[1].ref = meData.ref[1];
diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp
index 11fd16e9c..1028beb92 100644
--- a/source/encoder/threadedme.cpp
+++ b/source/encoder/threadedme.cpp
@@ -45,27 +45,12 @@ bool ThreadedME::create()

     initPuStartIdx();

-    configure();
-
     /* start sequence at zero */
     m_enqueueSeq = 0ULL;

     return true;
 }

-void ThreadedME::configure()
-{
-    if (!m_param->tmeTaskBlockSize)
-    {
-        m_param->tmeTaskBlockSize = m_param->sourceWidth / 480;
-    }
-
-    if (!m_param->tmeNumBufferRows)
-    {
-        m_param->tmeNumBufferRows = 10;
-    }
-}
-
 void ThreadedME::initPuStartIdx()
 {
     int startIdx = 0;
diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h
index ee66116e3..5e5fc4878 100644
--- a/source/encoder/threadedme.h
+++ b/source/encoder/threadedme.h
@@ -118,11 +118,6 @@ struct MEData
     uint32_t cost;
 };

-struct CTUMVInfo
-{
-    MEData* m_meData;
-};
-
 struct CTUTask
 {
     uint64_t seq;
@@ -192,11 +187,6 @@ public:
      */
     bool create();

-    /**
-     * @brief Configure ThreadedME parameters to match workload
-     */
-    void configure();
-
     /**
      * @brief Initialize lookup table used to index PU offsets for all valid CTU sizes.
      */
diff --git a/source/x265.h b/source/x265.h
index 6847febb5..3c5d3e3b6 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -2341,7 +2341,10 @@ typedef struct x265_param
     int      searchRangeForLayer2;

     /* Threaded ME */
+    /* Number of CTUs processed at once when a worker thread picks up a task from ThreadedME. */
     int      tmeTaskBlockSize;
+
+    /* Number of rows up to which ThreadedME processes tasks ahead of WPP */
     int      tmeNumBufferRows;

     /*SBRC*/
--
2.52.0.windows.1



-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260304/55c189fd/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-perf-tme-tune-tme-parameters-for-presets-0-5.patch
Type: application/octet-stream
Size: 13916 bytes
Desc: 0001-perf-tme-tune-tme-parameters-for-presets-0-5.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260304/55c189fd/attachment-0001.obj>


More information about the x265-devel mailing list