[x265] [PATCH] threaded-me: optimizations and fixes

Tue Apr 7 17:48:12 UTC 2026

From 7779ac6714434649ba89561247a58f539bffc955 Mon Sep 17 00:00:00 2001
From: Shashank Pathipati <shashank.pathipati at multicorewareinc.com>
Date: Thu, 26 Mar 2026 10:41:55 +0530
Subject: [PATCH] tme: - update doc and mark --threaded-me as experimental -
 minor optimizations for --threaded-me - add CPU freq adaptive threadpool
 split - add CLIs for smoke and regression - add support for --no-wpp and
 --me=sea

Co-Authored-By: Syed Majid <syed.majid at multicorewareinc.com>
---
 doc/reST/cli.rst                 | 19 ++++---
 source/common/threadpool.cpp     | 85 +++++++++++++++++++++++++++-
 source/common/threadpool.h       | 14 +++++
 source/encoder/frameencoder.cpp  | 50 +++++++++-------
 source/encoder/motion.cpp        | 12 ++++
 source/encoder/search.cpp        | 97 +++++++++++++++++++++++++++-----
 source/encoder/threadedme.cpp    | 45 ++++++++++++++-
 source/encoder/threadedme.h      | 28 ++++-----
 source/test/regression-tests.txt |  8 +++
 source/test/smoke-tests.txt      |  7 +++
 10 files changed, 301 insertions(+), 64 deletions(-)

diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index f602e1307..43c488b14 100755
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -408,16 +408,21 @@ Performance Options
      can reduce compression efficiency. Recommended on many-core CPUs when
      encode speed is prioritized over compression efficiency.

-     If VBV options are enabled, Threaded ME is automatically disabled and a
-     warning is emitted.
+     This feature is automatically disabled in the following conditions:

-     This feature is implicitly disabled when no thread pool is present.
+     - When no thread pool is present.
+     - When the detected CPU core count is less than 32.
+     - If VBV options are enabled, due to incompatibility with re-encoding triggers.

-     --threaded-me provides speedups on many-core CPUs, accompanied by a
-     compression efficiency loss.
+     Default disabled. **Experimental Feature**
+     
+     .. note::
+           :option:`--threaded-me` currently provides encoding speedups only on
+           many-core machines running at low clock frequencies (at or below
+           approximately 1.5 GHz). On high-frequency systems or machines with
+           fewer cores, the overhead of the additional motion estimation work
+           may outweigh the parallelism gains.

-     Default disabled.
-
 .. option:: --preset, -p <integer|string>

      Sets parameters to preselected values, trading off compression efficiency against
diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp
index 79075425a..b3e29bea4 100644
--- a/source/common/threadpool.cpp
+++ b/source/common/threadpool.cpp
@@ -33,6 +33,15 @@
 #include <winnt.h>
 #endif

+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#elif !defined(_WIN32)
+#include <fstream>
+#include <string>
+#include <cstdio>
+#include <cstdlib>
+#endif
+
 #if X86_64

 #ifdef __GNUC__
@@ -351,7 +360,6 @@ static void distributeThreadsForTme(
         }

         // Apply calculated threadpool assignment
-        // TODO: Make sure this doesn't cause a problem later on
         memset(threadsPerPool, 0, sizeof(int) * (numNumaNodes + 2));
         memset(nodeMaskPerPool, 0, sizeof(uint64_t) * (numNumaNodes + 2));

@@ -905,16 +913,87 @@ int ThreadPool::configureTmeThreadCount(x265_param* param, int cpuCount)
         }
     }

+    bool isHighFreq = (getCPUFrequencyMHz() > 1500.0);
+
     if (selectedRule >= 0)
     {
         const TmeRuleConfig& cfg = s_tmeRuleConfig[selectedRule];
         param->tmeTaskBlockSize = cfg.widthBasedTaskBlockSize ? ((param->sourceWidth + 480 - 1) / 480) : cfg.taskBlockSize[resClass];
         param->tmeNumBufferRows = cfg.numBufferRows[resClass];
-        return (cpuCount * cfg.threadPercent[resClass]) / 100;
+        return (!isHighFreq) ? (cpuCount * cfg.threadPercent[resClass]) / 100 : cpuCount / 2;
     }

     static const int s_defaultThreadPercent[TME_RES_COUNT] = { 80, 80, 70 };
-    return (cpuCount * s_defaultThreadPercent[resClass]) / 100;
+    return (!isHighFreq) ? (cpuCount * s_defaultThreadPercent[resClass]) / 100 : cpuCount / 2;
+}
+
+double getCPUFrequencyMHz() /* CPU frequency in MHz from a platform-specific source; 0.0 when none can be read */
+{
+#if defined(_WIN32) /* Windows: read the "~MHz" registry value of processor 0 */
+    HKEY hKey;
+    DWORD mhz = 0; /* this initial 0 is what gets returned when the key cannot be opened */
+    DWORD size = sizeof(mhz);
+    if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
+                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                      0, KEY_READ, &hKey) == ERROR_SUCCESS)
+    {
+        RegQueryValueExA(hKey, "~MHz", NULL, NULL, (LPBYTE)&mhz, &size); /* return code is not checked */
+        RegCloseKey(hKey);
+    }
+    return (double)mhz;
+
+#elif defined(__APPLE__) /* macOS: single sysctl query */
+    uint64_t freq = 0;
+    size_t size = sizeof(freq);
+    if (sysctlbyname("hw.cpufrequency", &freq, &size, NULL, 0) == 0)
+        return (double)freq / 1.0e6; /* scaled down by 1e6 to obtain MHz */
+    return 0.0; /* sysctl query failed */
+
+#else  /* Linux (and any other platform that is neither _WIN32 nor __APPLE__) */
+    /* scaling_cur_freq reflects the live frequency chosen by the governor
+     * and EPP hint. Probe cpu0, cpu1, ... in order, stop at the first entry that cannot be opened, and return the highest observed value.
+     */
+    {
+        uint64_t maxKhz = 0;
+        char path[64];
+        for (int cpu = 0; ; ++cpu) /* no upper bound: the loop ends via the break below */
+        {
+            snprintf(path, sizeof(path),
+                     "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);
+            std::ifstream f(path);
+            if (!f.is_open())
+                break; /* first cpuN file that fails to open ends the scan */
+            uint64_t khz = 0;
+            f >> khz;
+            if (khz > maxKhz)
+                maxKhz = khz;
+        }
+        if (maxKhz > 0)
+            return (double)maxKhz / 1000.0; /* kHz -> MHz */
+    }
+    /* Fall back to /proc/cpuinfo: take the largest "cpu MHz" value across all entries. */
+    {
+        std::ifstream f("/proc/cpuinfo");
+        std::string line;
+        double maxMhz = 0.0;
+        while (std::getline(f, line))
+        {
+            if (line.find("cpu MHz") != std::string::npos)
+            {
+                size_t colon = line.find(':');
+                if (colon != std::string::npos)
+                {
+                    double mhz = strtod(line.c_str() + colon + 1, NULL); /* parse the number that follows the first ':' */
+                    if (mhz > maxMhz)
+                        maxMhz = mhz;
+                }
+            }
+        }
+        if (maxMhz > 0.0)
+            return maxMhz;
+    }
+    return 0.0; /* neither sysfs nor /proc/cpuinfo yielded a positive value */
+#endif
 }

 } // end namespace X265_NS
diff --git a/source/common/threadpool.h b/source/common/threadpool.h
index f223fd010..04e35528e 100644
--- a/source/common/threadpool.h
+++ b/source/common/threadpool.h
@@ -171,6 +171,20 @@ public:
     virtual void processTasks(int workerThreadId) = 0;
 };

+/**
+ * @brief Return the highest current CPU frequency in MHz across all cores, or 0.0 if unavailable.
+ *
+ * The value reflects the live frequency as reported by the cpufreq subsystem,
+ * which accounts for the active scaling governor and EPP hint.
+ *
+ * Platform support:
+ *   Linux   – iterates /sys/devices/system/cpu/cpuN/cpufreq/scaling_cur_freq (kHz)
+ *              for all cores and returns the maximum; falls back to /proc/cpuinfo
+ *   macOS   – sysctl hw.cpufrequency (Hz)
+ *   Windows – registry ~MHz under CentralProcessor\0
+ */
+double getCPUFrequencyMHz();
+
 } // end namespace X265_NS

 #endif // ifndef X265_THREADPOOL_H
diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
index af73626af..c8bf12508 100644
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -1036,6 +1036,13 @@ void FrameEncoder::compressFrame(int layer)
                     }
                 }

+                if (m_top->m_threadedME && !slice->isIntra())
+                {
+                    ScopedLock lock(m_tmeDepLock);
+                    m_tmeDeps[i].external = true;
+                    m_top->m_threadedME->enqueueReadyRows(i, layer, this);
+                }
+
                 if (!i)
                     m_row0WaitTime[layer] = x265_mdate();
                 else if (i == m_numRows - 1)
@@ -1636,6 +1643,29 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld, int layer
         const uint32_t cuAddr = lineStartCUAddr + col;
         CUData* ctu = curEncData.getPicCTU(cuAddr);
         const uint32_t bLastCuInSlice = (bLastRowInSlice & (col == numCols - 1)) ? 1 : 0;
+
+        /* Must wait for TME to finish before initCTU because both threads
+         * operate on the same CUData — the encoder's initCTU would corrupt
+         * data that deriveMVsForCTU is still reading. */
+        if (m_top->m_threadedME && slice->m_sliceType != I_SLICE)
+        {
+            int64_t waitStart = x265_mdate();
+            bool waited = false;
+
+            while (m_frame[layer]->m_ctuMEFlags[cuAddr].get() == 0)
+            {
+#ifdef DETAILED_CU_STATS
+                tld.analysis.m_stats[m_jpId].countTmeBlockedCTUs++;
+#endif
+                m_frame[layer]->m_ctuMEFlags[cuAddr].waitForChange(0);
+                waited = true;
+            }
+
+            int64_t waitEnd = x265_mdate();
+            if (waited)
+                ATOMIC_ADD(&m_totalThreadedMEWait[layer], waitEnd - waitStart);
+        }
+
         ctu->initCTU(*m_frame[layer], cuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);

         if (!layer && bIsVbv)
@@ -1692,26 +1722,6 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld, int layer
         if (m_param->dynamicRd && (int32_t)(m_rce.qpaRc - m_rce.qpNoVbv) > 0)
             ctu->m_vbvAffected = true;

-        if (m_top->m_threadedME && slice->m_sliceType != I_SLICE)
-        {
-            int64_t waitStart = x265_mdate();
-            bool waited = false;
-
-            // Wait for threadedME to complete ME upto this CTU
-            while (m_frame[layer]->m_ctuMEFlags[cuAddr].get() == 0)
-            {
-#ifdef DETAILED_CU_STATS
-                tld.analysis.m_stats[m_jpId].countTmeBlockedCTUs++;
-#endif
-                m_frame[layer]->m_ctuMEFlags[cuAddr].waitForChange(0);
-                waited = true;
-            }
-
-            int64_t waitEnd = x265_mdate();
-            if (waited)
-                ATOMIC_ADD(&m_totalThreadedMEWait[layer], waitEnd - waitStart);
-        }
-
         // Does all the CU analysis, returns best top level mode decision
         Mode& best = tld.analysis.compressCTU(*ctu, *m_frame[layer], m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);

diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp
index 1a8cf6371..230037e2a 100644
--- a/source/encoder/motion.cpp
+++ b/source/encoder/motion.cpp
@@ -642,6 +642,7 @@ int MotionEstimate::diamondSearch(ReferencePlanes* ref, const MV& mvmin, const M

     for (int16_t dist = 1; dist <= 4; dist <<= 1)
     {
+        const MV bmv0 = bmv;
         const int32_t top = omv.y - dist;
         const int32_t bottom = omv.y + dist;
         const int32_t left = omv.x - dist;
@@ -697,10 +698,13 @@ int MotionEstimate::diamondSearch(ReferencePlanes* ref, const MV& mvmin, const M
                 COST_MV(omv.x, bottom);
             }
         }
+        if (bmv == bmv0)
+        break;
     }

     for (int16_t dist = 8; dist <= 64; dist += 8)
     {
+        const MV bmv0 = bmv;
         const int32_t top = omv.y - dist;
         const int32_t bottom = omv.y + dist;
         const int32_t left = omv.x - dist;
@@ -772,6 +776,8 @@ int MotionEstimate::diamondSearch(ReferencePlanes* ref, const MV& mvmin, const M
                 }
             }
         }
+        if (bmv == bmv0)
+        break;
     }
     outMV = bmv;
     return bcost;
@@ -996,6 +1002,12 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref,
     pmv = pmv.roundToFPel();
     MV omv = bmv;  // current search origin or starting point

+    if (bcost == 0)
+    {
+        outQMv = bmv.toQPel();
+        return mvcost(bmv << 2); // return just the MV cost (no residual)
+    }
+
     int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
     switch (search)
     {
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index 304911f96..ebf914912 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -348,6 +348,12 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
                 PicYuv* recon = slice->m_mref[list][ref].reconPic;
                 int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0);

+                if (m_param->searchMethod == X265_SEA)
+                {
+                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                        m_me.integral[planes] = slice->m_refFrameList[list][ref]->m_encData->m_meIntegral[planes] + offset;
+                }
+
                 m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
                 setSearchRange(cu, mvp, searchRange, mvmin, mvmax);

@@ -359,24 +365,89 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData
                 else
                 {
                     m_vertRestriction = slice->m_refPOCList[list][ref] == slice->m_poc;
-                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
-                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
-
-                    if (bLowresMVP && mvp_lowres.notZero() && mvp_lowres != mvp)
+                    pixel* srcRef = m_param->bSourceReferenceEstimation ?
+                        m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0;
+
+                    MV bestMvp = mvp;
+                    bool usedLowresMvp = false;
+
+                    /* Only do SAD comparison when:
+                     * 1. srcRef is null (not source reference estimation mode)
+                     * 2. lowres MVP is valid and different from spatial MVP
+                     * 3. fencPUYuv is initialised */
+                    if (!srcRef &&
+                        bLowresMVP && mvp_lowres.notZero() && mvp_lowres != mvp &&
+                        m_me.fencPUYuv.m_buf[0] != NULL)
                     {
-                        MV outmv_lowres;
-                        bLowresMVP = false;
-                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
-                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref],  mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange,outmv_lowres, m_param->maxSlices,
-                            m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0): 0);
+                        intptr_t stride  = slice->m_mref[list][ref].lumaStride;
+                        PicYuv*  refPic  = slice->m_mref[list][ref].reconPic;

-                        if (lowresMvCost < satdCost)
+                        /* Only proceed if strides match */
+                        if (refPic->m_stride == stride)
                         {
-                            outmv = outmv_lowres;
-                            satdCost = lowresMvCost;
-                            bLowresMVP = true;
+                            intptr_t bOffset = refPic->getLumaAddr(cu.m_cuAddr,
+                                                   pu.cuAbsPartIdx + pu.puAbsPartIdx)
+                                             - refPic->getLumaAddr(0);
+
+                            pixel* fenc     = m_me.fencPUYuv.m_buf[0];
+                            pixel* frefBase = slice->m_mref[list][ref].fpelPlane[0]
+                                             + bOffset;
+
+                            MV mvp_fp = mvp.clipped(
+                                MV(mvmin.x << 2, mvmin.y << 2),
+                                MV(mvmax.x << 2, mvmax.y << 2)).roundToFPel();
+
+                            MV lowres_fp = mvp_lowres.clipped(
+                                MV(mvmin.x << 2, mvmin.y << 2),
+                                MV(mvmax.x << 2, mvmax.y << 2)).roundToFPel();
+
+                            /* Picture boundary check for 4K safety */
+                            int picW = refPic->m_picWidth;
+                            int picH = refPic->m_picHeight;
+
+                            bool mvpValid = (mvp_fp.x    >= mvmin.x &&
+                                             mvp_fp.x    <= mvmax.x &&
+                                             mvp_fp.y    >= mvmin.y &&
+                                             mvp_fp.y    <= mvmax.y &&
+                                             mvp_fp.x + pu.width  <= picW &&
+                                             mvp_fp.y + pu.height <= picH);
+
+                            bool lowresValid = (lowres_fp.x >= mvmin.x &&
+                                                lowres_fp.x <= mvmax.x &&
+                                                lowres_fp.y >= mvmin.y &&
+                                                lowres_fp.y <= mvmax.y &&
+                                                lowres_fp.x + pu.width  <= picW &&
+                                                lowres_fp.y + pu.height <= picH);
+
+                            if (mvpValid && lowresValid && mvp_fp != lowres_fp)
+                            {
+                                pixelcmp_t sadFunc = primitives.pu[m_me.partEnum].sad;
+
+                                int sadMvp = sadFunc(fenc, FENC_STRIDE,
+                                    frefBase + mvp_fp.x    + mvp_fp.y    * stride,
+                                    stride);
+                                int sadLowres = sadFunc(fenc, FENC_STRIDE,
+                                    frefBase + lowres_fp.x + lowres_fp.y * stride,
+                                    stride);
+
+                                if (sadLowres < sadMvp)
+                                {
+                                    bestMvp       = mvp_lowres;
+                                    mvp           = mvp_lowres; /* fix mvcost basis */
+                                    usedLowresMvp = true;
+                                }
+                            }
                         }
                     }
+
+                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref],
+                        mvmin, mvmax,
+                        bestMvp,
+                        numMvc, mvc,
+                        m_param->searchRange, outmv,
+                        m_param->maxSlices, m_vertRestriction, srcRef);
+
+                    bLowresMVP = usedLowresMvp;
                 }

                 bits += m_me.bitcost(outmv);
diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp
index 1028beb92..3c27835f3 100644
--- a/source/encoder/threadedme.cpp
+++ b/source/encoder/threadedme.cpp
@@ -144,7 +144,7 @@ void ThreadedME::threadMain()
                 frameEnc->m_tmeTasks.pop();

                 m_taskQueueLock.acquire();
-                m_taskQueue.push(task);
+                m_taskQueue.push_back(task);
                 m_taskQueueLock.release();

                 newCTUsPushed++;
@@ -177,8 +177,41 @@ void ThreadedME::findJob(int workerThreadId)
     m_tld[workerThreadId].analysis.m_stats[m_jpId].countTmeTasks++;
 #endif

-    CTUTask task = m_taskQueue.top();
-    m_taskQueue.pop();
+    /* Scan for the most urgent task based on live WPP progress.
+     * Primary key:   urgency — distance from WPP frontier (smaller / negative = more urgent)
+     * Secondary key: diagonal (row + col) — prefer earlier WPP wavefront position
+     * Tertiary key:  enqueue sequence — preserve FIFO among equal candidates */
+    int bestIdx = 0;
+    int bestUrgency = ctuTaskUrgency(m_taskQueue[0]);
+    int bestDiag = m_taskQueue[0].row + m_taskQueue[0].col;
+    uint64_t bestSeq = m_taskQueue[0].seq;
+
+    for (int k = 1; k < (int)m_taskQueue.size(); k++)
+    {
+        const CTUTask& t = m_taskQueue[k];
+        int urgency = ctuTaskUrgency(t);
+        int diag = t.row + t.col;
+
+        bool isBetter = false;
+        if (urgency < bestUrgency)
+            isBetter = true;
+        else if (urgency == bestUrgency && diag < bestDiag)
+            isBetter = true;
+        else if (urgency == bestUrgency && diag == bestDiag && t.seq < bestSeq)
+            isBetter = true;
+
+        if (isBetter)
+        {
+            bestIdx = k;
+            bestUrgency = urgency;
+            bestDiag = diag;
+            bestSeq = t.seq;
+        }
+    }
+
+    CTUTask task = m_taskQueue[bestIdx];
+    m_taskQueue[bestIdx] = m_taskQueue.back();
+    m_taskQueue.pop_back();
     m_taskQueueLock.release();

     int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
@@ -257,4 +290,10 @@ void initCTU(CUData& ctu, int row, int col, CTUTask& task)
     ctu.initCTU(frame, ctuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
 }

+int ctuTaskUrgency(const CTUTask& task)
+{   /* Signed gap between the task's column and the "completed" counter of its row in the owning frame encoder. */
+    const int doneInRow = static_cast<int>(task.frameEnc->m_rows[task.row].completed);
+    return task.col - doneInRow; /* smaller (or negative) results are preferred by the findJob scan */
+}
+
 }
\ No newline at end of file
diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h
index 5e5fc4878..71a9279c2 100644
--- a/source/encoder/threadedme.h
+++ b/source/encoder/threadedme.h
@@ -34,7 +34,6 @@
 #include "analysis.h"
 #include "mv.h"

-#include <queue>
 #include <vector>
 #include <fstream>

@@ -134,22 +133,6 @@ struct CTUTask
 };


-struct CompareCTUTask {
-    bool operator()(const CTUTask& a, const CTUTask& b) const {
-        if (a.frame->m_poc == b.frame->m_poc)
-        {
-            int a_pos = a.row + a.col;
-            int b_pos = b.row + b.col;
-            if (a_pos != b_pos) return a_pos > b_pos;
-        }
-
-        /* Compare by sequence number to preserve FIFO enqueue order.
-         * priority_queue in C++ is a max-heap, so return true when a.seq > b.seq
-         * to make smaller seq (earlier enqueue) the top() element. */
-        return a.seq > b.seq;
-    }
-};
-
 /**
  * @brief Threaded motion-estimation module that schedules CTU blocks across worker threads.
  *
@@ -163,7 +146,7 @@ public:
     x265_param*             m_param;
     Encoder&                m_enc;

-    std::priority_queue<CTUTask, std::vector<CTUTask>, CompareCTUTask>  m_taskQueue;
+    std::vector<CTUTask>    m_taskQueue;
     Lock                    m_taskQueueLock;
     Event                   m_taskEvent;

@@ -244,6 +227,15 @@ public:
  */
 void initCTU(CUData& ctu, int row, int col, CTUTask& task);

+/**
+ * @brief Compute scheduling urgency for a CTU task based on WPP progress.
+ *
+ * Returns how many CTUs WPP must process in this row before it reaches the
+ * task's first CTU.  Negative values mean WPP has already passed the task's
+ * column and is likely stalling on its result right now.
+ */
+int ctuTaskUrgency(const CTUTask& task);
+
 };

 #endif
diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt
index 081013cca..7551dbb7b 100644
--- a/source/test/regression-tests.txt
+++ b/source/test/regression-tests.txt
@@ -201,3 +201,11 @@ ParkScene_1920x1080_24.y4m, --crf 24 --mcstf --preset slow --bframes 5
 BasketballDrive_1920x1080_50.y4m, --crf 26 --preset slow --sbrc --no-open-gop --keyint 60 --min-keyint 60 --vbv-bufsize 6000 --vbv-maxrate 5000 --temporal-layers 4 --b-adapt 0 --no-cutree
 crowd_run_1080p50.y4m, --crf 22 --preset superfast --sbrc --no-open-gop --me sea --vbv-maxrate 9000 --vbv-bufsize 7500 --keyint 100 --min-keyint 100
 # vim: tw=200
+
+#Threaded ME tests
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --threaded-me --constrained-intra --signhide --qp 26
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --threaded-me --no-wpp --no-cutree --rdoq-level 1 --limit-refs 1
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slow --threaded-me --temporal-layers 2 --no-psy-rd --qg-size 32
+old_town_cross_444_720p50.y4m,--preset faster --threaded-me --me sea --weightp --limit-modes
+big_buck_bunny_360p24.y4m,--preset medium --threaded-me --no-wpp --aq-mode 3 --aq-strength 1.5 --weightb
+
diff --git a/source/test/smoke-tests.txt b/source/test/smoke-tests.txt
index 89f8ce452..4e6acfa61 100644
--- a/source/test/smoke-tests.txt
+++ b/source/test/smoke-tests.txt
@@ -25,3 +25,10 @@ CrowdRun_1920x1080_50_10bit_444.yuv,--preset=superfast --bitrate 7000 --sao --li
 # CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
 # DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
 # NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
+
+#Threaded ME tests
+BasketballDrive_1920x1080_50.y4m,--preset=superfast --threaded-me --qp 28
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset=medium --threaded-me --no-wpp --crf 22
+ducks_take_off_444_720p50.y4m,--preset=slower --threaded-me --me sea --limit-refs 2
+RaceHorses_416x240_30_10bit.yuv,--preset=faster --threaded-me --aq-mode 3 --rdoq-level 1
+News-4k.y4m,--preset=veryfast --threaded-me --no-cutree --tune psnr
--
2.52.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260407/14546c06/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-tme.patch
Type: application/octet-stream
Size: 25315 bytes
Desc: 0001-tme.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20260407/14546c06/attachment-0001.obj>