[x265] [PATCH RFC] pool: allow thread private data structures, save memory allocations

Steve Borho steve at borho.org
Wed May 21 21:27:10 CEST 2014


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1398578192 -28800
#      Sun Apr 27 13:56:32 2014 +0800
# Node ID c569e43631fcc4bf44f4c79961ca847080a2aa5a
# Parent  dadb646a7266f377cbfa33747ecb0a97f50a00f9
pool: allow thread private data structures, save memory allocations

Pass the worker's threadId to JobProvider::findJob() and allow job providers to
use this ID as they see fit to keep thread-local data.

This feature is then used to allocate most CU analysis data per-thread instead
of per-row-per-frame-encoder.  It should save a good chunk of memory.

This is a precursor to enabling more fine-grained parallelism.

diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -353,10 +353,10 @@
 
 /** \param    pcEncTop      pointer of encoder class
  */
-void TEncCu::init(Encoder* top)
+void TEncCu::init(Encoder& top)
 {
-    m_param = top->param;
-    m_CUTransquantBypassFlagValue = top->m_CUTransquantBypassFlagValue;
+    m_param = top.param;
+    m_CUTransquantBypassFlagValue = top.m_CUTransquantBypassFlagValue;
 }
 
 // ====================================================================================================================
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncCu.h
--- a/source/Lib/TLibEncoder/TEncCu.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncCu.h	Sun Apr 27 13:56:32 2014 +0800
@@ -135,7 +135,7 @@
 #endif
     TEncCu();
 
-    void init(Encoder* top);
+    void init(Encoder& top);
     bool create(uint8_t totalDepth, uint32_t maxWidth);
     void destroy();
     void compressCU(TComDataCU* cu);
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -95,21 +95,21 @@
     m_qtTempTransformSkipYuv.destroy();
 }
 
-bool TEncSearch::init(Encoder* cfg, RDCost* rdCost, TComTrQuant* trQuant)
+bool TEncSearch::init(Encoder& enc, RDCost* rdCost, TComTrQuant* trQuant)
 {
-    m_cfg     = cfg;
+    m_cfg     = &enc;
     m_trQuant = trQuant;
     m_rdCost  = rdCost;
 
-    initTempBuff(cfg->param->internalCsp);
-    m_me.setSearchMethod(cfg->param->searchMethod);
-    m_me.setSubpelRefine(cfg->param->subpelRefine);
+    initTempBuff(enc.param->internalCsp);
+    m_me.setSearchMethod(enc.param->searchMethod);
+    m_me.setSubpelRefine(enc.param->subpelRefine);
 
     /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
      * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
-    m_refLagPixels = cfg->param->frameNumThreads > 1 ? cfg->param->searchRange : cfg->param->sourceHeight;
-
-    const uint32_t numLayersToAllocate = cfg->m_quadtreeTULog2MaxSize - cfg->m_quadtreeTULog2MinSize + 1;
+    m_refLagPixels = enc.param->frameNumThreads > 1 ? enc.param->searchRange : enc.param->sourceHeight;
+
+    const uint32_t numLayersToAllocate = enc.m_quadtreeTULog2MaxSize - enc.m_quadtreeTULog2MinSize + 1;
     m_qtTempCoeffY   = new coeff_t*[numLayersToAllocate * 3];
     m_qtTempCoeffCb  = m_qtTempCoeffY + numLayersToAllocate;
     m_qtTempCoeffCr  = m_qtTempCoeffY + numLayersToAllocate * 2;
@@ -121,7 +121,7 @@
         m_qtTempCoeffY[i]  = X265_MALLOC(coeff_t, sizeL + sizeC * 2);
         m_qtTempCoeffCb[i] = m_qtTempCoeffY[i] + sizeL;
         m_qtTempCoeffCr[i] = m_qtTempCoeffY[i] + sizeL + sizeC;
-        m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
+        m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, enc.param->internalCsp);
     }
 
     const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
@@ -137,7 +137,7 @@
     m_qtTempTUCoeffCb = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT;
     m_qtTempTUCoeffCr = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT * 2;
 
-    return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
+    return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, enc.param->internalCsp);
 
 fail:
     return false;
@@ -462,7 +462,7 @@
     int lastPos = -1;
     cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
 
-    int      chFmt        = cu->getChromaFormat();
+    int chFmt = cu->getChromaFormat();
     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
     m_trQuant->selectLambda(TEXT_LUMA);
 
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Sun Apr 27 13:56:32 2014 +0800
@@ -130,6 +130,7 @@
     // interface to option
     Encoder*        m_cfg;
 
+public:
     // interface to classes
     TComTrQuant*    m_trQuant;
     RDCost*         m_rdCost;
@@ -151,7 +152,7 @@
     TEncSearch();
     virtual ~TEncSearch();
 
-    bool init(Encoder* cfg, RDCost* rdCost, TComTrQuant *trQuant);
+    bool init(Encoder& top, RDCost* rdCost, TComTrQuant *trQuant);
 
     uint32_t xModeBitsIntra(TComDataCU* cu, uint32_t mode, uint32_t partOffset, uint32_t depth);
     uint32_t xModeBitsRemIntra(TComDataCU * cu, uint32_t partOffset, uint32_t depth, uint32_t preds[3], uint64_t & mpms);
diff -r dadb646a7266 -r c569e43631fc source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/common/threadpool.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -156,7 +156,7 @@
         {
             // FindJob() may perform actual work and return true.  If
             // it does we restart the job search
-            if (cur->findJob() == true)
+            if (cur->findJob(m_id) == true)
                 break;
 
             cur = cur->m_nextProvider;
@@ -254,6 +254,7 @@
 {
     if (numThreads == 0)
         numThreads = getCpuCount();
+
     m_numSleepMapWords = (numThreads + 63) >> 6;
     m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
 
diff -r dadb646a7266 -r c569e43631fc source/common/threadpool.h
--- a/source/common/threadpool.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/common/threadpool.h	Sun Apr 27 13:56:32 2014 +0800
@@ -62,7 +62,7 @@
 
     // Worker threads will call this method to find a job.  Must return true if
     // work was completed.  False if no work was available.
-    virtual bool findJob() = 0;
+    virtual bool findJob(int threadId) = 0;
 
     // All derived objects that call Enqueue *MUST* call flush before allowing
     // their object to be destroyed, otherwise you will see random crashes involving
diff -r dadb646a7266 -r c569e43631fc source/common/wavefront.cpp
--- a/source/common/wavefront.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/common/wavefront.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -112,7 +112,7 @@
     return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval;
 }
 
-bool WaveFront::findJob()
+bool WaveFront::findJob(int threadId)
 {
     unsigned long id;
 
@@ -130,7 +130,7 @@
             if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval)
             {
                 // we cleared the bit, process row
-                processRow(w * 64 + id);
+                processRow(w * 64 + id, threadId);
                 return true;
             }
             // some other thread cleared the bit, try another bit
diff -r dadb646a7266 -r c569e43631fc source/common/wavefront.h
--- a/source/common/wavefront.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/common/wavefront.h	Sun Apr 27 13:56:32 2014 +0800
@@ -87,11 +87,11 @@
     // WaveFront's implementation of JobProvider::findJob. Consults
     // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
     // or returns false
-    bool findJob();
+    bool findJob(int threadId);
 
     // Start or resume encode processing of this row, must be implemented by
     // derived classes.
-    virtual void processRow(int row) = 0;
+    virtual void processRow(int row, int threadId) = 0;
 
     // Returns true if a row above curRow is available for processing.  The processRow()
     // method may call this function periodically and voluntarily exit
diff -r dadb646a7266 -r c569e43631fc source/encoder/cturow.cpp
--- a/source/encoder/cturow.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/cturow.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -24,17 +24,46 @@
  *****************************************************************************/
 
 #include "encoder.h"
+#include "frameencoder.h"
 #include "PPA/ppa.h"
 #include "cturow.h"
 
 using namespace x265;
 
-bool CTURow::create(Encoder* top)
+void ThreadLocalData::init(Encoder& enc)
+{
+    m_trQuant.init(1 << enc.m_quadtreeTULog2MaxSize, enc.bEnableRDOQ, enc.bEnableRDOQTS, !!enc.param->bEnableTSkipFast);
+    if (enc.m_useScalingListId == SCALING_LIST_OFF)
+    {
+        m_trQuant.setFlatScalingList();
+        m_trQuant.setUseScalingList(false);
+    }
+    else if (enc.m_useScalingListId == SCALING_LIST_DEFAULT)
+    {
+        m_trQuant.setScalingList(enc.getScalingList());
+        m_trQuant.setUseScalingList(true);
+    }
+
+    m_rdCost.setPsyRdScale(enc.param->rdLevel >= 4 ? enc.param->psyRd : 0);
+
+    m_search.init(enc, &m_rdCost, &m_trQuant);
+
+    m_cuCoder.init(enc);
+    m_cuCoder.setPredSearch(&m_search);
+    m_cuCoder.setTrQuant(&m_trQuant);
+    m_cuCoder.setRdCost(&m_rdCost);
+    m_cuCoder.create((uint8_t)g_maxCUDepth, g_maxCUSize);
+}
+
+ThreadLocalData::~ThreadLocalData()
+{
+    m_cuCoder.destroy();
+}
+
+bool CTURow::create()
 {
     m_rdGoOnSbacCoder.init(&m_rdGoOnBinCodersCABAC);
     m_sbacCoder.init(&m_binCoderCABAC);
-    m_trQuant.init(1 << top->m_quadtreeTULog2MaxSize, top->bEnableRDOQ, top->bEnableRDOQTS, !!top->param->bEnableTSkipFast);
-    m_rdCost.setPsyRdScale(top->param->rdLevel >= 4 ? top->param->psyRd : 0);
     m_rdSbacCoders = new TEncSbac * *[g_maxCUDepth + 1];
     m_binCodersCABAC = new TEncBinCABAC * *[g_maxCUDepth + 1];
 
@@ -51,22 +80,20 @@
         }
     }
 
-    m_cuCoder.init(top);
-    m_cuCoder.setRdCost(&m_rdCost);
-    m_cuCoder.setRDSbacCoder(m_rdSbacCoders);
-    m_cuCoder.setEntropyCoder(&m_entropyCoder);
-    m_cuCoder.setPredSearch(&m_search);
-    m_cuCoder.setTrQuant(&m_trQuant);
-    m_cuCoder.setRdCost(&m_rdCost);
-    m_search.setRDSbacCoder(m_rdSbacCoders);
-    m_search.setEntropyCoder(&m_entropyCoder);
-    m_search.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
-
-    return m_search.init(top, &m_rdCost, &m_trQuant) &&
-           m_cuCoder.create((uint8_t)g_maxCUDepth, g_maxCUSize);
+    /* TODO: check all mallocs */
+    return m_rdSbacCoders && m_binCodersCABAC;
 }
 
-void CTURow::processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSbac, bool bSaveSBac)
+void CTURow::setThreadLocalData(ThreadLocalData& tld)
+{
+    tld.m_cuCoder.setRDSbacCoder(m_rdSbacCoders);
+    tld.m_cuCoder.setEntropyCoder(&m_entropyCoder);
+    tld.m_search.setRDSbacCoder(m_rdSbacCoders);
+    tld.m_search.setEntropyCoder(&m_entropyCoder);
+    tld.m_search.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
+}
+
+void CTURow::processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSbac, ThreadLocalData& tld, bool bSaveSBac)
 {
     if (bufferSbac)
     {
@@ -76,17 +103,17 @@
 
     m_entropyCoder.setEntropyCoder(&m_rdGoOnSbacCoder, slice);
     m_entropyCoder.setBitstream(&m_bitCounter);
-    m_cuCoder.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
+    tld.m_cuCoder.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
 
-    m_cuCoder.compressCU(cu); // Does all the CU analysis
+    tld.m_cuCoder.compressCU(cu); // Does all the CU analysis
 
     // restore entropy coder to an initial state
     m_entropyCoder.setEntropyCoder(m_rdSbacCoders[0][CI_CURR_BEST], slice);
     m_entropyCoder.setBitstream(&m_bitCounter);
-    m_cuCoder.setBitCounter(&m_bitCounter);
+    tld.m_cuCoder.setBitCounter(&m_bitCounter);
     m_bitCounter.resetBits();
 
-    m_cuCoder.encodeCU(cu);  // Count bits
+    tld.m_cuCoder.encodeCU(cu);  // Count bits
 
     if (bSaveSBac)
     {
@@ -114,5 +141,4 @@
 
     delete[] m_rdSbacCoders;
     delete[] m_binCodersCABAC;
-    m_cuCoder.destroy();
 }
diff -r dadb646a7266 -r c569e43631fc source/encoder/cturow.h
--- a/source/encoder/cturow.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/cturow.h	Sun Apr 27 13:56:32 2014 +0800
@@ -40,6 +40,17 @@
 
 class Encoder;
 
+struct ThreadLocalData
+{
+    TEncSearch  m_search;
+    TEncCu      m_cuCoder;
+    RDCost      m_rdCost;
+    TComTrQuant m_trQuant;
+
+    void init(Encoder&);
+    ~ThreadLocalData();
+};
+
 /* manages the state of encoding one row of CTU blocks.  When
  * WPP is active, several rows will be simultaneously encoded.
  * When WPP is inactive, only one CTURow instance is used. */
@@ -55,15 +66,11 @@
     TEncBinCABAC           m_binCoderCABAC;
     TEncBinCABAC           m_rdGoOnBinCodersCABAC;
     TComBitCounter         m_bitCounter;
-    RDCost                 m_rdCost;
     TEncEntropy            m_entropyCoder;
-    TEncSearch             m_search;
-    TEncCu                 m_cuCoder;
-    TComTrQuant            m_trQuant;
     TEncSbac            ***m_rdSbacCoders;
     TEncBinCABAC        ***m_binCodersCABAC;
 
-    bool create(Encoder* top);
+    bool create();
 
     void destroy();
 
@@ -86,7 +93,9 @@
         m_rdGoOnSbacCoder.resetEntropy();
     }
 
-    void processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSBac, bool bSaveCabac);
+    void setThreadLocalData(ThreadLocalData& tld);
+
+    void processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSBac, ThreadLocalData& tld, bool bSaveCabac);
 
     /* Threading variables */
 
diff -r dadb646a7266 -r c569e43631fc source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/encoder.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -33,6 +33,7 @@
 
 #include "TLibEncoder/NALwrite.h"
 #include "bitcost.h"
+#include "cturow.h"
 #include "encoder.h"
 #include "slicetype.h"
 #include "frameencoder.h"
@@ -95,6 +96,19 @@
             m_frameEncoder[i].setThreadPool(m_threadPool);
         }
     }
+
+    /* Allocate thread local data shared by all frame encoders */
+    ThreadPool *pool = ThreadPool::getThreadPool();
+    const int poolThreadCount = pool ? pool->getThreadCount() : 1;
+    m_threadLocalData = new ThreadLocalData[poolThreadCount];
+    if (m_threadLocalData)
+    {
+        for (int i = 0; i < poolThreadCount; i++)
+            m_threadLocalData[i].init(*this);
+    }
+    else
+        m_aborted = true;
+
     m_lookahead = new Lookahead(this, m_threadPool);
     m_dpb = new DPB(this);
     m_rateControl = new RateControl(this);
@@ -143,6 +157,9 @@
         delete [] m_frameEncoder;
     }
 
+    if (m_threadLocalData)
+        delete [] m_threadLocalData;
+
     while (!m_freeList.empty())
     {
         TComPic* pic = m_freeList.popFront();
@@ -184,6 +201,7 @@
             }
         }
     }
+
     m_lookahead->init();
     m_encodeStartTime = x265_mdate();
 }
diff -r dadb646a7266 -r c569e43631fc source/encoder/encoder.h
--- a/source/encoder/encoder.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/encoder.h	Sun Apr 27 13:56:32 2014 +0800
@@ -68,6 +68,7 @@
 struct RateControl;
 class ThreadPool;
 struct NALUnitEBSP;
+struct ThreadLocalData;
 
 class Encoder : public x265_encoder
 {
@@ -183,6 +184,7 @@
 
     x265_param*        param;
     RateControl*       m_rateControl;
+    ThreadLocalData*   m_threadLocalData;
 
     bool               bEnableRDOQ;
     bool               bEnableRDOQTS;
diff -r dadb646a7266 -r c569e43631fc source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/frameencoder.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -36,12 +36,6 @@
 namespace x265 {
 void weightAnalyse(TComSlice& slice, x265_param& param);
 
-enum SCALING_LIST_PARAMETER
-{
-    SCALING_LIST_OFF,
-    SCALING_LIST_DEFAULT,
-};
-
 FrameEncoder::FrameEncoder()
     : WaveFront(NULL)
     , m_threadActive(true)
@@ -110,15 +104,7 @@
     m_rows = new CTURow[m_numRows];
     for (int i = 0; i < m_numRows; ++i)
     {
-        ok &= m_rows[i].create(top);
-
-        for (int list = 0; list <= 1; list++)
-        {
-            for (int ref = 0; ref <= MAX_NUM_REF; ref++)
-            {
-                m_rows[i].m_search.m_mref[list][ref] = &m_mref[list][ref];
-            }
-        }
+        ok &= m_rows[i].create();
     }
 
     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
@@ -128,6 +114,7 @@
         m_pool = NULL;
     }
 
+    m_tld.init(*top);
     m_frameFilter.init(top, numRows, getRDGoOnSbacCoder(0));
 
     // initialize SPS
@@ -153,23 +140,11 @@
     // set default slice level flag to the same as SPS level flag
     if (m_cfg->m_useScalingListId == SCALING_LIST_OFF)
     {
-        for (int i = 0; i < m_numRows; i++)
-        {
-            m_rows[i].m_trQuant.setFlatScalingList();
-            m_rows[i].m_trQuant.setUseScalingList(false);
-        }
-
         m_sps.setScalingListPresentFlag(false);
         m_pps.setScalingListPresentFlag(false);
     }
     else if (m_cfg->m_useScalingListId == SCALING_LIST_DEFAULT)
     {
-        for (int i = 0; i < m_numRows; i++)
-        {
-            m_rows[i].m_trQuant.setScalingList(m_top->getScalingList());
-            m_rows[i].m_trQuant.setUseScalingList(true);
-        }
-
         m_sps.setScalingListPresentFlag(false);
         m_pps.setScalingListPresentFlag(false);
     }
@@ -378,9 +353,10 @@
     while (m_threadActive);
 }
 
-void FrameEncoder::setLambda(int qp, int row)
+void FrameEncoder::setLambda(int qp, ThreadLocalData &tld)
 {
     TComSlice*  slice = m_pic->getSlice();
+    TComPicYuv* fenc  = slice->getPic()->getPicYuvOrg();
     int         chFmt = slice->getSPS()->getChromaFormatIdc();
 
     // for RDO
@@ -394,7 +370,9 @@
     qpc = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
     double crWeight = pow(2.0, (qp - g_chromaScale[chFmt][qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
 
-    m_rows[row].m_search.setQP(qp, crWeight, cbWeight);
+    tld.m_search.setQP(qp, crWeight, cbWeight);
+    tld.m_search.m_me.setQP(qp);
+    tld.m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
 }
 
 void FrameEncoder::compressFrame()
@@ -435,7 +413,6 @@
     int qpc;
     int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
     qpc = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
-    double cbWeight = pow(2.0, (qp - g_chromaScale[chFmt][qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
 
     chromaQPOffset = slice->getPPS()->getChromaCrQpOffset() + slice->getSliceQpDeltaCr();
     qpc = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
@@ -448,13 +425,6 @@
     m_frameFilter.m_sao.lumaLambda = lambda;
     m_frameFilter.m_sao.chromaLambda = chromaLambda;
 
-    TComPicYuv *fenc = slice->getPic()->getPicYuvOrg();
-    for (int i = 0; i < m_numRows; i++)
-    {
-        m_rows[i].m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
-        m_rows[i].m_search.setQP(qp, crWeight, cbWeight);
-    }
-
     m_frameFilter.m_sao.lumaLambda = lambda;
     m_frameFilter.m_sao.chromaLambda = chromaLambda;
     m_bAllRowsStop = false;
@@ -678,7 +648,7 @@
 
     {
         // Construct the final bitstream by flushing and concatenating substreams.
-        // The final bitstream is either nalu.m_bitstream or pcBitstreamRedirect;
+        // The final bitstream is either nalu.m_bitstream or bitstreamRedirect;
         uint32_t* substreamSizes = slice->getSubstreamSizes();
         for (int i = 0; i < numSubstreams; i++)
         {
@@ -801,7 +771,7 @@
 
     // Initialize slice singletons
     m_sbacCoder.init(&m_binCoderCABAC);
-    getCuEncoder(0)->setBitCounter(NULL);
+    m_tld.m_cuCoder.setBitCounter(NULL);
     entropyCoder->setEntropyCoder(&m_sbacCoder, slice);
 
     uint32_t cuAddr;
@@ -945,8 +915,8 @@
 #if ENC_DEC_TRACE
         g_bJustDoIt = g_bEncDecTraceEnable;
 #endif
-        getCuEncoder(0)->setEntropyCoder(entropyCoder);
-        getCuEncoder(0)->encodeCU(cu);
+        m_tld.m_cuCoder.setEntropyCoder(entropyCoder);
+        m_tld.m_cuCoder.encodeCU(cu);
 
 #if ENC_DEC_TRACE
         g_bJustDoIt = g_bEncDecTraceDisable;
@@ -1081,13 +1051,13 @@
                     }
                 }
 
-                processRow(i * 2 + 0);
+                processRow(i * 2 + 0, -1);
             }
 
             // Filter
             if (i >= m_filterRowDelay)
             {
-                processRow((i - m_filterRowDelay) * 2 + 1);
+                processRow((i - m_filterRowDelay) * 2 + 1, -1);
             }
         }
     }
@@ -1096,7 +1066,7 @@
 }
 
 // Called by worker threads
-void FrameEncoder::processRowEncoder(int row)
+void FrameEncoder::processRowEncoder(int row, const int threadId)
 {
     PPAScopeEvent(Thread_ProcessRow);
 
@@ -1117,22 +1087,35 @@
              * believe the problem is fixed, but are leaving this check in place
              * to prevent crashes in case it is not */
             x265_log(m_cfg->param, X265_LOG_WARNING,
-                     "internal error - simulaneous row access detected. Please report HW to x265-devel at videolan.org\n");
+                     "internal error - simultaneous row access detected. Please report HW to x265-devel at videolan.org\n");
             return;
         }
         curRow.m_busy = true;
     }
 
+    // setup thread-local data
+    ThreadLocalData& tld = threadId >= 0 ? m_cfg->m_threadLocalData[threadId] : m_tld;
+    tld.m_trQuant.m_nr = &m_nr;
+    for (int list = 0; list <= 1; list++)
+    {
+        for (int ref = 0; ref <= MAX_NUM_REF; ref++)
+        {
+            tld.m_search.m_mref[list][ref] = &m_mref[list][ref];
+        }
+    }
+    curRow.setThreadLocalData(tld);
+
     int64_t startTime = x265_mdate();
     const uint32_t numCols = m_pic->getPicSym()->getFrameWidthInCU();
     const uint32_t lineStartCUAddr = row * numCols;
     bool bIsVbv = m_cfg->param->rc.vbvBufferSize > 0 && m_cfg->param->rc.vbvMaxBitrate > 0;
+    if (!m_cfg->param->rc.aqMode && !bIsVbv)
+        setLambda(m_pic->getSlice()->getSliceQp(), tld);
 
     while (curRow.m_completed < numCols)
     {
         int col = curRow.m_completed;
         const uint32_t cuAddr = lineStartCUAddr + col;
-        curRow.m_trQuant.m_nr = &m_nr;
         TComDataCU* cu = m_pic->getCU(cuAddr);
         cu->initCU(m_pic, cuAddr);
         cu->setQPSubParts(m_pic->getSlice()->getSliceQp(), 0, 0);
@@ -1153,7 +1136,7 @@
         if (m_cfg->param->rc.aqMode || bIsVbv)
         {
             int qp = calcQpForCu(cuAddr, cu->m_baseQp);
-            setLambda(qp, row);
+            setLambda(qp, tld);
             qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
             cu->setQPSubParts(char(qp), 0, 0);
             if (m_cfg->param->rc.aqMode)
@@ -1163,7 +1146,7 @@
         TEncSbac *bufSbac = (m_cfg->param->bEnableWavefront && col == 0 && row > 0) ? &m_rows[row - 1].m_bufferSbacCoder : NULL;
         codeRow.m_entropyCoder.setEntropyCoder(&m_sbacCoder, m_pic->getSlice());
         codeRow.m_entropyCoder.resetEntropy();
-        codeRow.processCU(cu, m_pic->getSlice(), bufSbac, m_cfg->param->bEnableWavefront && col == 1);
+        codeRow.processCU(cu, m_pic->getSlice(), bufSbac, tld, m_cfg->param->bEnableWavefront && col == 1);
         // Completed CU processing
         curRow.m_completed++;
 
diff -r dadb646a7266 -r c569e43631fc source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/frameencoder.h	Sun Apr 27 13:56:32 2014 +0800
@@ -45,6 +45,12 @@
 namespace x265 {
 // private x265 namespace
 
+enum SCALING_LIST_PARAMETER
+{
+    SCALING_LIST_OFF,
+    SCALING_LIST_DEFAULT,
+};
+
 class ThreadPool;
 class Encoder;
 
@@ -63,7 +69,7 @@
 
     void destroy();
 
-    void processRowEncoder(int row);
+    void processRowEncoder(int row, const int threadId);
 
     void processRowFilter(int row)
     {
@@ -90,7 +96,7 @@
         WaveFront::enableRow(row * 2 + 1);
     }
 
-    void processRow(int row)
+    void processRow(int row, int threadId)
     {
         const int realRow = row >> 1;
         const int typeNum = row & 1;
@@ -98,7 +104,7 @@
         // TODO: use switch when more type
         if (typeNum == 0)
         {
-            processRowEncoder(realRow);
+            processRowEncoder(realRow, threadId);
         }
         else
         {
@@ -120,8 +126,6 @@
 
     TEncSbac*    getBufferSBac(int row)        { return &this->m_rows[row].m_bufferSbacCoder; }
 
-    TEncCu*      getCuEncoder(int row)         { return &this->m_rows[row].m_cuCoder; }
-
     /* Frame singletons, last the life of the encoder */
     TEncSampleAdaptiveOffset* getSAO()         { return &m_frameFilter.m_sao; }
 
@@ -149,7 +153,7 @@
     /* blocks until worker thread is done, returns encoded picture and bitstream */
     TComPic *getEncodedPicture(NALUnitEBSP **nalunits);
 
-    void setLambda(int qp, int row);
+    void setLambda(int qp, ThreadLocalData& tld);
 
     // worker thread
     void threadMain();
@@ -184,6 +188,7 @@
     FrameFilter              m_frameFilter;
     TComBitCounter           m_bitCounter;
     NoiseReduction           m_nr;
+    ThreadLocalData          m_tld;
 
     /* Picture being encoded, and its output NAL list */
     TComPic*                 m_pic;
diff -r dadb646a7266 -r c569e43631fc source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/slicetype.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -189,7 +189,7 @@
 }
 
 /* Called by pool worker threads */
-bool Lookahead::findJob()
+bool Lookahead::findJob(int)
 {
     if (bReady && ATOMIC_CAS32(&bReady, 1, 0) == 1)
     {
@@ -1289,7 +1289,7 @@
             enqueueRow(0);
             while (!bFrameCompleted)
             {
-                WaveFront::findJob();
+                WaveFront::findJob(-1);
             }
 
             WaveFront::dequeue();
@@ -1298,7 +1298,7 @@
         {
             for (int row = 0; row < heightInCU; row++)
             {
-                processRow(row);
+                processRow(row, -1);
             }
 
             x265_emms();
@@ -1455,7 +1455,7 @@
     }
 }
 
-void CostEstimate::processRow(int row)
+void CostEstimate::processRow(int row, const int /*threadId*/)
 {
     int realrow = heightInCU - 1 - row;
     Lowres **frames = curframes;
diff -r dadb646a7266 -r c569e43631fc source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/slicetype.h	Sun Apr 27 13:56:32 2014 +0800
@@ -110,7 +110,7 @@
     volatile bool    bFrameCompleted;
     int              curb, curp0, curp1;
 
-    void     processRow(int row);
+    void     processRow(int row, int threadId);
     int64_t  estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty);
 
 protected:
@@ -155,7 +155,7 @@
     volatile bool bFilling;
     volatile bool bFlushed;
 
-    bool findJob();
+    bool findJob(int);
 
     /* called by addPicture() or flush() to trigger slice decisions */
     void slicetypeDecide();
diff -r dadb646a7266 -r c569e43631fc source/test/testpool.cpp
--- a/source/test/testpool.cpp	Wed May 21 11:23:29 2014 -0500
+++ b/source/test/testpool.cpp	Sun Apr 27 13:56:32 2014 +0800
@@ -87,7 +87,7 @@
 
     void encode();
 
-    void processRow(int row);
+    void processRow(int row, int threadid);
 };
 
 void MD5Frame::initialize(int cols, int rows)
@@ -130,7 +130,7 @@
         std::cout << "Bad hash: " << ss.str() << std::endl;
 }
 
-void MD5Frame::processRow(int rownum)
+void MD5Frame::processRow(int rownum, int)
 {
     // Called by worker thread
     RowData &curRow = this->row[rownum];


More information about the x265-devel mailing list