[x265] [PATCH RFC] pool: allow thread private data structures, save memory allocations
Steve Borho
steve at borho.org
Wed May 21 21:27:10 CEST 2014
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1398578192 -28800
# Sun Apr 27 13:56:32 2014 +0800
# Node ID c569e43631fcc4bf44f4c79961ca847080a2aa5a
# Parent dadb646a7266f377cbfa33747ecb0a97f50a00f9
pool: allow thread private data structures, save memory allocations
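Pass the worker's threadId to JobProvider::findJob() and allow job providers to
use this ID as they see fit to keep thread-local data. Callers that are not pool
worker threads pass a threadId of -1; FrameEncoder then falls back to its own
private ThreadLocalData instance (m_tld).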
This feature is then used to allocate most CU analysis data per-thread instead
of per-row-per-frame-encoder. It should save a good chunk of memory.
This is a precursor to enabling more fine-grained parallelism.
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncCu.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -353,10 +353,10 @@
/** \param pcEncTop pointer of encoder class
*/
-void TEncCu::init(Encoder* top)
+void TEncCu::init(Encoder& top)
{
- m_param = top->param;
- m_CUTransquantBypassFlagValue = top->m_CUTransquantBypassFlagValue;
+ m_param = top.param;
+ m_CUTransquantBypassFlagValue = top.m_CUTransquantBypassFlagValue;
}
// ====================================================================================================================
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncCu.h
--- a/source/Lib/TLibEncoder/TEncCu.h Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncCu.h Sun Apr 27 13:56:32 2014 +0800
@@ -135,7 +135,7 @@
#endif
TEncCu();
- void init(Encoder* top);
+ void init(Encoder& top);
bool create(uint8_t totalDepth, uint32_t maxWidth);
void destroy();
void compressCU(TComDataCU* cu);
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -95,21 +95,21 @@
m_qtTempTransformSkipYuv.destroy();
}
-bool TEncSearch::init(Encoder* cfg, RDCost* rdCost, TComTrQuant* trQuant)
+bool TEncSearch::init(Encoder& enc, RDCost* rdCost, TComTrQuant* trQuant)
{
- m_cfg = cfg;
+ m_cfg = &enc;
m_trQuant = trQuant;
m_rdCost = rdCost;
- initTempBuff(cfg->param->internalCsp);
- m_me.setSearchMethod(cfg->param->searchMethod);
- m_me.setSubpelRefine(cfg->param->subpelRefine);
+ initTempBuff(enc.param->internalCsp);
+ m_me.setSearchMethod(enc.param->searchMethod);
+ m_me.setSubpelRefine(enc.param->subpelRefine);
/* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
* available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
- m_refLagPixels = cfg->param->frameNumThreads > 1 ? cfg->param->searchRange : cfg->param->sourceHeight;
-
- const uint32_t numLayersToAllocate = cfg->m_quadtreeTULog2MaxSize - cfg->m_quadtreeTULog2MinSize + 1;
+ m_refLagPixels = enc.param->frameNumThreads > 1 ? enc.param->searchRange : enc.param->sourceHeight;
+
+ const uint32_t numLayersToAllocate = enc.m_quadtreeTULog2MaxSize - enc.m_quadtreeTULog2MinSize + 1;
m_qtTempCoeffY = new coeff_t*[numLayersToAllocate * 3];
m_qtTempCoeffCb = m_qtTempCoeffY + numLayersToAllocate;
m_qtTempCoeffCr = m_qtTempCoeffY + numLayersToAllocate * 2;
@@ -121,7 +121,7 @@
m_qtTempCoeffY[i] = X265_MALLOC(coeff_t, sizeL + sizeC * 2);
m_qtTempCoeffCb[i] = m_qtTempCoeffY[i] + sizeL;
m_qtTempCoeffCr[i] = m_qtTempCoeffY[i] + sizeL + sizeC;
- m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
+ m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, enc.param->internalCsp);
}
const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
@@ -137,7 +137,7 @@
m_qtTempTUCoeffCb = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT;
m_qtTempTUCoeffCr = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT * 2;
- return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
+ return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, enc.param->internalCsp);
fail:
return false;
@@ -462,7 +462,7 @@
int lastPos = -1;
cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
- int chFmt = cu->getChromaFormat();
+ int chFmt = cu->getChromaFormat();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
m_trQuant->selectLambda(TEXT_LUMA);
diff -r dadb646a7266 -r c569e43631fc source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h Wed May 21 11:23:29 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.h Sun Apr 27 13:56:32 2014 +0800
@@ -130,6 +130,7 @@
// interface to option
Encoder* m_cfg;
+public:
// interface to classes
TComTrQuant* m_trQuant;
RDCost* m_rdCost;
@@ -151,7 +152,7 @@
TEncSearch();
virtual ~TEncSearch();
- bool init(Encoder* cfg, RDCost* rdCost, TComTrQuant *trQuant);
+ bool init(Encoder& top, RDCost* rdCost, TComTrQuant *trQuant);
uint32_t xModeBitsIntra(TComDataCU* cu, uint32_t mode, uint32_t partOffset, uint32_t depth);
uint32_t xModeBitsRemIntra(TComDataCU * cu, uint32_t partOffset, uint32_t depth, uint32_t preds[3], uint64_t & mpms);
diff -r dadb646a7266 -r c569e43631fc source/common/threadpool.cpp
--- a/source/common/threadpool.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/common/threadpool.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -156,7 +156,7 @@
{
// FindJob() may perform actual work and return true. If
// it does we restart the job search
- if (cur->findJob() == true)
+ if (cur->findJob(m_id) == true)
break;
cur = cur->m_nextProvider;
@@ -254,6 +254,7 @@
{
if (numThreads == 0)
numThreads = getCpuCount();
+
m_numSleepMapWords = (numThreads + 63) >> 6;
m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
diff -r dadb646a7266 -r c569e43631fc source/common/threadpool.h
--- a/source/common/threadpool.h Wed May 21 11:23:29 2014 -0500
+++ b/source/common/threadpool.h Sun Apr 27 13:56:32 2014 +0800
@@ -62,7 +62,7 @@
// Worker threads will call this method to find a job. Must return true if
// work was completed. False if no work was available.
- virtual bool findJob() = 0;
+ virtual bool findJob(int threadId) = 0;
// All derived objects that call Enqueue *MUST* call flush before allowing
// their object to be destroyed, otherwise you will see random crashes involving
diff -r dadb646a7266 -r c569e43631fc source/common/wavefront.cpp
--- a/source/common/wavefront.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/common/wavefront.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -112,7 +112,7 @@
return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval;
}
-bool WaveFront::findJob()
+bool WaveFront::findJob(int threadId)
{
unsigned long id;
@@ -130,7 +130,7 @@
if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval)
{
// we cleared the bit, process row
- processRow(w * 64 + id);
+ processRow(w * 64 + id, threadId);
return true;
}
// some other thread cleared the bit, try another bit
diff -r dadb646a7266 -r c569e43631fc source/common/wavefront.h
--- a/source/common/wavefront.h Wed May 21 11:23:29 2014 -0500
+++ b/source/common/wavefront.h Sun Apr 27 13:56:32 2014 +0800
@@ -87,11 +87,11 @@
// WaveFront's implementation of JobProvider::findJob. Consults
// m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
// or returns false
- bool findJob();
+ bool findJob(int threadId);
// Start or resume encode processing of this row, must be implemented by
// derived classes.
- virtual void processRow(int row) = 0;
+ virtual void processRow(int row, int threadId) = 0;
// Returns true if a row above curRow is available for processing. The processRow()
// method may call this function periodically and voluntarily exit
diff -r dadb646a7266 -r c569e43631fc source/encoder/cturow.cpp
--- a/source/encoder/cturow.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/cturow.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -24,17 +24,46 @@
*****************************************************************************/
#include "encoder.h"
+#include "frameencoder.h"
#include "PPA/ppa.h"
#include "cturow.h"
using namespace x265;
-bool CTURow::create(Encoder* top)
+void ThreadLocalData::init(Encoder& enc)
+{
+ m_trQuant.init(1 << enc.m_quadtreeTULog2MaxSize, enc.bEnableRDOQ, enc.bEnableRDOQTS, !!enc.param->bEnableTSkipFast);
+ if (enc.m_useScalingListId == SCALING_LIST_OFF)
+ {
+ m_trQuant.setFlatScalingList();
+ m_trQuant.setUseScalingList(false);
+ }
+ else if (enc.m_useScalingListId == SCALING_LIST_DEFAULT)
+ {
+ m_trQuant.setScalingList(enc.getScalingList());
+ m_trQuant.setUseScalingList(true);
+ }
+
+ m_rdCost.setPsyRdScale(enc.param->rdLevel >= 4 ? enc.param->psyRd : 0);
+
+ m_search.init(enc, &m_rdCost, &m_trQuant);
+
+ m_cuCoder.init(enc);
+ m_cuCoder.setPredSearch(&m_search);
+ m_cuCoder.setTrQuant(&m_trQuant);
+ m_cuCoder.setRdCost(&m_rdCost);
+ m_cuCoder.create((uint8_t)g_maxCUDepth, g_maxCUSize);
+}
+
+ThreadLocalData::~ThreadLocalData()
+{
+ m_cuCoder.destroy();
+}
+
+bool CTURow::create()
{
m_rdGoOnSbacCoder.init(&m_rdGoOnBinCodersCABAC);
m_sbacCoder.init(&m_binCoderCABAC);
- m_trQuant.init(1 << top->m_quadtreeTULog2MaxSize, top->bEnableRDOQ, top->bEnableRDOQTS, !!top->param->bEnableTSkipFast);
- m_rdCost.setPsyRdScale(top->param->rdLevel >= 4 ? top->param->psyRd : 0);
m_rdSbacCoders = new TEncSbac * *[g_maxCUDepth + 1];
m_binCodersCABAC = new TEncBinCABAC * *[g_maxCUDepth + 1];
@@ -51,22 +80,20 @@
}
}
- m_cuCoder.init(top);
- m_cuCoder.setRdCost(&m_rdCost);
- m_cuCoder.setRDSbacCoder(m_rdSbacCoders);
- m_cuCoder.setEntropyCoder(&m_entropyCoder);
- m_cuCoder.setPredSearch(&m_search);
- m_cuCoder.setTrQuant(&m_trQuant);
- m_cuCoder.setRdCost(&m_rdCost);
- m_search.setRDSbacCoder(m_rdSbacCoders);
- m_search.setEntropyCoder(&m_entropyCoder);
- m_search.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
-
- return m_search.init(top, &m_rdCost, &m_trQuant) &&
- m_cuCoder.create((uint8_t)g_maxCUDepth, g_maxCUSize);
+ /* TODO: check all mallocs */
+ return m_rdSbacCoders && m_binCodersCABAC;
}
-void CTURow::processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSbac, bool bSaveSBac)
+void CTURow::setThreadLocalData(ThreadLocalData& tld)
+{
+ tld.m_cuCoder.setRDSbacCoder(m_rdSbacCoders);
+ tld.m_cuCoder.setEntropyCoder(&m_entropyCoder);
+ tld.m_search.setRDSbacCoder(m_rdSbacCoders);
+ tld.m_search.setEntropyCoder(&m_entropyCoder);
+ tld.m_search.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
+}
+
+void CTURow::processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSbac, ThreadLocalData& tld, bool bSaveSBac)
{
if (bufferSbac)
{
@@ -76,17 +103,17 @@
m_entropyCoder.setEntropyCoder(&m_rdGoOnSbacCoder, slice);
m_entropyCoder.setBitstream(&m_bitCounter);
- m_cuCoder.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
+ tld.m_cuCoder.setRDGoOnSbacCoder(&m_rdGoOnSbacCoder);
- m_cuCoder.compressCU(cu); // Does all the CU analysis
+ tld.m_cuCoder.compressCU(cu); // Does all the CU analysis
// restore entropy coder to an initial state
m_entropyCoder.setEntropyCoder(m_rdSbacCoders[0][CI_CURR_BEST], slice);
m_entropyCoder.setBitstream(&m_bitCounter);
- m_cuCoder.setBitCounter(&m_bitCounter);
+ tld.m_cuCoder.setBitCounter(&m_bitCounter);
m_bitCounter.resetBits();
- m_cuCoder.encodeCU(cu); // Count bits
+ tld.m_cuCoder.encodeCU(cu); // Count bits
if (bSaveSBac)
{
@@ -114,5 +141,4 @@
delete[] m_rdSbacCoders;
delete[] m_binCodersCABAC;
- m_cuCoder.destroy();
}
diff -r dadb646a7266 -r c569e43631fc source/encoder/cturow.h
--- a/source/encoder/cturow.h Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/cturow.h Sun Apr 27 13:56:32 2014 +0800
@@ -40,6 +40,17 @@
class Encoder;
+struct ThreadLocalData
+{
+ TEncSearch m_search;
+ TEncCu m_cuCoder;
+ RDCost m_rdCost;
+ TComTrQuant m_trQuant;
+
+ void init(Encoder&);
+ ~ThreadLocalData();
+};
+
/* manages the state of encoding one row of CTU blocks. When
* WPP is active, several rows will be simultaneously encoded.
* When WPP is inactive, only one CTURow instance is used. */
@@ -55,15 +66,11 @@
TEncBinCABAC m_binCoderCABAC;
TEncBinCABAC m_rdGoOnBinCodersCABAC;
TComBitCounter m_bitCounter;
- RDCost m_rdCost;
TEncEntropy m_entropyCoder;
- TEncSearch m_search;
- TEncCu m_cuCoder;
- TComTrQuant m_trQuant;
TEncSbac ***m_rdSbacCoders;
TEncBinCABAC ***m_binCodersCABAC;
- bool create(Encoder* top);
+ bool create();
void destroy();
@@ -86,7 +93,9 @@
m_rdGoOnSbacCoder.resetEntropy();
}
- void processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSBac, bool bSaveCabac);
+ void setThreadLocalData(ThreadLocalData& tld);
+
+ void processCU(TComDataCU *cu, TComSlice *slice, TEncSbac *bufferSBac, ThreadLocalData& tld, bool bSaveCabac);
/* Threading variables */
diff -r dadb646a7266 -r c569e43631fc source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/encoder.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -33,6 +33,7 @@
#include "TLibEncoder/NALwrite.h"
#include "bitcost.h"
+#include "cturow.h"
#include "encoder.h"
#include "slicetype.h"
#include "frameencoder.h"
@@ -95,6 +96,19 @@
m_frameEncoder[i].setThreadPool(m_threadPool);
}
}
+
+ /* Allocate thread local data shared by all frame encoders */
+ ThreadPool *pool = ThreadPool::getThreadPool();
+ const int poolThreadCount = pool ? pool->getThreadCount() : 1;
+ m_threadLocalData = new ThreadLocalData[poolThreadCount];
+ if (m_threadLocalData)
+ {
+ for (int i = 0; i < poolThreadCount; i++)
+ m_threadLocalData[i].init(*this);
+ }
+ else
+ m_aborted = true;
+
m_lookahead = new Lookahead(this, m_threadPool);
m_dpb = new DPB(this);
m_rateControl = new RateControl(this);
@@ -143,6 +157,9 @@
delete [] m_frameEncoder;
}
+ if (m_threadLocalData)
+ delete [] m_threadLocalData;
+
while (!m_freeList.empty())
{
TComPic* pic = m_freeList.popFront();
@@ -184,6 +201,7 @@
}
}
}
+
m_lookahead->init();
m_encodeStartTime = x265_mdate();
}
diff -r dadb646a7266 -r c569e43631fc source/encoder/encoder.h
--- a/source/encoder/encoder.h Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/encoder.h Sun Apr 27 13:56:32 2014 +0800
@@ -68,6 +68,7 @@
struct RateControl;
class ThreadPool;
struct NALUnitEBSP;
+struct ThreadLocalData;
class Encoder : public x265_encoder
{
@@ -183,6 +184,7 @@
x265_param* param;
RateControl* m_rateControl;
+ ThreadLocalData* m_threadLocalData;
bool bEnableRDOQ;
bool bEnableRDOQTS;
diff -r dadb646a7266 -r c569e43631fc source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/frameencoder.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -36,12 +36,6 @@
namespace x265 {
void weightAnalyse(TComSlice& slice, x265_param& param);
-enum SCALING_LIST_PARAMETER
-{
- SCALING_LIST_OFF,
- SCALING_LIST_DEFAULT,
-};
-
FrameEncoder::FrameEncoder()
: WaveFront(NULL)
, m_threadActive(true)
@@ -110,15 +104,7 @@
m_rows = new CTURow[m_numRows];
for (int i = 0; i < m_numRows; ++i)
{
- ok &= m_rows[i].create(top);
-
- for (int list = 0; list <= 1; list++)
- {
- for (int ref = 0; ref <= MAX_NUM_REF; ref++)
- {
- m_rows[i].m_search.m_mref[list][ref] = &m_mref[list][ref];
- }
- }
+ ok &= m_rows[i].create();
}
// NOTE: 2 times of numRows because both Encoder and Filter in same queue
@@ -128,6 +114,7 @@
m_pool = NULL;
}
+ m_tld.init(*top);
m_frameFilter.init(top, numRows, getRDGoOnSbacCoder(0));
// initialize SPS
@@ -153,23 +140,11 @@
// set default slice level flag to the same as SPS level flag
if (m_cfg->m_useScalingListId == SCALING_LIST_OFF)
{
- for (int i = 0; i < m_numRows; i++)
- {
- m_rows[i].m_trQuant.setFlatScalingList();
- m_rows[i].m_trQuant.setUseScalingList(false);
- }
-
m_sps.setScalingListPresentFlag(false);
m_pps.setScalingListPresentFlag(false);
}
else if (m_cfg->m_useScalingListId == SCALING_LIST_DEFAULT)
{
- for (int i = 0; i < m_numRows; i++)
- {
- m_rows[i].m_trQuant.setScalingList(m_top->getScalingList());
- m_rows[i].m_trQuant.setUseScalingList(true);
- }
-
m_sps.setScalingListPresentFlag(false);
m_pps.setScalingListPresentFlag(false);
}
@@ -378,9 +353,10 @@
while (m_threadActive);
}
-void FrameEncoder::setLambda(int qp, int row)
+void FrameEncoder::setLambda(int qp, ThreadLocalData &tld)
{
TComSlice* slice = m_pic->getSlice();
+ TComPicYuv* fenc = slice->getPic()->getPicYuvOrg();
int chFmt = slice->getSPS()->getChromaFormatIdc();
// for RDO
@@ -394,7 +370,9 @@
qpc = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
double crWeight = pow(2.0, (qp - g_chromaScale[chFmt][qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
- m_rows[row].m_search.setQP(qp, crWeight, cbWeight);
+ tld.m_search.setQP(qp, crWeight, cbWeight);
+ tld.m_search.m_me.setQP(qp);
+ tld.m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
}
void FrameEncoder::compressFrame()
@@ -435,7 +413,6 @@
int qpc;
int chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
qpc = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
- double cbWeight = pow(2.0, (qp - g_chromaScale[chFmt][qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
chromaQPOffset = slice->getPPS()->getChromaCrQpOffset() + slice->getSliceQpDeltaCr();
qpc = Clip3(0, MAX_MAX_QP, qp + chromaQPOffset);
@@ -448,13 +425,6 @@
m_frameFilter.m_sao.lumaLambda = lambda;
m_frameFilter.m_sao.chromaLambda = chromaLambda;
- TComPicYuv *fenc = slice->getPic()->getPicYuvOrg();
- for (int i = 0; i < m_numRows; i++)
- {
- m_rows[i].m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
- m_rows[i].m_search.setQP(qp, crWeight, cbWeight);
- }
-
m_frameFilter.m_sao.lumaLambda = lambda;
m_frameFilter.m_sao.chromaLambda = chromaLambda;
m_bAllRowsStop = false;
@@ -678,7 +648,7 @@
{
// Construct the final bitstream by flushing and concatenating substreams.
- // The final bitstream is either nalu.m_bitstream or pcBitstreamRedirect;
+ // The final bitstream is either nalu.m_bitstream or bitstreamRedirect;
uint32_t* substreamSizes = slice->getSubstreamSizes();
for (int i = 0; i < numSubstreams; i++)
{
@@ -801,7 +771,7 @@
// Initialize slice singletons
m_sbacCoder.init(&m_binCoderCABAC);
- getCuEncoder(0)->setBitCounter(NULL);
+ m_tld.m_cuCoder.setBitCounter(NULL);
entropyCoder->setEntropyCoder(&m_sbacCoder, slice);
uint32_t cuAddr;
@@ -945,8 +915,8 @@
#if ENC_DEC_TRACE
g_bJustDoIt = g_bEncDecTraceEnable;
#endif
- getCuEncoder(0)->setEntropyCoder(entropyCoder);
- getCuEncoder(0)->encodeCU(cu);
+ m_tld.m_cuCoder.setEntropyCoder(entropyCoder);
+ m_tld.m_cuCoder.encodeCU(cu);
#if ENC_DEC_TRACE
g_bJustDoIt = g_bEncDecTraceDisable;
@@ -1081,13 +1051,13 @@
}
}
- processRow(i * 2 + 0);
+ processRow(i * 2 + 0, -1);
}
// Filter
if (i >= m_filterRowDelay)
{
- processRow((i - m_filterRowDelay) * 2 + 1);
+ processRow((i - m_filterRowDelay) * 2 + 1, -1);
}
}
}
@@ -1096,7 +1066,7 @@
}
// Called by worker threads
-void FrameEncoder::processRowEncoder(int row)
+void FrameEncoder::processRowEncoder(int row, const int threadId)
{
PPAScopeEvent(Thread_ProcessRow);
@@ -1117,22 +1087,35 @@
* believe the problem is fixed, but are leaving this check in place
* to prevent crashes in case it is not */
x265_log(m_cfg->param, X265_LOG_WARNING,
- "internal error - simulaneous row access detected. Please report HW to x265-devel at videolan.org\n");
+ "internal error - simultaneous row access detected. Please report HW to x265-devel at videolan.org\n");
return;
}
curRow.m_busy = true;
}
+ // setup thread-local data
+ ThreadLocalData& tld = threadId >= 0 ? m_cfg->m_threadLocalData[threadId] : m_tld;
+ tld.m_trQuant.m_nr = &m_nr;
+ for (int list = 0; list <= 1; list++)
+ {
+ for (int ref = 0; ref <= MAX_NUM_REF; ref++)
+ {
+ tld.m_search.m_mref[list][ref] = &m_mref[list][ref];
+ }
+ }
+ curRow.setThreadLocalData(tld);
+
int64_t startTime = x265_mdate();
const uint32_t numCols = m_pic->getPicSym()->getFrameWidthInCU();
const uint32_t lineStartCUAddr = row * numCols;
bool bIsVbv = m_cfg->param->rc.vbvBufferSize > 0 && m_cfg->param->rc.vbvMaxBitrate > 0;
+ if (!m_cfg->param->rc.aqMode && !bIsVbv)
+ setLambda(m_pic->getSlice()->getSliceQp(), tld);
while (curRow.m_completed < numCols)
{
int col = curRow.m_completed;
const uint32_t cuAddr = lineStartCUAddr + col;
- curRow.m_trQuant.m_nr = &m_nr;
TComDataCU* cu = m_pic->getCU(cuAddr);
cu->initCU(m_pic, cuAddr);
cu->setQPSubParts(m_pic->getSlice()->getSliceQp(), 0, 0);
@@ -1153,7 +1136,7 @@
if (m_cfg->param->rc.aqMode || bIsVbv)
{
int qp = calcQpForCu(cuAddr, cu->m_baseQp);
- setLambda(qp, row);
+ setLambda(qp, tld);
qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
cu->setQPSubParts(char(qp), 0, 0);
if (m_cfg->param->rc.aqMode)
@@ -1163,7 +1146,7 @@
TEncSbac *bufSbac = (m_cfg->param->bEnableWavefront && col == 0 && row > 0) ? &m_rows[row - 1].m_bufferSbacCoder : NULL;
codeRow.m_entropyCoder.setEntropyCoder(&m_sbacCoder, m_pic->getSlice());
codeRow.m_entropyCoder.resetEntropy();
- codeRow.processCU(cu, m_pic->getSlice(), bufSbac, m_cfg->param->bEnableWavefront && col == 1);
+ codeRow.processCU(cu, m_pic->getSlice(), bufSbac, tld, m_cfg->param->bEnableWavefront && col == 1);
// Completed CU processing
curRow.m_completed++;
diff -r dadb646a7266 -r c569e43631fc source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/frameencoder.h Sun Apr 27 13:56:32 2014 +0800
@@ -45,6 +45,12 @@
namespace x265 {
// private x265 namespace
+enum SCALING_LIST_PARAMETER
+{
+ SCALING_LIST_OFF,
+ SCALING_LIST_DEFAULT,
+};
+
class ThreadPool;
class Encoder;
@@ -63,7 +69,7 @@
void destroy();
- void processRowEncoder(int row);
+ void processRowEncoder(int row, const int threadId);
void processRowFilter(int row)
{
@@ -90,7 +96,7 @@
WaveFront::enableRow(row * 2 + 1);
}
- void processRow(int row)
+ void processRow(int row, int threadId)
{
const int realRow = row >> 1;
const int typeNum = row & 1;
@@ -98,7 +104,7 @@
// TODO: use switch when more type
if (typeNum == 0)
{
- processRowEncoder(realRow);
+ processRowEncoder(realRow, threadId);
}
else
{
@@ -120,8 +126,6 @@
TEncSbac* getBufferSBac(int row) { return &this->m_rows[row].m_bufferSbacCoder; }
- TEncCu* getCuEncoder(int row) { return &this->m_rows[row].m_cuCoder; }
-
/* Frame singletons, last the life of the encoder */
TEncSampleAdaptiveOffset* getSAO() { return &m_frameFilter.m_sao; }
@@ -149,7 +153,7 @@
/* blocks until worker thread is done, returns encoded picture and bitstream */
TComPic *getEncodedPicture(NALUnitEBSP **nalunits);
- void setLambda(int qp, int row);
+ void setLambda(int qp, ThreadLocalData& tld);
// worker thread
void threadMain();
@@ -184,6 +188,7 @@
FrameFilter m_frameFilter;
TComBitCounter m_bitCounter;
NoiseReduction m_nr;
+ ThreadLocalData m_tld;
/* Picture being encoded, and its output NAL list */
TComPic* m_pic;
diff -r dadb646a7266 -r c569e43631fc source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/slicetype.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -189,7 +189,7 @@
}
/* Called by pool worker threads */
-bool Lookahead::findJob()
+bool Lookahead::findJob(int)
{
if (bReady && ATOMIC_CAS32(&bReady, 1, 0) == 1)
{
@@ -1289,7 +1289,7 @@
enqueueRow(0);
while (!bFrameCompleted)
{
- WaveFront::findJob();
+ WaveFront::findJob(-1);
}
WaveFront::dequeue();
@@ -1298,7 +1298,7 @@
{
for (int row = 0; row < heightInCU; row++)
{
- processRow(row);
+ processRow(row, -1);
}
x265_emms();
@@ -1455,7 +1455,7 @@
}
}
-void CostEstimate::processRow(int row)
+void CostEstimate::processRow(int row, const int /*threadId*/)
{
int realrow = heightInCU - 1 - row;
Lowres **frames = curframes;
diff -r dadb646a7266 -r c569e43631fc source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Wed May 21 11:23:29 2014 -0500
+++ b/source/encoder/slicetype.h Sun Apr 27 13:56:32 2014 +0800
@@ -110,7 +110,7 @@
volatile bool bFrameCompleted;
int curb, curp0, curp1;
- void processRow(int row);
+ void processRow(int row, int threadId);
int64_t estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty);
protected:
@@ -155,7 +155,7 @@
volatile bool bFilling;
volatile bool bFlushed;
- bool findJob();
+ bool findJob(int);
/* called by addPicture() or flush() to trigger slice decisions */
void slicetypeDecide();
diff -r dadb646a7266 -r c569e43631fc source/test/testpool.cpp
--- a/source/test/testpool.cpp Wed May 21 11:23:29 2014 -0500
+++ b/source/test/testpool.cpp Sun Apr 27 13:56:32 2014 +0800
@@ -87,7 +87,7 @@
void encode();
- void processRow(int row);
+ void processRow(int row, int threadid);
};
void MD5Frame::initialize(int cols, int rows)
@@ -130,7 +130,7 @@
std::cout << "Bad hash: " << ss.str() << std::endl;
}
-void MD5Frame::processRow(int rownum)
+void MD5Frame::processRow(int rownum, int)
{
// Called by worker thread
RowData &curRow = this->row[rownum];
More information about the x265-devel
mailing list