[x265] [PATCH] Performance: Enabling recon frames to be NUMA-aware when the frame encoder thread creates them
Pradeep
pradeep at multicorewareinc.com
Wed Aug 5 16:05:29 CEST 2015
# HG changeset patch
# User Pradeep <pradeep at multicorewareinc.com>
# Date 1438704601 0
# Tue Aug 04 16:10:01 2015 +0000
# Node ID 0206efdac228891f348c8d6c7ad7ced369c840a3
# Parent 0c1f9d98294454d3bf896aeb24be881d8aa53434
Performance: Enable recon frames to be NUMA-aware when the
frame encoder thread creates them. We see a considerable reduction in
the number of cross-socket accesses, but the impact on performance of
the sample videos is rather small.
diff -r 0c1f9d982944 -r 0206efdac228 source/common/frame.cpp
--- a/source/common/frame.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/frame.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -51,10 +51,34 @@
m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
}
-bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
+bool Frame::allocEncodeData(x265_param *param, const SPS& sps, const int numaNode)
{
- m_encData = new FrameData;
- m_reconPic = new PicYuv;
+ int selNumaNode = numaNode ;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+ GROUP_AFFINITY groupAffinity;
+ if (GetNumaNodeProcessorMaskEx((USHORT)selNumaNode, &groupAffinity)) {
+ if(VirtualAllocExNuma(GetCurrentProcess(),
+ NULL,
+ sizeof(FrameData)+sizeof(PicYuv),
+ MEM_COMMIT,
+ PAGE_READWRITE,
+ selNumaNode)) {
+ // Successful commit, do nothing
+ }
+ }
+#elif HAVE_LIBNUMA
+ if(numa_available() >= 0) {
+ numa_set_preferred(selNumaNode) ;
+ numa_set_localalloc() ;
+ } else {
+ selNumaNode = -1 ;
+ }
+#else
+ selNumaNode = -1 ;
+#endif // HAVE_LIBNUMA
+
+ m_encData = new FrameData(selNumaNode) ;
+ m_reconPic = new PicYuv(selNumaNode) ;
m_encData->m_reconPic = m_reconPic;
bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
if (ok)
diff -r 0c1f9d982944 -r 0206efdac228 source/common/frame.h
--- a/source/common/frame.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/frame.h Tue Aug 04 16:10:01 2015 +0000
@@ -28,6 +28,10 @@
#include "lowres.h"
#include "threading.h"
+#if HAVE_LIBNUMA
+#include <numa.h>
+#endif // HAVE_LIBNUMA
+
namespace X265_NS {
// private namespace
@@ -67,10 +71,11 @@
Frame* m_prev;
x265_param* m_param; // Points to the latest param set for the frame.
x265_analysis_data m_analysisData;
+
Frame();
bool create(x265_param *param);
- bool allocEncodeData(x265_param *param, const SPS& sps);
+ bool allocEncodeData(x265_param *param, const SPS& sps, const int numaNode);
void reinit(const SPS& sps);
void destroy();
};
diff -r 0c1f9d982944 -r 0206efdac228 source/common/framedata.cpp
--- a/source/common/framedata.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/framedata.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -26,9 +26,10 @@
using namespace X265_NS;
-FrameData::FrameData()
+FrameData::FrameData(int numaNode)
{
memset(this, 0, sizeof(*this));
+ m_numaNode = numaNode ;
}
bool FrameData::create(x265_param *param, const SPS& sps)
diff -r 0c1f9d982944 -r 0206efdac228 source/common/framedata.h
--- a/source/common/framedata.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/framedata.h Tue Aug 04 16:10:01 2015 +0000
@@ -107,6 +107,8 @@
CUDataMemPool m_cuMemPool;
CUData* m_picCTU;
+ int m_numaNode ;
+
/* Rate control data used during encode and by references */
struct RCStatCU
{
@@ -140,7 +142,7 @@
double m_avgQpAq; /* avg QP as decided by AQ in addition to rate-control */
double m_rateFactor; /* calculated based on the Frame QP */
- FrameData();
+ FrameData(int numaNode=-1);
bool create(x265_param *param, const SPS& sps);
void reinit(const SPS& sps);
diff -r 0c1f9d982944 -r 0206efdac228 source/common/param.cpp
--- a/source/common/param.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/param.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -855,6 +855,7 @@
OPT("qg-size") p->rc.qgSize = atoi(value);
OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
OPT("max-cll") p->contentLightLevelInfo = strdup(value);
+ OPT("print-numa-stats") p->printNumaStats = atobool(value) ;
else
return X265_PARAM_BAD_NAME;
#undef OPT
diff -r 0c1f9d982944 -r 0206efdac228 source/common/picyuv.cpp
--- a/source/common/picyuv.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/picyuv.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -28,7 +28,8 @@
using namespace X265_NS;
-PicYuv::PicYuv()
+PicYuv::PicYuv(int numaNode):
+ m_numaNode(numaNode)
{
m_picBuf[0] = NULL;
m_picBuf[1] = NULL;
diff -r 0c1f9d982944 -r 0206efdac228 source/common/picyuv.h
--- a/source/common/picyuv.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/picyuv.h Tue Aug 04 16:10:01 2015 +0000
@@ -59,8 +59,9 @@
uint32_t m_lumaMarginY;
uint32_t m_chromaMarginX;
uint32_t m_chromaMarginY;
+ int32_t m_numaNode ;
- PicYuv();
+ PicYuv(int numaNode=-1);
bool create(uint32_t picWidth, uint32_t picHeight, uint32_t csp);
bool createOffsets(const SPS& sps);
diff -r 0c1f9d982944 -r 0206efdac228 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/threadpool.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -338,6 +338,7 @@
ThreadPool::ThreadPool()
{
memset(this, 0, sizeof(*this));
+ m_numaNode = -1 ;
}
bool ThreadPool::create(int numThreads, int maxProviders, int node)
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/dpb.cpp
--- a/source/encoder/dpb.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/dpb.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -58,6 +58,23 @@
delete m_picSymFreeList;
m_picSymFreeList = next;
}
+
+ if(m_picSymFreeListNuma) {
+ for(int i=0; i<m_numNumaNodes; i++) {
+ while(m_picSymFreeListNuma[i]) {
+ FrameData* next = m_picSymFreeListNuma[i]->m_freeListNext;
+ m_picSymFreeListNuma[i]->destroy();
+
+ m_picSymFreeListNuma[i]->m_reconPic->destroy();
+ delete m_picSymFreeListNuma[i]->m_reconPic;
+
+ delete m_picSymFreeListNuma[i];
+ m_picSymFreeListNuma[i] = next;
+ }
+ delete m_picSymFreeListNuma[i] ;
+ }
+ delete m_picSymFreeListNuma ;
+ }
}
// move unreferenced pictures from picList to freeList for recycle
@@ -78,9 +95,17 @@
m_picList.remove(*curFrame);
iterFrame = m_picList.first();
+ int encDataNumaNode = curFrame->m_encData->m_numaNode ;
+ if(encDataNumaNode != -1) {
+ X265_CHECK(encDataNumaNode < m_numNumaNodes,
+ "fatal: frame allocated on non-existant numa node!\n") ;
+ curFrame->m_encData->m_freeListNext = m_picSymFreeListNuma[encDataNumaNode] ;
+ m_picSymFreeListNuma[encDataNumaNode] = curFrame->m_encData ;
+ } else {
+ curFrame->m_encData->m_freeListNext = m_picSymFreeList;
+ m_picSymFreeList = curFrame->m_encData;
+ }
m_freeList.pushBack(*curFrame);
- curFrame->m_encData->m_freeListNext = m_picSymFreeList;
- m_picSymFreeList = curFrame->m_encData;
curFrame->m_encData = NULL;
curFrame->m_reconPic = NULL;
}
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/dpb.h
--- a/source/encoder/dpb.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/dpb.h Tue Aug 04 16:10:01 2015 +0000
@@ -47,6 +47,9 @@
PicList m_picList;
PicList m_freeList;
FrameData* m_picSymFreeList;
+ x265_param* m_param;
+ int m_numNumaNodes ;
+ FrameData **m_picSymFreeListNuma ;
DPB(x265_param *param)
{
@@ -58,6 +61,27 @@
m_maxRefL1 = param->bBPyramid ? 2 : 1;
m_bOpenGOP = param->bOpenGOP;
m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
+ m_param = param ;
+ m_numNumaNodes = -1 ;
+
+#if (defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7)
+ // NUMA supported by default on windows
+ m_numNumaNodes = 1 ;
+ if(GetNumaHighestNodeNumber(&num)) {
+ m_numNumaNodes ++ ;
+ }
+#elif HAVE_LIBNUMA
+ if(numa_available()>=0) {
+ m_numNumaNodes = numa_max_node() + 1 ;
+ }
+#endif // HAVE_LIBNUMA
+
+ if(m_numNumaNodes>0) {
+ m_picSymFreeListNuma = new FrameData*[m_numNumaNodes] ;
+ for(int i=0; i<m_numNumaNodes; i++) {
+ m_picSymFreeListNuma[i] = NULL ;
+ }
+ }
}
~DPB();
@@ -66,6 +90,17 @@
void recycleUnreferenced();
+ bool isFreeEncDataAvailable() {
+ if(m_picSymFreeList) {
+ return true ;
+ }
+ for(int i=0; i<m_numNumaNodes; i++) {
+ if(m_picSymFreeListNuma[i])
+ return true ;
+ }
+ return false ;
+ }
+
protected:
void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/encoder.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -286,6 +286,11 @@
void Encoder::destroy()
{
+ int numRefSameNuma = 0 ;
+ int numRefDiffNuma = 0 ;
+ int numReconSameNuma = 0 ;
+ int numReconDiffNuma = 0 ;
+
if (m_exportedPic)
{
ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
@@ -296,6 +301,13 @@
{
if (m_frameEncoder[i])
{
+ if(m_param->printNumaStats) {
+ numRefSameNuma += m_frameEncoder[i]->getNumRefFramesSameNuma() ;
+ numRefDiffNuma += m_frameEncoder[i]->getNumRefFramesDiffNuma() ;
+ numReconSameNuma += m_frameEncoder[i]->getNumReconFramesSameNuma() ;
+ numReconDiffNuma += m_frameEncoder[i]->getNumReconFramesDiffNuma() ;
+ }
+
m_frameEncoder[i]->destroy();
delete m_frameEncoder[i];
}
@@ -323,6 +335,16 @@
X265_FREE(m_buOffsetY);
X265_FREE(m_buOffsetC);
+ if(m_param && m_param->printNumaStats) {
+ printf("Num new Encoder data alloc = %d\n", m_numNewEncodeDataAlloc) ;
+ printf("Num same node Encoder data reuse = %d\n", m_numSameNumaEncData) ;
+ printf("Num diff node Encoder data reuse = %d\n", m_numDiffNumaEncData) ;
+ printf("Num Ref frames in Same numa = %d\n", numRefSameNuma) ;
+ printf("Num Ref frames in Diff numa = %d\n", numRefDiffNuma) ;
+ printf("Num Recon frames in Same numa = %d\n", numReconSameNuma) ;
+ printf("Num Recon frames in Diff numa = %d\n", numReconDiffNuma) ;
+ }
+
if (m_analysisFile)
fclose(m_analysisFile);
@@ -511,6 +533,7 @@
FrameEncoder *curEncoder = m_frameEncoder[m_curEncoder];
m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
+
int ret = 0;
/* Normal operation is to wait for the current frame encoder to complete its current frame
@@ -633,15 +656,49 @@
if (frameEnc && !pass)
{
/* give this frame a FrameData instance before encoding */
- if (m_dpb->m_picSymFreeList)
+ // If NUMA aware allocation is enabled, try to preferably select a frame from this numa
+ // node if available. If disabled, give any free node. If no free node, allocate new data
+ if (m_dpb->isFreeEncDataAvailable())
{
- frameEnc->m_encData = m_dpb->m_picSymFreeList;
- m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
- frameEnc->reinit(m_sps);
+ // Need to figure out which NUMA node this input frame is going to be
+ // encoded on, and try to pick free frame data that lives on that node.
+ int threadNumaNode = curEncoder->m_pool->m_numaNode ;
+ int dataNumaNode = -1 ;
+ if(threadNumaNode!=-1) {
+ int checkingNumaNode = threadNumaNode ;
+ int numNumaNodes = m_dpb->m_numNumaNodes ;
+ bool found = false ;
+ for(int i=0; i<numNumaNodes;i++) {
+ if(m_dpb->m_picSymFreeListNuma[checkingNumaNode]) {
+ dataNumaNode = checkingNumaNode ;
+ frameEnc->m_encData = m_dpb->m_picSymFreeListNuma[dataNumaNode] ;
+ m_dpb->m_picSymFreeListNuma[dataNumaNode] =
+ m_dpb->m_picSymFreeListNuma[dataNumaNode]->m_freeListNext ;
+ frameEnc->reinit(m_sps) ;
+ // printf("Worker threads on %d, recon frame data on %d\n",
+ // threadNumaNode, dataNumaNode) ;
+ found = true ;
+ break ;
+ }
+ checkingNumaNode = (checkingNumaNode+1) % numNumaNodes ;
+ }
+ X265_CHECK(found, "Should've found buffer for in frame!\n") ;
+ } else {
+ frameEnc->m_encData = m_dpb->m_picSymFreeList;
+ m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
+ frameEnc->reinit(m_sps);
+ dataNumaNode = frameEnc->m_encData->m_numaNode ;
+ }
+ if(dataNumaNode == threadNumaNode) {
+ m_numSameNumaEncData ++ ;
+ } else {
+ m_numDiffNumaEncData ++ ;
+ }
}
else
{
- frameEnc->allocEncodeData(m_param, m_sps);
+ m_numNewEncodeDataAlloc ++ ;
+ frameEnc->allocEncodeData(m_param, m_sps, curEncoder->m_pool->m_numaNode);
Slice* slice = frameEnc->m_encData->m_slice;
slice->m_sps = &m_sps;
slice->m_pps = &m_pps;
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/encoder.h
--- a/source/encoder/encoder.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/encoder.h Tue Aug 04 16:10:01 2015 +0000
@@ -133,6 +133,9 @@
bool m_aborted; // fatal error detected
bool m_reconfigured; // reconfigure of encoder detected
+ uint32_t m_numNewEncodeDataAlloc ;
+ uint32_t m_numSameNumaEncData ;
+ uint32_t m_numDiffNumaEncData ;
Encoder();
~Encoder() {}
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/frameencoder.cpp Tue Aug 04 16:10:01 2015 +0000
@@ -60,6 +60,11 @@
m_ctuGeomMap = NULL;
m_localTldIdx = 0;
memset(&m_rce, 0, sizeof(RateControlEntry));
+
+ m_numRefFramesSameNuma = 0 ;
+ m_numRefFrameDiffNuma = 0 ;
+ m_numReconFramesSameNuma = 0 ;
+ m_numReconFramesDiffNuma = 0 ;
}
void FrameEncoder::destroy()
@@ -357,7 +362,15 @@
WeightParam *w = NULL;
if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
w = slice->m_weightPredTable[l][ref];
- m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
+ PicYuv* closestReconPic = slice->m_refPicList[l][ref]->m_reconPic ;
+ m_mref[l][ref].init(closestReconPic, w, *m_param);
+ if(m_param->printNumaStats) {
+ if(m_pool->m_numaNode != closestReconPic->m_numaNode) {
+ m_numRefFrameDiffNuma ++ ;
+ } else {
+ m_numRefFramesSameNuma ++ ;
+ }
+ }
}
}
@@ -932,6 +945,13 @@
// Does all the CU analysis, returns best top level mode decision
Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
+ if(m_param->printNumaStats) {
+ if(m_pool->m_numaNode != m_frame->m_reconPic->m_numaNode) {
+ m_numReconFramesDiffNuma ++ ;
+ } else {
+ m_numReconFramesSameNuma ++ ;
+ }
+ }
// take a sample of the current active worker count
ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/frameencoder.h Tue Aug 04 16:10:01 2015 +0000
@@ -206,7 +206,16 @@
WeightAnalysis operator=(const WeightAnalysis&);
};
-protected:
+ unsigned int m_numRefFramesSameNuma ;
+ unsigned int m_numRefFrameDiffNuma ;
+ unsigned int m_numReconFramesSameNuma ;
+ unsigned int m_numReconFramesDiffNuma ;
+ unsigned int getNumRefFramesSameNuma() { return m_numRefFramesSameNuma ; }
+ unsigned int getNumRefFramesDiffNuma() { return m_numRefFrameDiffNuma ; }
+ unsigned int getNumReconFramesSameNuma() { return m_numReconFramesSameNuma ; }
+ unsigned int getNumReconFramesDiffNuma() { return m_numReconFramesDiffNuma ; }
+
+ protected:
bool initializeGeoms();
diff -r 0c1f9d982944 -r 0206efdac228 source/x265.h
--- a/source/x265.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/x265.h Tue Aug 04 16:10:01 2015 +0000
@@ -1172,6 +1172,11 @@
* picture average light level (or 0). */
const char* contentLightLevelInfo;
+ /* Print NUMA statistics collected from the code on the console to show the
+ * number of times the recon and ref pics were located on the same NUMA socket,
+ * and on different sockets */
+ int printNumaStats ;
+
} x265_param;
/* x265_param_alloc:
diff -r 0c1f9d982944 -r 0206efdac228 source/x265cli.h
--- a/source/x265cli.h Tue Aug 04 15:37:26 2015 +0000
+++ b/source/x265cli.h Tue Aug 04 16:10:01 2015 +0000
@@ -218,6 +218,7 @@
{ "no-temporal-layers", no_argument, NULL, 0 },
{ "qg-size", required_argument, NULL, 0 },
{ "recon-y4m-exec", required_argument, NULL, 0 },
+ { "print-numa-stats", no_argument, NULL, 0 },
{ 0, 0, 0, 0 },
{ 0, 0, 0, 0 },
{ 0, 0, 0, 0 },
@@ -414,6 +415,7 @@
H1("-r/--recon <filename> Reconstructed raw image YUV or Y4M output file name\n");
H1(" --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
H1(" --recon-y4m-exec <string> pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
+ H1(" --print-numa-stats print statistics related to socket information for ref and recon frames\n");
H1("\nExecutable return codes:\n");
H1(" 0 - encode successful\n");
H1(" 1 - unable to parse command line\n");
More information about the x265-devel
mailing list