[x265] [PATCH] Performance: Enabling recon frames to be NUMA-aware when the
Pradeep Ramachandran
pradeep at multicorewareinc.com
Wed Aug 5 11:02:12 CEST 2015
There was a merge problem that makes this patch inapplicable to the tip.
Please ignore.
Apologies for the confusion.
Pradeep.
Pradeep Ramachandran, PhD
Solution Architect,
Multicoreware Inc.
Ph: +91 99627 82018
On Wed, Aug 5, 2015 at 7:35 PM, Pradeep <pradeep at multicorewareinc.com>
wrote:
> # HG changeset patch
> # User Pradeep <pradeep at multicorewareinc.com>
> # Date 1438704601 0
> # Tue Aug 04 16:10:01 2015 +0000
> # Node ID 0206efdac228891f348c8d6c7ad7ced369c840a3
> # Parent 0c1f9d98294454d3bf896aeb24be881d8aa53434
> Performance: Enabling recon frames to be NUMA-aware when the
> frame encoder thread creates them. Seeing a considerable reduction in
> the number of cross-socket accesses, but the impact on performance of
> sample videos is rather small.
>
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/frame.cpp
> --- a/source/common/frame.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/frame.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -51,10 +51,34 @@
> m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
> }
>
> -bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
> +bool Frame::allocEncodeData(x265_param *param, const SPS& sps, const int
> numaNode)
> {
> - m_encData = new FrameData;
> - m_reconPic = new PicYuv;
> + int selNumaNode = numaNode ;
> +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
> + GROUP_AFFINITY groupAffinity;
> + if (GetNumaNodeProcessorMaskEx((USHORT)selNumaNode, &groupAffinity)) {
> + if(VirtualAllocExNuma(GetCurrentProcess(),
> + NULL,
> + sizeof(FrameData)+sizeof(PicYuv),
> + MEM_COMMIT,
> + PAGE_READWRITE,
> + selNumaNode)) {
> + // Successful commit, do nothing
> + }
> + }
> +#elif HAVE_LIBNUMA
> + if(numa_available() >= 0) {
> + numa_set_preferred(selNumaNode) ;
> + numa_set_localalloc() ;
> + } else {
> + selNumaNode = -1 ;
> + }
> +#else
> + selNumaNode = -1 ;
> +#endif // HAVE_LIBNUMA
> +
> + m_encData = new FrameData(selNumaNode) ;
> + m_reconPic = new PicYuv(selNumaNode) ;
> m_encData->m_reconPic = m_reconPic;
> bool ok = m_encData->create(param, sps) &&
> m_reconPic->create(param->sourceWidth, param->sourceHeight,
> param->internalCsp);
> if (ok)
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/frame.h
> --- a/source/common/frame.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/frame.h Tue Aug 04 16:10:01 2015 +0000
> @@ -28,6 +28,10 @@
> #include "lowres.h"
> #include "threading.h"
>
> +#if HAVE_LIBNUMA
> +#include <numa.h>
> +#endif // HAVE_LIBNUMA
> +
> namespace X265_NS {
> // private namespace
>
> @@ -67,10 +71,11 @@
> Frame* m_prev;
> x265_param* m_param; // Points to the latest
> param set for the frame.
> x265_analysis_data m_analysisData;
> +
> Frame();
>
> bool create(x265_param *param);
> - bool allocEncodeData(x265_param *param, const SPS& sps);
> + bool allocEncodeData(x265_param *param, const SPS& sps, const int
> numaNode);
> void reinit(const SPS& sps);
> void destroy();
> };
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/framedata.cpp
> --- a/source/common/framedata.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/framedata.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -26,9 +26,10 @@
>
> using namespace X265_NS;
>
> -FrameData::FrameData()
> +FrameData::FrameData(int numaNode)
> {
> memset(this, 0, sizeof(*this));
> + m_numaNode = numaNode ;
> }
>
> bool FrameData::create(x265_param *param, const SPS& sps)
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/framedata.h
> --- a/source/common/framedata.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/framedata.h Tue Aug 04 16:10:01 2015 +0000
> @@ -107,6 +107,8 @@
> CUDataMemPool m_cuMemPool;
> CUData* m_picCTU;
>
> + int m_numaNode ;
> +
> /* Rate control data used during encode and by references */
> struct RCStatCU
> {
> @@ -140,7 +142,7 @@
> double m_avgQpAq; /* avg QP as decided by AQ in addition
> to rate-control */
> double m_rateFactor; /* calculated based on the Frame QP */
>
> - FrameData();
> + FrameData(int numaNode=-1);
>
> bool create(x265_param *param, const SPS& sps);
> void reinit(const SPS& sps);
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/param.cpp
> --- a/source/common/param.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/param.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -855,6 +855,7 @@
> OPT("qg-size") p->rc.qgSize = atoi(value);
> OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
> OPT("max-cll") p->contentLightLevelInfo = strdup(value);
> + OPT("print-numa-stats") p->printNumaStats = atobool(value) ;
> else
> return X265_PARAM_BAD_NAME;
> #undef OPT
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/picyuv.cpp
> --- a/source/common/picyuv.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/picyuv.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -28,7 +28,8 @@
>
> using namespace X265_NS;
>
> -PicYuv::PicYuv()
> +PicYuv::PicYuv(int numaNode):
> + m_numaNode(numaNode)
> {
> m_picBuf[0] = NULL;
> m_picBuf[1] = NULL;
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/picyuv.h
> --- a/source/common/picyuv.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/picyuv.h Tue Aug 04 16:10:01 2015 +0000
> @@ -59,8 +59,9 @@
> uint32_t m_lumaMarginY;
> uint32_t m_chromaMarginX;
> uint32_t m_chromaMarginY;
> + int32_t m_numaNode ;
>
> - PicYuv();
> + PicYuv(int numaNode=-1);
>
> bool create(uint32_t picWidth, uint32_t picHeight, uint32_t csp);
> bool createOffsets(const SPS& sps);
> diff -r 0c1f9d982944 -r 0206efdac228 source/common/threadpool.cpp
> --- a/source/common/threadpool.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/common/threadpool.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -338,6 +338,7 @@
> ThreadPool::ThreadPool()
> {
> memset(this, 0, sizeof(*this));
> + m_numaNode = -1 ;
> }
>
> bool ThreadPool::create(int numThreads, int maxProviders, int node)
> diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/dpb.cpp
> --- a/source/encoder/dpb.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/encoder/dpb.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -58,6 +58,23 @@
> delete m_picSymFreeList;
> m_picSymFreeList = next;
> }
> +
> + if(m_picSymFreeListNuma) {
> + for(int i=0; i<m_numNumaNodes; i++) {
> + while(m_picSymFreeListNuma[i]) {
> + FrameData* next = m_picSymFreeListNuma[i]->m_freeListNext;
> + m_picSymFreeListNuma[i]->destroy();
> +
> + m_picSymFreeListNuma[i]->m_reconPic->destroy();
> + delete m_picSymFreeListNuma[i]->m_reconPic;
> +
> + delete m_picSymFreeListNuma[i];
> + m_picSymFreeListNuma[i] = next;
> + }
> + delete m_picSymFreeListNuma[i] ;
> + }
> + delete m_picSymFreeListNuma ;
> + }
> }
>
> // move unreferenced pictures from picList to freeList for recycle
> @@ -78,9 +95,17 @@
> m_picList.remove(*curFrame);
> iterFrame = m_picList.first();
>
> + int encDataNumaNode = curFrame->m_encData->m_numaNode ;
> + if(encDataNumaNode != -1) {
> + X265_CHECK(encDataNumaNode < m_numNumaNodes,
> + "fatal: frame allocated on non-existant numa
> node!\n") ;
> + curFrame->m_encData->m_freeListNext =
> m_picSymFreeListNuma[encDataNumaNode] ;
> + m_picSymFreeListNuma[encDataNumaNode] =
> curFrame->m_encData ;
> + } else {
> + curFrame->m_encData->m_freeListNext = m_picSymFreeList;
> + m_picSymFreeList = curFrame->m_encData;
> + }
> m_freeList.pushBack(*curFrame);
> - curFrame->m_encData->m_freeListNext = m_picSymFreeList;
> - m_picSymFreeList = curFrame->m_encData;
> curFrame->m_encData = NULL;
> curFrame->m_reconPic = NULL;
> }
> diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/dpb.h
> --- a/source/encoder/dpb.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/encoder/dpb.h Tue Aug 04 16:10:01 2015 +0000
> @@ -47,6 +47,9 @@
> PicList m_picList;
> PicList m_freeList;
> FrameData* m_picSymFreeList;
> + x265_param* m_param;
> + int m_numNumaNodes ;
> + FrameData **m_picSymFreeListNuma ;
>
> DPB(x265_param *param)
> {
> @@ -58,6 +61,27 @@
> m_maxRefL1 = param->bBPyramid ? 2 : 1;
> m_bOpenGOP = param->bOpenGOP;
> m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
> + m_param = param ;
> + m_numNumaNodes = -1 ;
> +
> +#if (defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7)
> + // NUMA supported by default on windows
> + m_numNumaNodes = 1 ;
> + if(GetNumaHighestNodeNumber(&num)) {
> + m_numNumaNodes ++ ;
> + }
> +#elif HAVE_LIBNUMA
> + if(numa_available()>=0) {
> + m_numNumaNodes = numa_max_node() + 1 ;
> + }
> +#endif // HAVE_LIBNUMA
> +
> + if(m_numNumaNodes>0) {
> + m_picSymFreeListNuma = new FrameData*[m_numNumaNodes] ;
> + for(int i=0; i<m_numNumaNodes; i++) {
> + m_picSymFreeListNuma[i] = NULL ;
> + }
> + }
> }
>
> ~DPB();
> @@ -66,6 +90,17 @@
>
> void recycleUnreferenced();
>
> + bool isFreeEncDataAvailable() {
> + if(m_picSymFreeList) {
> + return true ;
> + }
> + for(int i=0; i<m_numNumaNodes; i++) {
> + if(m_picSymFreeListNuma[i])
> + return true ;
> + }
> + return false ;
> + }
> +
> protected:
>
> void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int
> maxDecPicBuffer);
> diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/encoder/encoder.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -286,6 +286,11 @@
>
> void Encoder::destroy()
> {
> + int numRefSameNuma = 0 ;
> + int numRefDiffNuma = 0 ;
> + int numReconSameNuma = 0 ;
> + int numReconDiffNuma = 0 ;
> +
> if (m_exportedPic)
> {
> ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
> @@ -296,6 +301,13 @@
> {
> if (m_frameEncoder[i])
> {
> + if(m_param->printNumaStats) {
> + numRefSameNuma +=
> m_frameEncoder[i]->getNumRefFramesSameNuma() ;
> + numRefDiffNuma +=
> m_frameEncoder[i]->getNumRefFramesDiffNuma() ;
> + numReconSameNuma +=
> m_frameEncoder[i]->getNumReconFramesSameNuma() ;
> + numReconDiffNuma +=
> m_frameEncoder[i]->getNumReconFramesDiffNuma() ;
> + }
> +
> m_frameEncoder[i]->destroy();
> delete m_frameEncoder[i];
> }
> @@ -323,6 +335,16 @@
> X265_FREE(m_buOffsetY);
> X265_FREE(m_buOffsetC);
>
> + if(m_param && m_param->printNumaStats) {
> + printf("Num new Encoder data alloc = %d\n",
> m_numNewEncodeDataAlloc) ;
> + printf("Num same node Encoder data reuse = %d\n",
> m_numSameNumaEncData) ;
> + printf("Num diff node Encoder data reuse = %d\n",
> m_numDiffNumaEncData) ;
> + printf("Num Ref frames in Same numa = %d\n", numRefSameNuma)
> ;
> + printf("Num Ref frames in Diff numa = %d\n", numRefDiffNuma)
> ;
> + printf("Num Recon frames in Same numa = %d\n",
> numReconSameNuma) ;
> + printf("Num Recon frames in Diff numa = %d\n",
> numReconDiffNuma) ;
> + }
> +
> if (m_analysisFile)
> fclose(m_analysisFile);
>
> @@ -511,6 +533,7 @@
>
> FrameEncoder *curEncoder = m_frameEncoder[m_curEncoder];
> m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
> +
> int ret = 0;
>
> /* Normal operation is to wait for the current frame encoder to
> complete its current frame
> @@ -633,15 +656,49 @@
> if (frameEnc && !pass)
> {
> /* give this frame a FrameData instance before encoding */
> - if (m_dpb->m_picSymFreeList)
> + // If NUMA aware allocation is enabled, try to preferably
> select a frame from this numa
> + // node if available. If disabled, give any free node. If no
> free node, allocate new data
> + if (m_dpb->isFreeEncDataAvailable())
> {
> - frameEnc->m_encData = m_dpb->m_picSymFreeList;
> - m_dpb->m_picSymFreeList =
> m_dpb->m_picSymFreeList->m_freeListNext;
> - frameEnc->reinit(m_sps);
> + // Need to figure out which NUMA node this in frame is
> going to be
> + // decoded on! try to allocate in data on that node.
> + int threadNumaNode = curEncoder->m_pool->m_numaNode ;
> + int dataNumaNode = -1 ;
> + if(threadNumaNode!=-1) {
> + int checkingNumaNode = threadNumaNode ;
> + int numNumaNodes = m_dpb->m_numNumaNodes ;
> + bool found = false ;
> + for(int i=0; i<numNumaNodes;i++) {
> + if(m_dpb->m_picSymFreeListNuma[checkingNumaNode])
> {
> + dataNumaNode = checkingNumaNode ;
> + frameEnc->m_encData =
> m_dpb->m_picSymFreeListNuma[dataNumaNode] ;
> + m_dpb->m_picSymFreeListNuma[dataNumaNode] =
> +
> m_dpb->m_picSymFreeListNuma[dataNumaNode]->m_freeListNext ;
> + frameEnc->reinit(m_sps) ;
> + // printf("Worker threads on %d, recon frame
> data on %d\n",
> + // threadNumaNode, dataNumaNode) ;
> + found = true ;
> + break ;
> + }
> + checkingNumaNode = (checkingNumaNode+1) %
> numNumaNodes ;
> + }
> + X265_CHECK(found, "Should've found buffer for in
> frame!\n") ;
> + } else {
> + frameEnc->m_encData = m_dpb->m_picSymFreeList;
> + m_dpb->m_picSymFreeList =
> m_dpb->m_picSymFreeList->m_freeListNext;
> + frameEnc->reinit(m_sps);
> + dataNumaNode = frameEnc->m_encData->m_numaNode ;
> + }
> + if(dataNumaNode == threadNumaNode) {
> + m_numSameNumaEncData ++ ;
> + } else {
> + m_numDiffNumaEncData ++ ;
> + }
> }
> else
> {
> - frameEnc->allocEncodeData(m_param, m_sps);
> + m_numNewEncodeDataAlloc ++ ;
> + frameEnc->allocEncodeData(m_param, m_sps,
> curEncoder->m_pool->m_numaNode);
> Slice* slice = frameEnc->m_encData->m_slice;
> slice->m_sps = &m_sps;
> slice->m_pps = &m_pps;
> diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/encoder.h
> --- a/source/encoder/encoder.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/encoder/encoder.h Tue Aug 04 16:10:01 2015 +0000
> @@ -133,6 +133,9 @@
> bool m_aborted; // fatal error detected
> bool m_reconfigured; // reconfigure of encoder
> detected
>
> + uint32_t m_numNewEncodeDataAlloc ;
> + uint32_t m_numSameNumaEncData ;
> + uint32_t m_numDiffNumaEncData ;
> Encoder();
> ~Encoder() {}
>
> diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/encoder/frameencoder.cpp Tue Aug 04 16:10:01 2015 +0000
> @@ -60,6 +60,11 @@
> m_ctuGeomMap = NULL;
> m_localTldIdx = 0;
> memset(&m_rce, 0, sizeof(RateControlEntry));
> +
> + m_numRefFramesSameNuma = 0 ;
> + m_numRefFrameDiffNuma = 0 ;
> + m_numReconFramesSameNuma = 0 ;
> + m_numReconFramesDiffNuma = 0 ;
> }
>
> void FrameEncoder::destroy()
> @@ -357,7 +362,15 @@
> WeightParam *w = NULL;
> if ((bUseWeightP || bUseWeightB) &&
> slice->m_weightPredTable[l][ref][0].bPresentFlag)
> w = slice->m_weightPredTable[l][ref];
> - m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic,
> w, *m_param);
> + PicYuv* closestReconPic =
> slice->m_refPicList[l][ref]->m_reconPic ;
> + m_mref[l][ref].init(closestReconPic, w, *m_param);
> + if(m_param->printNumaStats) {
> + if(m_pool->m_numaNode != closestReconPic->m_numaNode) {
> + m_numRefFrameDiffNuma ++ ;
> + } else {
> + m_numRefFramesSameNuma ++ ;
> + }
> + }
> }
> }
>
> @@ -932,6 +945,13 @@
>
> // Does all the CU analysis, returns best top level mode decision
> Mode& best = tld.analysis.compressCTU(*ctu, *m_frame,
> m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
> + if(m_param->printNumaStats) {
> + if(m_pool->m_numaNode != m_frame->m_reconPic->m_numaNode) {
> + m_numReconFramesDiffNuma ++ ;
> + } else {
> + m_numReconFramesSameNuma ++ ;
> + }
> + }
>
> // take a sample of the current active worker count
> ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
> diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/encoder/frameencoder.h Tue Aug 04 16:10:01 2015 +0000
> @@ -206,7 +206,16 @@
> WeightAnalysis operator=(const WeightAnalysis&);
> };
>
> -protected:
> + unsigned int m_numRefFramesSameNuma ;
> + unsigned int m_numRefFrameDiffNuma ;
> + unsigned int m_numReconFramesSameNuma ;
> + unsigned int m_numReconFramesDiffNuma ;
> + unsigned int getNumRefFramesSameNuma() { return
> m_numRefFramesSameNuma ; }
> + unsigned int getNumRefFramesDiffNuma() { return
> m_numRefFrameDiffNuma ; }
> + unsigned int getNumReconFramesSameNuma() { return
> m_numReconFramesSameNuma ; }
> + unsigned int getNumReconFramesDiffNuma() { return
> m_numReconFramesDiffNuma ; }
> +
> + protected:
>
> bool initializeGeoms();
>
> diff -r 0c1f9d982944 -r 0206efdac228 source/x265.h
> --- a/source/x265.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/x265.h Tue Aug 04 16:10:01 2015 +0000
> @@ -1172,6 +1172,11 @@
> * picture average light level (or 0). */
> const char* contentLightLevelInfo;
>
> + /* Print NUMA statistics collected from the code on the console to
> show the
> + * number of times the recon and ref pics were located on the same
> NUMA socket,
> + * and on different sockets */
> + int printNumaStats ;
> +
> } x265_param;
>
> /* x265_param_alloc:
> diff -r 0c1f9d982944 -r 0206efdac228 source/x265cli.h
> --- a/source/x265cli.h Tue Aug 04 15:37:26 2015 +0000
> +++ b/source/x265cli.h Tue Aug 04 16:10:01 2015 +0000
> @@ -218,6 +218,7 @@
> { "no-temporal-layers", no_argument, NULL, 0 },
> { "qg-size", required_argument, NULL, 0 },
> { "recon-y4m-exec", required_argument, NULL, 0 },
> + { "print-numa-stats", no_argument, NULL, 0 },
> { 0, 0, 0, 0 },
> { 0, 0, 0, 0 },
> { 0, 0, 0, 0 },
> @@ -414,6 +415,7 @@
> H1("-r/--recon <filename> Reconstructed raw image YUV or
> Y4M output file name\n");
> H1(" --recon-depth <integer> Bit-depth of reconstructed raw
> image file. Defaults to input bit depth, or 8 if Y4M\n");
> H1(" --recon-y4m-exec <string> pipe reconstructed frames to Y4M
> viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
> + H1(" --print-numa-stats print statistics related to
> socket information for ref and recon frames\n");
> H1("\nExecutable return codes:\n");
> H1(" 0 - encode successful\n");
> H1(" 1 - unable to parse command line\n");
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150805/535d66e0/attachment-0001.html>
More information about the x265-devel
mailing list