[x265] [PATCH] Performance: Enabling recon frames to be NUMA-aware when the

Pradeep pradeep at multicorewareinc.com
Wed Aug 5 16:05:29 CEST 2015


# HG changeset patch
# User Pradeep <pradeep at multicorewareinc.com>
# Date 1438704601 0
#      Tue Aug 04 16:10:01 2015 +0000
# Node ID 0206efdac228891f348c8d6c7ad7ced369c840a3
# Parent  0c1f9d98294454d3bf896aeb24be881d8aa53434
Performance: Enabling recon frames to be NUMA-aware when the
frame encoder thread creates them. Seeing a considerable reduction in
the number of cross-socket accesses, but the impact on performance of
sample videos is rather small.

diff -r 0c1f9d982944 -r 0206efdac228 source/common/frame.cpp
--- a/source/common/frame.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/frame.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -51,10 +51,34 @@
            m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
 }
 
-bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
+bool Frame::allocEncodeData(x265_param *param, const SPS& sps, const int numaNode)
 {
-    m_encData = new FrameData;
-    m_reconPic = new PicYuv;
+    int selNumaNode = numaNode ;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+    GROUP_AFFINITY groupAffinity;
+    if (GetNumaNodeProcessorMaskEx((USHORT)selNumaNode, &groupAffinity)) {
+        if(VirtualAllocExNuma(GetCurrentProcess(),
+                              NULL,
+                              sizeof(FrameData)+sizeof(PicYuv),
+                              MEM_COMMIT,
+                              PAGE_READWRITE,
+                              selNumaNode)) {
+            // Successful commit, do nothing
+        }
+    }
+#elif HAVE_LIBNUMA
+    if(numa_available() >= 0) {
+        numa_set_preferred(selNumaNode) ;
+        numa_set_localalloc() ;
+    } else {
+        selNumaNode = -1 ;
+    }
+#else
+    selNumaNode = -1 ;
+#endif // HAVE_LIBNUMA
+
+    m_encData = new FrameData(selNumaNode) ;
+    m_reconPic = new PicYuv(selNumaNode) ;
     m_encData->m_reconPic = m_reconPic;
     bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
diff -r 0c1f9d982944 -r 0206efdac228 source/common/frame.h
--- a/source/common/frame.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/frame.h	Tue Aug 04 16:10:01 2015 +0000
@@ -28,6 +28,10 @@
 #include "lowres.h"
 #include "threading.h"
 
+#if HAVE_LIBNUMA
+#include <numa.h>
+#endif // HAVE_LIBNUMA
+
 namespace X265_NS {
 // private namespace
 
@@ -67,10 +71,11 @@
     Frame*                 m_prev;
     x265_param*            m_param;              // Points to the latest param set for the frame.
     x265_analysis_data     m_analysisData;
+
     Frame();
 
     bool create(x265_param *param);
-    bool allocEncodeData(x265_param *param, const SPS& sps);
+    bool allocEncodeData(x265_param *param, const SPS& sps, const int numaNode);
     void reinit(const SPS& sps);
     void destroy();
 };
diff -r 0c1f9d982944 -r 0206efdac228 source/common/framedata.cpp
--- a/source/common/framedata.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/framedata.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -26,9 +26,10 @@
 
 using namespace X265_NS;
 
-FrameData::FrameData()
+FrameData::FrameData(int numaNode)
 {
     memset(this, 0, sizeof(*this));
+    m_numaNode = numaNode ;
 }
 
 bool FrameData::create(x265_param *param, const SPS& sps)
diff -r 0c1f9d982944 -r 0206efdac228 source/common/framedata.h
--- a/source/common/framedata.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/framedata.h	Tue Aug 04 16:10:01 2015 +0000
@@ -107,6 +107,8 @@
     CUDataMemPool  m_cuMemPool;
     CUData*        m_picCTU;
 
+    int            m_numaNode ;
+
     /* Rate control data used during encode and by references */
     struct RCStatCU
     {
@@ -140,7 +142,7 @@
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
     double         m_rateFactor; /* calculated based on the Frame QP */
 
-    FrameData();
+    FrameData(int numaNode=-1);
 
     bool create(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
diff -r 0c1f9d982944 -r 0206efdac228 source/common/param.cpp
--- a/source/common/param.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/param.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -855,6 +855,7 @@
     OPT("qg-size") p->rc.qgSize = atoi(value);
     OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
     OPT("max-cll") p->contentLightLevelInfo = strdup(value);
+    OPT("print-numa-stats") p->printNumaStats = atobool(value) ;
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
diff -r 0c1f9d982944 -r 0206efdac228 source/common/picyuv.cpp
--- a/source/common/picyuv.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/picyuv.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -28,7 +28,8 @@
 
 using namespace X265_NS;
 
-PicYuv::PicYuv()
+PicYuv::PicYuv(int numaNode):
+    m_numaNode(numaNode)
 {
     m_picBuf[0] = NULL;
     m_picBuf[1] = NULL;
diff -r 0c1f9d982944 -r 0206efdac228 source/common/picyuv.h
--- a/source/common/picyuv.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/picyuv.h	Tue Aug 04 16:10:01 2015 +0000
@@ -59,8 +59,9 @@
     uint32_t m_lumaMarginY;
     uint32_t m_chromaMarginX;
     uint32_t m_chromaMarginY;
+    int32_t  m_numaNode ;
 
-    PicYuv();
+    PicYuv(int numaNode=-1);
 
     bool  create(uint32_t picWidth, uint32_t picHeight, uint32_t csp);
     bool  createOffsets(const SPS& sps);
diff -r 0c1f9d982944 -r 0206efdac228 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/common/threadpool.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -338,6 +338,7 @@
 ThreadPool::ThreadPool()
 {
     memset(this, 0, sizeof(*this));
+    m_numaNode = -1 ;
 }
 
 bool ThreadPool::create(int numThreads, int maxProviders, int node)
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/dpb.cpp
--- a/source/encoder/dpb.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/dpb.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -58,6 +58,23 @@
         delete m_picSymFreeList;
         m_picSymFreeList = next;
     }
+
+    if(m_picSymFreeListNuma) {
+        for(int i=0; i<m_numNumaNodes; i++) {
+            while(m_picSymFreeListNuma[i]) {
+                FrameData* next = m_picSymFreeListNuma[i]->m_freeListNext;
+                m_picSymFreeListNuma[i]->destroy();
+
+                m_picSymFreeListNuma[i]->m_reconPic->destroy();
+                delete m_picSymFreeListNuma[i]->m_reconPic;
+
+                delete m_picSymFreeListNuma[i];
+                m_picSymFreeListNuma[i] = next;
+            }
+            delete m_picSymFreeListNuma[i] ;
+        }
+        delete m_picSymFreeListNuma ;
+    }
 }
 
 // move unreferenced pictures from picList to freeList for recycle
@@ -78,9 +95,17 @@
             m_picList.remove(*curFrame);
             iterFrame = m_picList.first();
 
+            int encDataNumaNode = curFrame->m_encData->m_numaNode ;
+            if(encDataNumaNode != -1) {
+                X265_CHECK(encDataNumaNode < m_numNumaNodes,
+                           "fatal: frame allocated on non-existant numa node!\n") ;
+                curFrame->m_encData->m_freeListNext = m_picSymFreeListNuma[encDataNumaNode] ;
+                m_picSymFreeListNuma[encDataNumaNode] = curFrame->m_encData ;
+            } else {
+                curFrame->m_encData->m_freeListNext = m_picSymFreeList;
+                m_picSymFreeList = curFrame->m_encData;
+            }
             m_freeList.pushBack(*curFrame);
-            curFrame->m_encData->m_freeListNext = m_picSymFreeList;
-            m_picSymFreeList = curFrame->m_encData;
             curFrame->m_encData = NULL;
             curFrame->m_reconPic = NULL;
         }
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/dpb.h
--- a/source/encoder/dpb.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/dpb.h	Tue Aug 04 16:10:01 2015 +0000
@@ -47,6 +47,9 @@
     PicList            m_picList;
     PicList            m_freeList;
     FrameData*         m_picSymFreeList;
+    x265_param*        m_param;
+    int                m_numNumaNodes ;
+    FrameData        **m_picSymFreeListNuma ;
 
     DPB(x265_param *param)
     {
@@ -58,6 +61,27 @@
         m_maxRefL1 = param->bBPyramid ? 2 : 1;
         m_bOpenGOP = param->bOpenGOP;
         m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
+        m_param = param ;
+        m_numNumaNodes = -1 ;
+
+#if (defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7)
+        // NUMA supported by default on windows
+        m_numNumaNodes = 1 ;
+        if(GetNumaHighestNodeNumber(&num)) {
+            m_numNumaNodes ++ ;
+        }
+#elif HAVE_LIBNUMA
+        if(numa_available()>=0) {
+            m_numNumaNodes = numa_max_node() + 1 ;
+        }
+#endif // HAVE_LIBNUMA
+
+        if(m_numNumaNodes>0) {
+            m_picSymFreeListNuma = new FrameData*[m_numNumaNodes] ;
+            for(int i=0; i<m_numNumaNodes; i++) {
+                m_picSymFreeListNuma[i] = NULL ;
+            }
+        }
     }
 
     ~DPB();
@@ -66,6 +90,17 @@
 
     void recycleUnreferenced();
 
+    bool isFreeEncDataAvailable() {
+        if(m_picSymFreeList) {
+            return true ;
+        }
+        for(int i=0; i<m_numNumaNodes; i++) {
+            if(m_picSymFreeListNuma[i])
+                return true ;
+        }
+        return false ;
+    }
+
 protected:
 
     void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/encoder.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -286,6 +286,11 @@
 
 void Encoder::destroy()
 {
+    int numRefSameNuma = 0 ;
+    int numRefDiffNuma = 0 ;
+    int numReconSameNuma = 0 ;
+    int numReconDiffNuma = 0 ;
+
     if (m_exportedPic)
     {
         ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
@@ -296,6 +301,13 @@
     {
         if (m_frameEncoder[i])
         {
+            if(m_param->printNumaStats) {
+                numRefSameNuma += m_frameEncoder[i]->getNumRefFramesSameNuma() ;
+                numRefDiffNuma += m_frameEncoder[i]->getNumRefFramesDiffNuma() ;
+                numReconSameNuma += m_frameEncoder[i]->getNumReconFramesSameNuma() ;
+                numReconDiffNuma += m_frameEncoder[i]->getNumReconFramesDiffNuma() ;
+            }
+
             m_frameEncoder[i]->destroy();
             delete m_frameEncoder[i];
         }
@@ -323,6 +335,16 @@
     X265_FREE(m_buOffsetY);
     X265_FREE(m_buOffsetC);
 
+    if(m_param && m_param->printNumaStats) {
+        printf("Num new Encoder data alloc       = %d\n", m_numNewEncodeDataAlloc) ;
+        printf("Num same node Encoder data reuse = %d\n", m_numSameNumaEncData) ;
+        printf("Num diff node Encoder data reuse = %d\n", m_numDiffNumaEncData) ;
+        printf("Num Ref frames in Same numa      = %d\n", numRefSameNuma) ;
+        printf("Num Ref frames in Diff numa      = %d\n", numRefDiffNuma) ;
+        printf("Num Recon frames in Same numa    = %d\n", numReconSameNuma) ;
+        printf("Num Recon frames in Diff numa    = %d\n", numReconDiffNuma) ;
+    }
+
     if (m_analysisFile)
         fclose(m_analysisFile);
 
@@ -511,6 +533,7 @@
 
     FrameEncoder *curEncoder = m_frameEncoder[m_curEncoder];
     m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads;
+
     int ret = 0;
 
     /* Normal operation is to wait for the current frame encoder to complete its current frame
@@ -633,15 +656,49 @@
         if (frameEnc && !pass)
         {
             /* give this frame a FrameData instance before encoding */
-            if (m_dpb->m_picSymFreeList)
+            // If NUMA aware allocation is enabled, try to preferably select a frame from this numa
+            // node if available. If disabled, give any free node. If no free node, allocate new data
+            if (m_dpb->isFreeEncDataAvailable())
             {
-                frameEnc->m_encData = m_dpb->m_picSymFreeList;
-                m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
-                frameEnc->reinit(m_sps);
+                // Need to figure out which NUMA node this in frame is going to be
+                // decoded on! try to allocate in data on that node.
+                int threadNumaNode = curEncoder->m_pool->m_numaNode ;
+                int dataNumaNode = -1 ;
+                if(threadNumaNode!=-1) {
+                    int checkingNumaNode = threadNumaNode ;
+                    int numNumaNodes     = m_dpb->m_numNumaNodes ;
+                    bool found           = false ;
+                    for(int i=0; i<numNumaNodes;i++) {
+                        if(m_dpb->m_picSymFreeListNuma[checkingNumaNode]) {
+                            dataNumaNode = checkingNumaNode ;
+                            frameEnc->m_encData = m_dpb->m_picSymFreeListNuma[dataNumaNode] ;
+                            m_dpb->m_picSymFreeListNuma[dataNumaNode] =
+                                m_dpb->m_picSymFreeListNuma[dataNumaNode]->m_freeListNext ;
+                            frameEnc->reinit(m_sps) ;
+                            // printf("Worker threads on %d, recon frame data on %d\n",
+                            //    threadNumaNode, dataNumaNode) ;
+                            found = true ;
+                            break ;
+                        }
+                        checkingNumaNode = (checkingNumaNode+1) % numNumaNodes ;
+                    }
+                    X265_CHECK(found, "Should've found buffer for in frame!\n") ;
+                } else {
+                    frameEnc->m_encData = m_dpb->m_picSymFreeList;
+                    m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
+                    frameEnc->reinit(m_sps);
+                    dataNumaNode = frameEnc->m_encData->m_numaNode ;
+                }
+                if(dataNumaNode == threadNumaNode) {
+                    m_numSameNumaEncData ++ ;
+                } else {
+                    m_numDiffNumaEncData ++ ;
+                }
             }
             else
             {
-                frameEnc->allocEncodeData(m_param, m_sps);
+                m_numNewEncodeDataAlloc ++ ;
+                frameEnc->allocEncodeData(m_param, m_sps, curEncoder->m_pool->m_numaNode);
                 Slice* slice = frameEnc->m_encData->m_slice;
                 slice->m_sps = &m_sps;
                 slice->m_pps = &m_pps;
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/encoder.h
--- a/source/encoder/encoder.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/encoder.h	Tue Aug 04 16:10:01 2015 +0000
@@ -133,6 +133,9 @@
     bool               m_aborted;          // fatal error detected
     bool               m_reconfigured;      // reconfigure of encoder detected
 
+    uint32_t           m_numNewEncodeDataAlloc ;
+    uint32_t           m_numSameNumaEncData ;
+    uint32_t           m_numDiffNumaEncData ;
     Encoder();
     ~Encoder() {}
 
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/frameencoder.cpp	Tue Aug 04 16:10:01 2015 +0000
@@ -60,6 +60,11 @@
     m_ctuGeomMap = NULL;
     m_localTldIdx = 0;
     memset(&m_rce, 0, sizeof(RateControlEntry));
+
+    m_numRefFramesSameNuma = 0 ;
+    m_numRefFrameDiffNuma = 0 ;
+    m_numReconFramesSameNuma = 0 ;
+    m_numReconFramesDiffNuma = 0 ;
 }
 
 void FrameEncoder::destroy()
@@ -357,7 +362,15 @@
             WeightParam *w = NULL;
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
                 w = slice->m_weightPredTable[l][ref];
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
+            PicYuv* closestReconPic = slice->m_refPicList[l][ref]->m_reconPic ;
+            m_mref[l][ref].init(closestReconPic, w, *m_param);
+            if(m_param->printNumaStats) {
+                if(m_pool->m_numaNode != closestReconPic->m_numaNode) {
+                    m_numRefFrameDiffNuma ++ ;
+                } else {
+                    m_numRefFramesSameNuma ++ ;
+                }
+            }
         }
     }
 
@@ -932,6 +945,13 @@
 
         // Does all the CU analysis, returns best top level mode decision
         Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
+        if(m_param->printNumaStats) {
+            if(m_pool->m_numaNode != m_frame->m_reconPic->m_numaNode) {
+                m_numReconFramesDiffNuma ++ ;
+            } else {
+                m_numReconFramesSameNuma ++ ;
+            }
+        }
 
         // take a sample of the current active worker count
         ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
diff -r 0c1f9d982944 -r 0206efdac228 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/encoder/frameencoder.h	Tue Aug 04 16:10:01 2015 +0000
@@ -206,7 +206,16 @@
         WeightAnalysis operator=(const WeightAnalysis&);
     };
 
-protected:
+    unsigned int             m_numRefFramesSameNuma ;
+    unsigned int             m_numRefFrameDiffNuma ;
+    unsigned int             m_numReconFramesSameNuma ;
+    unsigned int             m_numReconFramesDiffNuma ;
+    unsigned int             getNumRefFramesSameNuma() { return m_numRefFramesSameNuma ; }
+    unsigned int             getNumRefFramesDiffNuma() { return m_numRefFrameDiffNuma ; }
+    unsigned int             getNumReconFramesSameNuma() { return m_numReconFramesSameNuma ; }
+    unsigned int             getNumReconFramesDiffNuma() { return m_numReconFramesDiffNuma ; }
+
+    protected:
 
     bool initializeGeoms();
 
diff -r 0c1f9d982944 -r 0206efdac228 source/x265.h
--- a/source/x265.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/x265.h	Tue Aug 04 16:10:01 2015 +0000
@@ -1172,6 +1172,11 @@
      * picture average light level (or 0). */
     const char* contentLightLevelInfo;
 
+    /* Print NUMA statistics collected from the code on the console to show the
+     * number of times the recon and ref pics were located on the same NUMA socket,
+     * and on different sockets */
+    int printNumaStats ;
+
 } x265_param;
 
 /* x265_param_alloc:
diff -r 0c1f9d982944 -r 0206efdac228 source/x265cli.h
--- a/source/x265cli.h	Tue Aug 04 15:37:26 2015 +0000
+++ b/source/x265cli.h	Tue Aug 04 16:10:01 2015 +0000
@@ -218,6 +218,7 @@
     { "no-temporal-layers",   no_argument, NULL, 0 },
     { "qg-size",        required_argument, NULL, 0 },
     { "recon-y4m-exec", required_argument, NULL, 0 },
+    { "print-numa-stats", no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -414,6 +415,7 @@
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
     H1("   --recon-y4m-exec <string>     pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
+    H1("   --print-numa-stats            print statistics related to socket information for ref and recon frames\n");
     H1("\nExecutable return codes:\n");
     H1("    0 - encode successful\n");
     H1("    1 - unable to parse command line\n");


More information about the x265-devel mailing list