[x265] x265-devel Digest, Vol 28, Issue 8

Mon Sep 7 12:51:35 CEST 2015

Thanks Min,
I will send the modified patch soon.

Thank you
Regards
Ramya

On Fri, Sep 4, 2015 at 1:34 AM, <x265-devel-request at videolan.org> wrote:

> Send x265-devel mailing list submissions to
>         x265-devel at videolan.org
>
> To subscribe or unsubscribe via the World Wide Web, visit
>         https://mailman.videolan.org/listinfo/x265-devel
> or, via email, send a message with subject or body 'help' to
>         x265-devel-request at videolan.org
>
> You can reach the person managing the list at
>         x265-devel-owner at videolan.org
>
> When replying, please edit your Subject line so it is more specific
> than "Re: Contents of x265-devel digest..."
>
>
> Today's Topics:
>
>    1. Re: [PATCH 2 of 2] asm: fix sse_pp[32x64] sse2 asm for 12 bit
>       (chen)
>    2. Re: [PATCH] Performance: Don't split threads into per-NUMA
>       pools unless specified in cli (Steve Borho)
>
>
> ----------------------------------------------------------------------
>
> Message: 1
> Date: Fri, 4 Sep 2015 03:32:59 +0800 (CST)
> From: chen  <chenm003 at 163.com>
> To: "Development for x265" <x265-devel at videolan.org>
> Subject: Re: [x265] [PATCH 2 of 2] asm: fix sse_pp[32x64] sse2 asm for
>         12 bit
> Message-ID: <252ec460.2.14f94b2a4c5.Coremail.chenm003 at 163.com>
> Content-Type: text/plain; charset="gbk"
>
>
>
>
> At 2015-09-03 16:22:53,ramya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Ramya Sriraman <ramya at multicorewareinc.com>
> ># Date 1441088473 -19800
> >#      Tue Sep 01 11:51:13 2015 +0530
> ># Node ID 8864f6a4d9ce4c1aa3c1b3c934befffb616ace35
> ># Parent  83dc8aea6ba7c10e0d78ec7dc34b3d8f7d114563
> >asm: fix sse_pp[32x64] sse2 asm for 12 bit
> >
> >diff -r 83dc8aea6ba7 -r 8864f6a4d9ce source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp     Wed Aug 26 17:06:25 2015
> +0530
> >+++ b/source/common/x86/asm-primitives.cpp     Tue Sep 01 11:51:13 2015
> +0530
> >@@ -1001,11 +1001,11 @@
> >         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp =
> (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
> >         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp =
> (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
> >         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp =
> (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
> >-
> >+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp =
> (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
> >+
> > #if X265_DEPTH <= 10
> >         p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
> >         ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
> >-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp =
> (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
> > #endif
> >
> >         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
> >diff -r 83dc8aea6ba7 -r 8864f6a4d9ce source/common/x86/ssd-a.asm
> >--- a/source/common/x86/ssd-a.asm      Wed Aug 26 17:06:25 2015 +0530
> >+++ b/source/common/x86/ssd-a.asm      Tue Sep 01 11:51:13 2015 +0530
> >@@ -125,6 +125,61 @@
> >     RET
> > %endmacro
> >
> >+; Function to find ssd for 32x16 block, sse2, 12 bit depth
> >+; Defined separately to be called from SSD_ONE_32 macro
> >+INIT_XMM sse2
> >+cglobal ssd_ss_32x16
> >+    pxor    m8, m8
> >+    mov     r4d, 16
> >+.loop:
> >+    movu        m0, [r0]
> >+    movu        m1, [r0+mmsize]
> >+    movu        m2, [r0+2*mmsize]
> >+    movu        m3, [r0+3*mmsize]
> >+    movu        m4, [r2]
> >+    movu        m5, [r2+mmsize]
> >+    movu        m6, [r2+2*mmsize]
> >+    movu        m7, [r2+3*mmsize]
> >+    psubw       m0, m4
> >+    psubw       m1, m5
> >+    psubw       m2, m6
> >+    psubw       m3, m7
> >+    lea         r0, [r0+r1]
> >+    lea         r2, [r2+r3]
> >+    pmaddwd     m0, m0
> >+    pmaddwd     m1, m1
> >+    pmaddwd     m2, m2
> >+    pmaddwd     m3, m3
> >+    paddd       m2, m3
> >+    paddd       m1, m2
> >+    paddd       m0, m1
> >+    paddd       m8, m0
> 2 \
> 3 + - 0 - 1 - 8
> The dependency chain above is a little long; try the one below
> 0 \
> 1 + - + 8
> 2 + /
> 3 /
>
> -------------- next part --------------
> An HTML attachment was scrubbed...
> URL: <
> http://mailman.videolan.org/pipermail/x265-devel/attachments/20150904/677ab3ea/attachment-0001.html
> >
>
> ------------------------------
>
> Message: 2
> Date: Thu, 3 Sep 2015 15:05:52 -0500
> From: Steve Borho <steve at borho.org>
> To: Development for x265 <x265-devel at videolan.org>
> Subject: Re: [x265] [PATCH] Performance: Don't split threads into
>         per-NUMA pools unless specified in cli
> Message-ID: <20150903200552.GC6293 at borho.org>
> Content-Type: text/plain; charset=us-ascii
>
> On 09/03, pradeep at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Pradeep Ramachandran <pradeep at multicorewareinc.com>
> > # Date 1441271466 -19800
> > #      Thu Sep 03 14:41:06 2015 +0530
> > # Node ID 2421fae1de247cb6758c0ad2dbb4bd5f45f23727
> > # Parent  86e9bd7dd19278fceef65fc93a06dc8746ec9daf
> > Performance: Don't split threads into per-NUMA pools unless specified in
> cli
> >
> > Fixes the threadpools to store a numa node mask instead of a numa node
> id to
> > use libnuma functionality, and changes default thread pool behavior to
> use one
> > monolithic pool
> >
> > We see gains of 10%+ on Intel Xeon E5-2666v3, E5-2640 v2 machines for 4K
> videos
> > encoding at an ABR of 15Mbps in slower and veryslow settings. Ultrafast
> shows a
> > perf dip of ~8% on the same machines. So when running in ultrafast mode,
> explicitly
> > specify # threads with the --pools command (--pools 18,18 on the
> E5-2666v3 which
> > has two sockets of 18 threads each, for example)
>
> I haven't verified it by running it, but it LGTM
>
> > diff -r 86e9bd7dd192 -r 2421fae1de24 doc/reST/cli.rst
> > --- a/doc/reST/cli.rst        Tue Sep 01 17:06:05 2015 +0530
> > +++ b/doc/reST/cli.rst        Thu Sep 03 14:41:06 2015 +0530
> > @@ -202,15 +202,29 @@
> >       "-"       - same as "none"
> >       "10"      - allocate one pool, using up to 10 cores on node 0
> >       "-,+"     - allocate one pool, using all cores on node 1
> > -     "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
> > -     "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
> > -     "-,*"     - allocate three pools, using all cores on nodes 1, 2
> and 3
> > +     "+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
> > +     "+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
> > +     "-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
> >       "8,8,8,8" - allocate four pools with up to 8 threads in each pool
> > +     "8,+,+,+" - allocate two pools, the first with 8 threads on node
> 0, and the second with all cores on node 1,2,3
> >
> > -     The total number of threads will be determined by the number of
> threads
> > -     assigned to all nodes. The worker threads will each be given
> affinity for
> > -     their node, they will not be allowed to migrate between nodes, but
> they
> > -     will be allowed to move between CPU cores within their node.
> > +     A thread pool dedicated to a given NUMA node is enabled only when
> the
> > +     number of threads to be created on that NUMA node is explicitly
> mentioned
> > +     in that corresponding position with the --pools option. Else, all
> threads
> > +     are spawned from a single pool. The total number of threads will be
> > +     determined by the number of threads assigned to the enabled NUMA
> nodes for
> > +     that pool. The worker threads are given affinity to all the
> enabled
> > +     NUMA nodes for that pool and may migrate between them, unless
> explicitly
> > +     specified as described above.
> > +
> > +     In the case that any threadpool has more than 64 threads, the
> threadpool
> > +     may be broken down into multiple pools of 64 threads each; on
> 32-bit
> > +     machines, this number is 32. All pools are given affinity to the
> NUMA
> > +     nodes on which the original pool had affinity. For performance
> reasons,
> > +     the last thread pool is spawned only if it has more than 32
> threads for
> > +     64-bit machines, or 16 for 32-bit machines. If the total number of
> threads
> > +     in the system doesn't obey this constraint, we may spawn fewer
> threads
> > +     than cores, which has been empirically shown to be better for
> performance.
> >
> >       If the four pool features: :option:`--wpp`, :option:`--pmode`,
> >       :option:`--pme` and :option:`--lookahead-slices` are all disabled,
> > @@ -219,10 +233,6 @@
> >       If "none" is specified, then all four of the thread pool features
> are
> >       implicitly disabled.
> >
> > -     Multiple thread pools will be allocated for any NUMA node with
> more than
> > -     64 logical CPU cores. But any given thread pool will always use at
> most
> > -     one NUMA node.
> > -
> >       Frame encoders are distributed between the available thread pools,
> >       and the encoder will never generate more thread pools than
> >       :option:`--frame-threads`.  The pools are used for WPP and for
> > @@ -238,8 +248,12 @@
> >       system, a POSIX build of libx265 without libnuma will be less work
> >       efficient. See :ref:`thread pools <pools>` for more detail.
> >
> > -     Default "", one thread is allocated per detected hardware thread
> > -     (logical CPU cores) and one thread pool per NUMA node.
> > +     Default "", one pool is created across all available NUMA nodes,
> with
> > +     one thread allocated per detected hardware thread
> > +     (logical CPU cores). In the case that the total number of threads
> is more
> > +     than the maximum size that ATOMIC operations can handle (32 for
> 32-bit
> > +     compiles, and 64 for 64-bit compiles), multiple thread pools may be
> > +     spawned subject to the performance constraint described above.
> >
> >       Note that the string value will need to be escaped or quoted to
> >       protect against shell expansion on many platforms
> > diff -r 86e9bd7dd192 -r 2421fae1de24 source/common/threadpool.cpp
> > --- a/source/common/threadpool.cpp    Tue Sep 01 17:06:05 2015 +0530
> > +++ b/source/common/threadpool.cpp    Thu Sep 03 14:41:06 2015 +0530
> > @@ -226,8 +226,13 @@
> >  {
> >      enum { MAX_NODE_NUM = 127 };
> >      int cpusPerNode[MAX_NODE_NUM + 1];
> > +    int threadsPerPool[MAX_NODE_NUM + 2];
> > +    uint32_t nodeMaskPerPool[MAX_NODE_NUM +2];
> >
> >      memset(cpusPerNode, 0, sizeof(cpusPerNode));
> > +    memset(threadsPerPool, 0, sizeof(threadsPerPool));
> > +    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
> > +
> >      int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
> >      int cpuCount = getCpuCount();
> >      bool bNumaSupport = false;
> > @@ -258,7 +263,7 @@
> >          for (int i = 0; i < numNumaNodes; i++)
> >              x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d
> logical cores\n", i, cpusPerNode[i]);
> >
> > -    /* limit nodes based on param->numaPools */
> > +    /* limit threads based on param->numaPools */
> >      if (p->numaPools && *p->numaPools)
> >      {
> >          const char *nodeStr = p->numaPools;
> > @@ -266,19 +271,30 @@
> >          {
> >              if (!*nodeStr)
> >              {
> > -                cpusPerNode[i] = 0;
> > +                threadsPerPool[i] = 0;
> >                  continue;
> >              }
> >              else if (*nodeStr == '-')
> > -                cpusPerNode[i] = 0;
> > +                threadsPerPool[i] = 0;
> >              else if (*nodeStr == '*')
> > +            {
> > +                for (int j = i; j < numNumaNodes; j++)
> > +                {
> > +                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
> > +                    nodeMaskPerPool[numNumaNodes] |= (1U << j);
> > +                }
> >                  break;
> > +            }
> >              else if (*nodeStr == '+')
> > -                ;
> > +            {
> > +                threadsPerPool[numNumaNodes] += cpusPerNode[i];
> > +                nodeMaskPerPool[numNumaNodes] = (1U << i);
> > +            }
> >              else
> >              {
> >                  int count = atoi(nodeStr);
> > -                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
> > +                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
> > +                nodeMaskPerPool[i] = (1U << i);
> >              }
> >
> >              /* consume current node string, comma, and white-space */
> > @@ -288,24 +304,31 @@
> >                 ++nodeStr;
> >          }
> >      }
> > -
> > -    // In the case that numa is disabled and we have more CPUs than 64,
> > -    // spawn the last pool only if the # threads in that pool is > 1/2
> max (heuristic)
> > -    if ((numNumaNodes == 1) &&
> > -        (cpusPerNode[0] > MAX_POOL_THREADS) &&
> > -        (cpusPerNode[0] % MAX_POOL_THREADS < (MAX_POOL_THREADS / 2)))
> > +    else
> >      {
> > -        cpusPerNode[0] -= (cpusPerNode[0] % MAX_POOL_THREADS);
> > -        x265_log(p, X265_LOG_DEBUG, "Creating only %d worker threads to
> prevent asymmetry in pools; may not use all HW contexts\n", cpusPerNode[0]);
> > +        for (int i = 0; i < numNumaNodes; i++)
> > +        {
> > +            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
> > +            nodeMaskPerPool[numNumaNodes] |= (1U << i);
> > +        }
> > +    }
> > +
> > +    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn
> thread pools only of size >= 1/2 max (heuristic)
> > +    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
> > +        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) <
> (MAX_POOL_THREADS / 2)))
> > +    {
> > +        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] %
> MAX_POOL_THREADS);
> > +        x265_log(p, X265_LOG_DEBUG,
> > +                 "Creating only %d worker threads beyond specified
> numbers with --pools (if specified) to prevent asymmetry in pools; may not
> use all HW contexts\n", threadsPerPool[numNumaNodes]);
> >      }
> >
> >      numPools = 0;
> > -    for (int i = 0; i < numNumaNodes; i++)
> > +    for (int i = 0; i < numNumaNodes + 1; i++)
> >      {
> >          if (bNumaSupport)
> >              x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d
> logical cores\n", i, cpusPerNode[i]);
> > -        if (cpusPerNode[i])
> > -            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) /
> MAX_POOL_THREADS;
> > +        if (threadsPerPool[i])
> > +            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) /
> MAX_POOL_THREADS;
> >      }
> >
> >      if (!numPools)
> > @@ -324,20 +347,20 @@
> >          int node = 0;
> >          for (int i = 0; i < numPools; i++)
> >          {
> > -            while (!cpusPerNode[node])
> > +            while (!threadsPerPool[node])
> >                  node++;
> > -            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
> > -            if (!pools[i].create(cores, maxProviders, node))
> > +            int numThreads = X265_MIN(MAX_POOL_THREADS,
> threadsPerPool[node]);
> > +            if (!pools[i].create(numThreads, maxProviders,
> nodeMaskPerPool[node]))
> >              {
> >                  X265_FREE(pools);
> >                  numPools = 0;
> >                  return NULL;
> >              }
> >              if (numNumaNodes > 1)
> > -                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d
> threads on NUMA node %d\n", i, cores, node);
> > +                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d
> threads with NUMA node mask %lx\n", i, numThreads, nodeMaskPerPool[node]);
> >              else
> > -                x265_log(p, X265_LOG_INFO, "Thread pool created using
> %d threads\n", cores);
> > -            cpusPerNode[node] -= cores;
> > +                x265_log(p, X265_LOG_INFO, "Thread pool created using
> %d threads\n", numThreads);
> > +            threadsPerPool[node] -= numThreads;
> >          }
> >      }
> >      else
> > @@ -350,11 +373,27 @@
> >      memset(this, 0, sizeof(*this));
> >  }
> >
> > -bool ThreadPool::create(int numThreads, int maxProviders, int node)
> > +bool ThreadPool::create(int numThreads, int maxProviders, uint32_t
> nodeMask)
> >  {
> >      X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool
> cannot have more than MAX_POOL_THREADS threads\n");
> >
> > -    m_numaNode = node;
> > +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
> > +    m_winNodemask = nodeMask & ~(0x1 << getNumaNodeCount());
> > +    m_numaNodeMask = &m_winNodemask;
> > +#elif HAVE_LIBNUMA
> > +    if (numa_available() >= 0)
> > +    {
> > +        struct bitmask* nodemask = numa_allocate_nodemask();
> > +        if (nodemask)
> > +        {
> > +            *(nodemask->maskp) = nodeMask;
> > +            m_numaNodeMask = nodemask;
> > +        }
> > +        else
> > +            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node
> mask for %lx\n", nodeMask);
> > +    }
> > +#endif
> > +
> >      m_numWorkers = numThreads;
> >
> >      m_workers = X265_MALLOC(WorkerThread, numThreads);
> > @@ -408,36 +447,37 @@
> >
> >      X265_FREE(m_workers);
> >      X265_FREE(m_jpTable);
> > +
> > +#if HAVE_LIBNUMA
> > +    if(m_numaNodeMask)
> > +        numa_free_nodemask((struct bitmask*)m_numaNodeMask);
> > +#endif
> >  }
> >
> >  void ThreadPool::setCurrentThreadAffinity()
> >  {
> > -    setThreadNodeAffinity(m_numaNode);
> > +    setThreadNodeAffinity(m_numaNodeMask);
> >  }
> >
> >  /* static */
> > -void ThreadPool::setThreadNodeAffinity(int numaNode)
> > +void ThreadPool::setThreadNodeAffinity(void *numaNodeMask)
> >  {
> >  #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
> > -    GROUP_AFFINITY groupAffinity;
> > -    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
> > -    {
> > -        if (SetThreadAffinityMask(GetCurrentThread(),
> (DWORD_PTR)groupAffinity.Mask))
> > -            return;
> > -    }
> > -    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to
> NUMA node %d\n", numaNode);
> > +    if (SetThreadAffinityMask(GetCurrentThread(),
> (DWORD_PTR)(*((DWORD*)numaNodeMask))))
> > +        return;
> > +    else
> > +        x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity
> for NUMA node mask\n");
> >  #elif HAVE_LIBNUMA
> >      if (numa_available() >= 0)
> >      {
> > -        numa_run_on_node(numaNode);
> > -        numa_set_preferred(numaNode);
> > +        numa_run_on_node_mask((struct bitmask*)numaNodeMask);
> > +        numa_set_interleave_mask((struct bitmask*)numaNodeMask);
> >          numa_set_localalloc();
> >          return;
> >      }
> > -    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to
> NUMA node %d\n", numaNode);
> > -#else
> > -    (void)numaNode;
> > +    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for
> NUMA node mask\n");
> >  #endif
> > +    return;
> >  }
> >
> >  /* static */
> > diff -r 86e9bd7dd192 -r 2421fae1de24 source/common/threadpool.h
> > --- a/source/common/threadpool.h      Tue Sep 01 17:06:05 2015 +0530
> > +++ b/source/common/threadpool.h      Thu Sep 03 14:41:06 2015 +0530
> > @@ -83,7 +83,10 @@
> >      sleepbitmap_t m_sleepBitmap;
> >      int           m_numProviders;
> >      int           m_numWorkers;
> > -    int           m_numaNode;
> > +    void*         m_numaNodeMask;
> > +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
> > +    DWORD         m_winNodemask;
> > +#endif
> >      bool          m_isActive;
> >
> >      JobProvider** m_jpTable;
> > @@ -92,7 +95,7 @@
> >      ThreadPool();
> >      ~ThreadPool();
> >
> > -    bool create(int numThreads, int maxProviders, int node);
> > +    bool create(int numThreads, int maxProviders, uint32_t nodeMask);
> >      bool start();
> >      void stopWorkers();
> >      void setCurrentThreadAffinity();
> > @@ -103,7 +106,7 @@
> >
> >      static int  getCpuCount();
> >      static int  getNumaNodeCount();
> > -    static void setThreadNodeAffinity(int node);
> > +    static void setThreadNodeAffinity(void *numaNodeMask);
> >  };
> >
> >  /* Any worker thread may enlist the help of idle worker threads from
> the same
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
>
>
> ------------------------------
>
> Subject: Digest Footer
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
> ------------------------------
>
> End of x265-devel Digest, Vol 28, Issue 8
> *****************************************
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150907/e6cbee98/attachment-0001.html>