[x265] [PATCH] Use atomic bit test and set/reset operations on x86

Andrey Semashev andrey.semashev at gmail.com
Wed Nov 16 13:09:42 CET 2016


# HG changeset patch
# User Andrey Semashev <andrey.semashev at gmail.com>
# Date 1475709624 -10800
#      Thu Oct 06 02:20:24 2016 +0300
# Branch atomic_bit_ops
# Node ID 539c83edceb540e7785719876a161a48382b4c09
# Parent  af3678bc1dff6eb3df5879ac49fdb532ce8bd6ac
Use atomic bit test and set/reset operations on x86.

The 'lock bts/btr' instructions are potentially more efficient than the
'lock cmpxchg' loops that are emitted to implement ATOMIC_AND and
ATOMIC_OR on x86. This commit adds new macros, ATOMIC_BTS and ATOMIC_BTR,
which atomically set/reset the specified bit in an integer and return the
previous value of that bit.
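
For context, this is roughly the retry loop that ATOMIC_OR (i.e.
__sync_fetch_and_or) expands to on x86 when a single read-modify-write
instruction cannot be used; an illustrative sketch, not code from the
x265 tree:

    #include <stdint.h>

    // Sketch: fetch-and-OR lowered to a 'lock cmpxchg' retry loop.
    // Every failed attempt costs another locked instruction, while
    // 'lock bts' needs exactly one regardless of contention.
    static uint32_t fetch_or_via_cmpxchg(uint32_t* ptr, uint32_t mask)
    {
        uint32_t old = *ptr, cur;
        while ((cur = __sync_val_compare_and_swap(ptr, old, old | mask)) != old)
            old = cur; // lost the race: retry with the freshly observed value
        return old;    // previous value, as with __sync_fetch_and_or
    }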

Since in many places in the code the result is not needed, two more macros
are provided as well: ATOMIC_BTS_VOID and ATOMIC_BTR_VOID. They have the
same effect except that they don't return the previous value, which lets
the compiler omit extracting the carry flag and may produce slightly more
efficient code.
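
As a standalone illustration of the 'lock bts' + 'setc' pattern used in
the patch below (a minimal test of my own, compilable with GCC or Clang
on x86; the bts32 helper is hypothetical and not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    // One 'lock bts' sets the bit and 'setc' captures its previous value.
    static inline bool bts32(uint32_t* ptr, uint32_t bit)
    {
        bool res = false; // avoid a false dependency on the result register
        __asm__ __volatile__
        (
            "lock; btsl %[bit], %[mem]\n\t"
            "setc %[res]\n\t"
            : [mem] "+m" (*ptr), [res] "+q" (res)
            : [bit] "Kq" (bit)
            : "memory"
        );
        return res;
    }

    int main()
    {
        uint32_t word = 0;
        printf("%d\n", bts32(&word, 3));  // prints 0: bit 3 was clear
        printf("%d\n", bts32(&word, 3));  // prints 1: bit 3 was already set
        printf("%#x\n", word);            // prints 0x8
        return 0;
    }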

diff -r af3678bc1dff -r 539c83edceb5 source/common/threading.h
--- a/source/common/threading.h	Wed Oct 05 11:58:49 2016 +0530
+++ b/source/common/threading.h	Thu Oct 06 02:20:24 2016 +0300
@@ -80,6 +80,91 @@
 #define ATOMIC_ADD(ptr, val)  __sync_fetch_and_add((volatile int32_t*)ptr, val)
 #define GIVE_UP_TIME()        usleep(0)
 
+#if defined(__x86_64__) || defined(__i386__)
+
+namespace X265_NS {
+
+inline __attribute__((always_inline)) void atomic_bit_test_and_set_void(uint32_t* ptr, uint32_t bit)
+{
+    __asm__ __volatile__
+    (
+        "lock; btsl %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+}
+
+inline __attribute__((always_inline)) void atomic_bit_test_and_reset_void(uint32_t* ptr, uint32_t bit)
+{
+    __asm__ __volatile__
+    (
+        "lock; btrl %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+}
+
+inline __attribute__((always_inline)) bool atomic_bit_test_and_set(uint32_t* ptr, uint32_t bit)
+{
+    bool res;
+#if defined(__GCC_ASM_FLAG_OUTPUTS__)
+    __asm__ __volatile__
+    (
+        "lock; btsl %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#else
+    res = false; // to avoid a false dependency on the upper bytes of the result register
+    __asm__ __volatile__
+    (
+        "lock; btsl %[bit], %[mem]\n\t"
+        "setc %[res]\n\t"
+        : [mem] "+m" (*ptr), [res] "+q" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#endif
+    return res;
+}
+
+inline __attribute__((always_inline)) bool atomic_bit_test_and_reset(uint32_t* ptr, uint32_t bit)
+{
+    bool res;
+#if defined(__GCC_ASM_FLAG_OUTPUTS__)
+    __asm__ __volatile__
+    (
+        "lock; btrl %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#else
+    res = false; // to avoid a false dependency on the upper bytes of the result register
+    __asm__ __volatile__
+    (
+        "lock; btrl %[bit], %[mem]\n\t"
+        "setc %[res]\n\t"
+        : [mem] "+m" (*ptr), [res] "+q" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#endif
+    return res;
+}
+
+} // namespace X265_NS
+
+#define ATOMIC_BTS_VOID(ptr, bit)  atomic_bit_test_and_set_void((uint32_t*)(ptr), (bit))
+#define ATOMIC_BTR_VOID(ptr, bit)  atomic_bit_test_and_reset_void((uint32_t*)(ptr), (bit))
+#define ATOMIC_BTS(ptr, bit)  atomic_bit_test_and_set((uint32_t*)(ptr), (bit))
+#define ATOMIC_BTR(ptr, bit)  atomic_bit_test_and_reset((uint32_t*)(ptr), (bit))
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
 #elif defined(_MSC_VER)       /* Windows atomic intrinsics */
 
 #include <intrin.h>
@@ -93,8 +178,26 @@
 #define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
 #define GIVE_UP_TIME()        Sleep(0)
 
+#if defined(_M_IX86) || defined(_M_X64)
+#define ATOMIC_BTS(ptr, bit)  (!!_interlockedbittestandset((long*)(ptr), (bit)))
+#define ATOMIC_BTR(ptr, bit)  (!!_interlockedbittestandreset((long*)(ptr), (bit)))
+#endif // defined(_M_IX86) || defined(_M_X64)
+
 #endif // ifdef __GNUC__
 
+#if !defined(ATOMIC_BTS)
+#define ATOMIC_BTS(ptr, bit)  (!!(ATOMIC_OR(ptr, (1u << (bit))) & (1u << (bit))))
+#endif
+#if !defined(ATOMIC_BTR)
+#define ATOMIC_BTR(ptr, bit)  (!!(ATOMIC_AND(ptr, ~(1u << (bit))) & (1u << (bit))))
+#endif
+#if !defined(ATOMIC_BTS_VOID)
+#define ATOMIC_BTS_VOID ATOMIC_BTS
+#endif
+#if !defined(ATOMIC_BTR_VOID)
+#define ATOMIC_BTR_VOID ATOMIC_BTR
+#endif
+
 namespace X265_NS {
 // x265 private namespace
 
diff -r af3678bc1dff -r 539c83edceb5 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Wed Oct 05 11:58:49 2016 +0530
+++ b/source/common/threadpool.cpp	Thu Oct 06 02:20:24 2016 +0300
@@ -37,14 +37,95 @@
 #ifdef __GNUC__
 
 #define SLEEPBITMAP_CTZ(id, x)     id = (unsigned long)__builtin_ctzll(x)
-#define SLEEPBITMAP_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
-#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
+
+namespace X265_NS {
+
+inline __attribute__((always_inline)) void atomic_bit_test_and_set64_void(uint64_t* ptr, uint64_t bit)
+{
+    __asm__ __volatile__
+    (
+        "lock; btsq %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+}
+
+inline __attribute__((always_inline)) void atomic_bit_test_and_reset64_void(uint64_t* ptr, uint64_t bit)
+{
+    __asm__ __volatile__
+    (
+        "lock; btrq %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+}
+
+inline __attribute__((always_inline)) bool atomic_bit_test_and_set64(uint64_t* ptr, uint64_t bit)
+{
+    bool res;
+#if defined(__GCC_ASM_FLAG_OUTPUTS__)
+    __asm__ __volatile__
+    (
+        "lock; btsq %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#else
+    res = false; // to avoid a false dependency on the upper bytes of the result register
+    __asm__ __volatile__
+    (
+        "lock; btsq %[bit], %[mem]\n\t"
+        "setc %[res]\n\t"
+        : [mem] "+m" (*ptr), [res] "+q" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#endif
+    return res;
+}
+
+inline __attribute__((always_inline)) bool atomic_bit_test_and_reset64(uint64_t* ptr, uint64_t bit)
+{
+    bool res;
+#if defined(__GCC_ASM_FLAG_OUTPUTS__)
+    __asm__ __volatile__
+    (
+        "lock; btrq %[bit], %[mem]\n\t"
+        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#else
+    res = false; // to avoid a false dependency on the upper bytes of the result register
+    __asm__ __volatile__
+    (
+        "lock; btrq %[bit], %[mem]\n\t"
+        "setc %[res]\n\t"
+        : [mem] "+m" (*ptr), [res] "+q" (res)
+        : [bit] "Kq" (bit)
+        : "memory"
+    );
+#endif
+    return res;
+}
+
+} // namespace X265_NS
+
+#define SLEEPBITMAP_BTS(ptr, bit)  atomic_bit_test_and_set64((uint64_t*)(ptr), (bit))
+#define SLEEPBITMAP_BTR(ptr, bit)  atomic_bit_test_and_reset64((uint64_t*)(ptr), (bit))
+#define SLEEPBITMAP_BTS_VOID(ptr, bit)  atomic_bit_test_and_set64_void((uint64_t*)(ptr), (bit))
+#define SLEEPBITMAP_BTR_VOID(ptr, bit)  atomic_bit_test_and_reset64_void((uint64_t*)(ptr), (bit))
 
 #elif defined(_MSC_VER)
 
 #define SLEEPBITMAP_CTZ(id, x)     _BitScanForward64(&id, x)
-#define SLEEPBITMAP_OR(ptr, mask)  InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)
-#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)
+#define SLEEPBITMAP_BTS(ptr, bit)  (!!_interlockedbittestandset64((__int64*)(ptr), (bit)))
+#define SLEEPBITMAP_BTR(ptr, bit)  (!!_interlockedbittestandreset64((__int64*)(ptr), (bit)))
+#define SLEEPBITMAP_BTS_VOID SLEEPBITMAP_BTS
+#define SLEEPBITMAP_BTR_VOID SLEEPBITMAP_BTR
 
 #endif // ifdef __GNUC__
 
@@ -52,8 +133,10 @@
 
 /* use 32-bit primitives defined in threading.h */
 #define SLEEPBITMAP_CTZ CTZ
-#define SLEEPBITMAP_OR  ATOMIC_OR
-#define SLEEPBITMAP_AND ATOMIC_AND
+#define SLEEPBITMAP_BTS ATOMIC_BTS
+#define SLEEPBITMAP_BTR ATOMIC_BTR
+#define SLEEPBITMAP_BTS_VOID ATOMIC_BTS_VOID
+#define SLEEPBITMAP_BTR_VOID ATOMIC_BTR_VOID
 
 #endif
 
@@ -120,12 +203,12 @@
 
     m_pool.setCurrentThreadAffinity();
 
-    sleepbitmap_t idBit = (sleepbitmap_t)1 << m_id;
+    int idBit = m_id;
     m_curJobProvider = m_pool.m_jpTable[0];
     m_bondMaster = NULL;
 
-    SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
-    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
+    SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
+    SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
     m_wakeEvent.wait();
 
     while (m_pool.m_isActive)
@@ -159,21 +242,21 @@
             }
             if (nextProvider != -1 && m_curJobProvider != m_pool.m_jpTable[nextProvider])
             {
-                SLEEPBITMAP_AND(&m_curJobProvider->m_ownerBitmap, ~idBit);
+                SLEEPBITMAP_BTR_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
                 m_curJobProvider = m_pool.m_jpTable[nextProvider];
-                SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
+                SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
             }
         }
         while (m_curJobProvider->m_helpWanted);
 
         /* While the worker sleeps, a job-provider or bond-group may acquire this
-         * worker's sleep bitmap bit. Once acquired, that thread may modify 
+         * worker's sleep bitmap bit. Once acquired, that thread may modify
          * m_bondMaster or m_curJobProvider, then waken the thread */
-        SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
+        SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
         m_wakeEvent.wait();
     }
 
-    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
+    SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
 }
 
 void JobProvider::tryWakeOne()
@@ -188,10 +271,9 @@
     WorkerThread& worker = m_pool->m_workers[id];
     if (worker.m_curJobProvider != this) /* poaching */
     {
-        sleepbitmap_t bit = (sleepbitmap_t)1 << id;
-        SLEEPBITMAP_AND(&worker.m_curJobProvider->m_ownerBitmap, ~bit);
+        SLEEPBITMAP_BTR_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);
         worker.m_curJobProvider = this;
-        SLEEPBITMAP_OR(&worker.m_curJobProvider->m_ownerBitmap, bit);
+        SLEEPBITMAP_BTS_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);
     }
     worker.awaken();
 }
@@ -205,8 +287,7 @@
     {
         SLEEPBITMAP_CTZ(id, masked);
 
-        sleepbitmap_t bit = (sleepbitmap_t)1 << id;
-        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
+        if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))
             return (int)id;
 
         masked = m_sleepBitmap & firstTryBitmap;
@@ -217,8 +298,7 @@
     {
         SLEEPBITMAP_CTZ(id, masked);
 
-        sleepbitmap_t bit = (sleepbitmap_t)1 << id;
-        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
+        if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))
             return (int)id;
 
         masked = m_sleepBitmap & secondTryBitmap;
@@ -259,7 +339,7 @@
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
     bool bNumaSupport = false;
 
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
     bNumaSupport = true;
 #elif HAVE_LIBNUMA
     bNumaSupport = numa_available() >= 0;
@@ -370,7 +450,7 @@
             nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
         }
     }
- 
+
     // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
     if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
         ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
@@ -442,7 +522,7 @@
 {
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
     memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
     for (int i = 0; i < getNumaNodeCount(); i++)
     {
@@ -535,7 +615,7 @@
 
 void ThreadPool::setThreadNodeAffinity(void *numaMask)
 {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
     UNREFERENCED_PARAMETER(numaMask);
     GROUP_AFFINITY groupAffinity;
     memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
@@ -564,7 +644,7 @@
 /* static */
 int ThreadPool::getNumaNodeCount()
 {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
     ULONG num = 1;
     if (GetNumaHighestNodeNumber(&num))
         num++;
diff -r af3678bc1dff -r 539c83edceb5 source/common/wavefront.cpp
--- a/source/common/wavefront.cpp	Wed Oct 05 11:58:49 2016 +0530
+++ b/source/common/wavefront.cpp	Thu Oct 06 02:20:24 2016 +0300
@@ -60,14 +60,14 @@
 
 void WaveFront::enqueueRow(int row)
 {
-    uint32_t bit = 1 << (row & 31);
-    ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
+    uint32_t bit = row & 31;
+    ATOMIC_BTS_VOID(&m_internalDependencyBitmap[row >> 5], bit);
 }
 
 void WaveFront::enableRow(int row)
 {
-    uint32_t bit = 1 << (row & 31);
-    ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
+    uint32_t bit = row & 31;
+    ATOMIC_BTS_VOID(&m_externalDependencyBitmap[row >> 5], bit);
 }
 
 void WaveFront::enableAllRows()
@@ -77,8 +77,8 @@
 
 bool WaveFront::dequeueRow(int row)
 {
-    uint32_t bit = 1 << (row & 31);
-    return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
+    uint32_t bit = row & 31;
+    return ATOMIC_BTR(&m_internalDependencyBitmap[row >> 5], bit);
 }
 
 void WaveFront::findJob(int threadId)
@@ -93,8 +93,7 @@
         {
             CTZ(id, oldval);
 
-            uint32_t bit = 1 << id;
-            if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
+            if (ATOMIC_BTR(&m_internalDependencyBitmap[w], id))
             {
                 /* we cleared the bit, we get to process the row */
                 processRow(w * 32 + id, threadId);

