[x265] [PATCH] Use atomic bit test and set/reset operations on x86

Andrey Semashev andrey.semashev at gmail.com
Wed Jan 10 17:06:29 CET 2018


On 01/10/18 18:53, chen wrote:
> Hi Andrey,
> 
> Our coding rules prohibit inline assembly, especially since the patch uses 
> GCC extension syntax.

Ok, I see.

> the "lock" prefix will lock the CPU bus, it will be greater penalty on 
> the multi-core system.

Just for the record, the lock prefix is implemented much more 
efficiently nowadays and involves CPU cache management rather than bus 
locking. It used to lock the memory bus on early CPUs (I want to say 
before Pentium, but I'm not sure which exact architecture changed this). 
In any case, the patch does not introduce new lock-prefixed instructions; 
it replaces the "lock; cmpxchg" loops that are normally generated for the 
atomic AND and OR operations with a single instruction.
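
To make the difference concrete, here is a minimal sketch (illustrative 
only, not code from the patch) of the two ways to atomically set one bit 
and learn its previous value; the function names are made up:

#include <stdint.h>

/* Portable fallback: when the old value is needed, GCC typically expands
 * this builtin into a "lock cmpxchg" retry loop on x86. */
static inline bool test_and_set_fallback(uint32_t* ptr, uint32_t bit)
{
    uint32_t mask = 1u << bit;
    return (__sync_fetch_and_or(ptr, mask) & mask) != 0;
}

/* Single-instruction version: "lock bts" sets the bit and leaves its
 * previous value in the carry flag, which setc copies into the result. */
static inline bool test_and_set_bts(uint32_t* ptr, uint32_t bit)
{
    bool res;
    __asm__ __volatile__
    (
        "lock; btsl %[bit], %[mem]\n\t"
        "setc %[res]\n\t"
        : [mem] "+m" (*ptr), [res] "=q" (res)
        : [bit] "r" (bit)
        : "memory"
    );
    return res;
}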

> At 2018-01-10 23:30:06, "Andrey Semashev" <andrey.semashev at gmail.com> wrote:
>>Any feedback on this one?
>>
>>I've been using it for quite some time locally. It does seem to work 
>>slightly faster on my Sandy Bridge machine (perhaps a few percent gain 
>>in fps, although I didn't save the benchmark numbers).
>>
>>On 01/01/18 15:28, Andrey Semashev wrote:
>>> # HG changeset patch
>>> # User Andrey Semashev <andrey.semashev at gmail.com>
>>> # Date 1514809583 -10800
>>> #      Mon Jan 01 15:26:23 2018 +0300
>>> # Branch atomic_bit_opsv2
>>> # Node ID 81529b6bd6adc8eb31162daeee44399dc1f95999
>>> # Parent  ff02513b92c000c3bb3dcc51deb79af57f5358d5
>>> Use atomic bit test and set/reset operations on x86.
>>> 
>>> The 'lock bts/btr' instructions are potentially more efficient than the
>>> 'lock cmpxchg' loops which are emitted to implement ATOMIC_AND and ATOMIC_OR
>>> on x86. The commit adds new macros ATOMIC_BTS and ATOMIC_BTR which atomically
>>> set/reset the specified bit in the integer and return the previous value of
>>> the modified bit.
>>> 
>>> Since in many places in the code the result is not needed, two more macros are
>>> provided as well: ATOMIC_BTS_VOID and ATOMIC_BTR_VOID. The effect of these
>>> macros is the same except that they don't return the previous value. These
>>> macros may generate slightly more efficient code.
>>> 
>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/threading.h
>>> --- a/source/common/threading.h	Fri Dec 22 18:23:24 2017 +0530
>>> +++ b/source/common/threading.h	Mon Jan 01 15:26:23 2018 +0300
>>> @@ -80,6 +80,91 @@
>>>   #define ATOMIC_ADD(ptr, val)  __sync_fetch_and_add((volatile int32_t*)ptr, val)
>>>   #define GIVE_UP_TIME()        usleep(0)
>>>   
>>> +#if defined(__x86_64__) || defined(__i386__)
>>> +
>>> +namespace X265_NS {
>>> +
>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_set_void(uint32_t* ptr, uint32_t bit)
>>> +{
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btsl %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +}
>>> +
>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_reset_void(uint32_t* ptr, uint32_t bit)
>>> +{
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btrl %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +}
>>> +
>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_set(uint32_t* ptr, uint32_t bit)
>>> +{
>>> +    bool res;
>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btsl %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#else
>>> +    res = false; // to avoid false dependency on the higher part of the result register
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btsl %[bit], %[mem]\n\t"
>>> +        "setc %[res]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#endif
>>> +    return res;
>>> +}
>>> +
>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_reset(uint32_t* ptr, uint32_t bit)
>>> +{
>>> +    bool res;
>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btrl %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#else
>>> +    res = false; // to avoid false dependency on the higher part of the result register
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btrl %[bit], %[mem]\n\t"
>>> +        "setc %[res]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#endif
>>> +    return res;
>>> +}
>>> +
>>> +}
>>> +
>>> +#define ATOMIC_BTS_VOID(ptr, bit)  atomic_bit_test_and_set_void((uint32_t*)(ptr), (bit))
>>> +#define ATOMIC_BTR_VOID(ptr, bit)  atomic_bit_test_and_reset_void((uint32_t*)(ptr), (bit))
>>> +#define ATOMIC_BTS(ptr, bit)  atomic_bit_test_and_set((uint32_t*)(ptr), (bit))
>>> +#define ATOMIC_BTR(ptr, bit)  atomic_bit_test_and_reset((uint32_t*)(ptr), (bit))
>>> +
>>> +#endif // defined(__x86_64__) || defined(__i386__)
>>> +
>>>   #elif defined(_MSC_VER)       /* Windows atomic intrinsics */
>>>   
>>>   #include <intrin.h>
>>> @@ -93,8 +178,26 @@
>>>   #define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
>>>   #define GIVE_UP_TIME()        Sleep(0)
>>>   
>>> +#if defined(_M_IX86) || defined(_M_X64)
>>> +#define ATOMIC_BTS(ptr, bit)  (!!_interlockedbittestandset((long*)(ptr), (bit)))
>>> +#define ATOMIC_BTR(ptr, bit)  (!!_interlockedbittestandreset((long*)(ptr), (bit)))
>>> +#endif // defined(_M_IX86) || defined(_M_X64)
>>> +
>>>   #endif // ifdef __GNUC__
>>>   
>>> +#if !defined(ATOMIC_BTS)
>>> +#define ATOMIC_BTS(ptr, bit)  (!!(ATOMIC_OR(ptr, (1u << (bit))) & (1u << (bit))))
>>> +#endif
>>> +#if !defined(ATOMIC_BTR)
>>> +#define ATOMIC_BTR(ptr, bit)  (!!(ATOMIC_AND(ptr, ~(1u << (bit))) & (1u << (bit))))
>>> +#endif
>>> +#if !defined(ATOMIC_BTS_VOID)
>>> +#define ATOMIC_BTS_VOID ATOMIC_BTS
>>> +#endif
>>> +#if !defined(ATOMIC_BTR_VOID)
>>> +#define ATOMIC_BTR_VOID ATOMIC_BTR
>>> +#endif
>>> +
>>>   namespace X265_NS {
>>>   // x265 private namespace
>>>   
>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/threadpool.cpp
>>> --- a/source/common/threadpool.cpp	Fri Dec 22 18:23:24 2017 +0530
>>> +++ b/source/common/threadpool.cpp	Mon Jan 01 15:26:23 2018 +0300
>>> @@ -37,14 +37,95 @@
>>>   #ifdef __GNUC__
>>>   
>>>   #define SLEEPBITMAP_CTZ(id, x)     id = (unsigned long)__builtin_ctzll(x)
>>> -#define SLEEPBITMAP_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
>>> -#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
>>> +
>>> +namespace X265_NS {
>>> +
>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_set64_void(uint64_t* ptr, uint64_t bit)
>>> +{
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btsq %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +}
>>> +
>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_reset64_void(uint64_t* ptr, uint64_t bit)
>>> +{
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btrq %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +}
>>> +
>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_set64(uint64_t* ptr, uint64_t bit)
>>> +{
>>> +    bool res;
>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btsq %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#else
>>> +    res = false; // to avoid false dependency on the higher part of the result register
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btsq %[bit], %[mem]\n\t"
>>> +        "setc %[res]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#endif
>>> +    return res;
>>> +}
>>> +
>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_reset64(uint64_t* ptr, uint64_t bit)
>>> +{
>>> +    bool res;
>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btrq %[bit], %[mem]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#else
>>> +    res = false; // to avoid false dependency on the higher part of the result register
>>> +    __asm__ __volatile__
>>> +    (
>>> +        "lock; btrq %[bit], %[mem]\n\t"
>>> +        "setc %[res]\n\t"
>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)
>>> +        : [bit] "Kq" (bit)
>>> +        : "memory"
>>> +    );
>>> +#endif
>>> +    return res;
>>> +}
>>> +
>>> +}
>>> +
>>> +#define SLEEPBITMAP_BTS(ptr, bit)  atomic_bit_test_and_set64((uint64_t*)(ptr), (bit))
>>> +#define SLEEPBITMAP_BTR(ptr, bit)  atomic_bit_test_and_reset64((uint64_t*)(ptr), (bit))
>>> +#define SLEEPBITMAP_BTS_VOID(ptr, bit)  atomic_bit_test_and_set64_void((uint64_t*)(ptr), (bit))
>>> +#define SLEEPBITMAP_BTR_VOID(ptr, bit)  atomic_bit_test_and_reset64_void((uint64_t*)(ptr), (bit))
>>>   
>>>   #elif defined(_MSC_VER)
>>>   
>>>   #define SLEEPBITMAP_CTZ(id, x)     _BitScanForward64(&id, x)
>>> -#define SLEEPBITMAP_OR(ptr, mask)  InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)
>>> -#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)
>>> +#define SLEEPBITMAP_BTS(ptr, bit)  (!!_interlockedbittestandset64((__int64*)(ptr), (bit)))
>>> +#define SLEEPBITMAP_BTR(ptr, bit)  (!!_interlockedbittestandreset64((__int64*)(ptr), (bit)))
>>> +#define SLEEPBITMAP_BTS_VOID SLEEPBITMAP_BTS
>>> +#define SLEEPBITMAP_BTR_VOID SLEEPBITMAP_BTR
>>>   
>>>   #endif // ifdef __GNUC__
>>>   
>>> @@ -52,8 +133,10 @@
>>>   
>>>   /* use 32-bit primitives defined in threading.h */
>>>   #define SLEEPBITMAP_CTZ CTZ
>>> -#define SLEEPBITMAP_OR  ATOMIC_OR
>>> -#define SLEEPBITMAP_AND ATOMIC_AND
>>> +#define SLEEPBITMAP_BTS ATOMIC_BTS
>>> +#define SLEEPBITMAP_BTR ATOMIC_BTR
>>> +#define SLEEPBITMAP_BTS_VOID ATOMIC_BTS_VOID
>>> +#define SLEEPBITMAP_BTR_VOID ATOMIC_BTR_VOID
>>>   
>>>   #endif
>>>   
>>> @@ -123,12 +206,12 @@
>>>   
>>>       m_pool.setCurrentThreadAffinity();
>>>   
>>> -    sleepbitmap_t idBit = (sleepbitmap_t)1 << m_id;
>>> +    int idBit = m_id;
>>>       m_curJobProvider = m_pool.m_jpTable[0];
>>>       m_bondMaster = NULL;
>>>   
>>> -    SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
>>> -    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
>>> +    SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
>>> +    SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
>>>       m_wakeEvent.wait();
>>>   
>>>       while (m_pool.m_isActive)
>>> @@ -162,21 +245,21 @@
>>>               }
>>>               if (nextProvider != -1 && m_curJobProvider != m_pool.m_jpTable[nextProvider])
>>>               {
>>> -                SLEEPBITMAP_AND(&m_curJobProvider->m_ownerBitmap, ~idBit);
>>> +                SLEEPBITMAP_BTR_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
>>>                   m_curJobProvider = m_pool.m_jpTable[nextProvider];
>>> -                SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
>>> +                SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
>>>               }
>>>           }
>>>           while (m_curJobProvider->m_helpWanted);
>>>   
>>>           /* While the worker sleeps, a job-provider or bond-group may acquire this
>>> -         * worker's sleep bitmap bit. Once acquired, that thread may modify
>>> +         * worker's sleep bitmap bit. Once acquired, that thread may modify
>>>            * m_bondMaster or m_curJobProvider, then waken the thread */
>>> -        SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
>>> +        SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
>>>           m_wakeEvent.wait();
>>>       }
>>>   
>>> -    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
>>> +    SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
>>>   }
>>>   
>>>   void JobProvider::tryWakeOne()
>>> @@ -191,10 +274,9 @@
>>>       WorkerThread& worker = m_pool->m_workers[id];
>>>       if (worker.m_curJobProvider != this) /* poaching */
>>>       {
>>> -        sleepbitmap_t bit = (sleepbitmap_t)1 << id;
>>> -        SLEEPBITMAP_AND(&worker.m_curJobProvider->m_ownerBitmap, ~bit);
>>> +        SLEEPBITMAP_BTR_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);
>>>           worker.m_curJobProvider = this;
>>> -        SLEEPBITMAP_OR(&worker.m_curJobProvider->m_ownerBitmap, bit);
>>> +        SLEEPBITMAP_BTS_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);
>>>       }
>>>       worker.awaken();
>>>   }
>>> @@ -208,8 +290,7 @@
>>>       {
>>>           SLEEPBITMAP_CTZ(id, masked);
>>>   
>>> -        sleepbitmap_t bit = (sleepbitmap_t)1 << id;
>>> -        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
>>> +        if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))
>>>               return (int)id;
>>>   
>>>           masked = m_sleepBitmap & firstTryBitmap;
>>> @@ -220,8 +301,7 @@
>>>       {
>>>           SLEEPBITMAP_CTZ(id, masked);
>>>   
>>> -        sleepbitmap_t bit = (sleepbitmap_t)1 << id;
>>> -        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
>>> +        if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))
>>>               return (int)id;
>>>   
>>>           masked = m_sleepBitmap & secondTryBitmap;
>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/wavefront.cpp
>>> --- a/source/common/wavefront.cpp	Fri Dec 22 18:23:24 2017 +0530
>>> +++ b/source/common/wavefront.cpp	Mon Jan 01 15:26:23 2018 +0300
>>> @@ -66,14 +66,14 @@
>>>   
>>>   void WaveFront::enqueueRow(int row)
>>>   {
>>> -    uint32_t bit = 1 << (row & 31);
>>> -    ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
>>> +    uint32_t bit = row & 31;
>>> +    ATOMIC_BTS_VOID(&m_internalDependencyBitmap[row >> 5], bit);
>>>   }
>>>   
>>>   void WaveFront::enableRow(int row)
>>>   {
>>> -    uint32_t bit = 1 << (row & 31);
>>> -    ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
>>> +    uint32_t bit = row & 31;
>>> +    ATOMIC_BTS_VOID(&m_externalDependencyBitmap[row >> 5], bit);
>>>   }
>>>   
>>>   void WaveFront::enableAllRows()
>>> @@ -83,8 +83,8 @@
>>>   
>>>   bool WaveFront::dequeueRow(int row)
>>>   {
>>> -    uint32_t bit = 1 << (row & 31);
>>> -    return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
>>> +    uint32_t bit = row & 31;
>>> +    return ATOMIC_BTR(&m_internalDependencyBitmap[row >> 5], bit);
>>>   }
>>>   
>>>   void WaveFront::findJob(int threadId)
>>> @@ -99,8 +99,7 @@
>>>           {
>>>               CTZ(id, oldval);
>>>   
>>> -            uint32_t bit = 1 << id;
>>> -            if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
>>> +            if (ATOMIC_BTR(&m_internalDependencyBitmap[w], id))
>>>               {
>>>                   /* we cleared the bit, we get to process the row */
>>>                   processRow(w * 32 + id, threadId);
>>> 
>>
> 
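
For reference, a minimal usage sketch of the new macros (illustrative only; 
markRowReady/claimRow are made-up names mirroring the enqueueRow/dequeueRow 
pattern in wavefront.cpp, and the include is hypothetical):

#include <stdint.h>
#include "threading.h" /* x265 common/threading.h defines the new macros */

static uint32_t dependencyBitmap[4]; /* stands in for m_internalDependencyBitmap */

void markRowReady(int row)
{
    /* Set bit (row & 31) of word (row >> 5); the previous bit value is not
     * needed, so the _VOID form lets the compiler skip reading the flags. */
    ATOMIC_BTS_VOID(&dependencyBitmap[row >> 5], row & 31);
}

bool claimRow(int row)
{
    /* Atomically clear the bit and return whether it was previously set,
     * i.e. whether this caller won the right to process the row. */
    return ATOMIC_BTR(&dependencyBitmap[row >> 5], row & 31);
}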


