At 2018-01-11 00:06:29, "Andrey Semashev" <andrey.semashev@gmail.com> wrote:
>On 01/10/18 18:53, chen wrote:
>> Hi Andrey,
>>
>> Our coding rules prohibit inline assembly, especially since the patch uses GCC
>> extension syntax.
>
>Ok, I see.
>
>> the "lock" prefix will lock the CPU bus, it will be greater penalty on
>> the multi-core system.
>
>Just for the record, the lock prefix is implemented much more
>efficiently nowadays and involves CPU cache management rather than bus
>locking. It used to lock the memory bus on early CPUs (I want to say
>before Pentium, but I'm not sure which exact architecture changed this).
>In any case, the patch does not introduce new lock instructions but it
>replaces "lock; cmpxchg" loops that are normally generated for the
>atomic AND and OR operations with a single instruction.

https://htor.inf.ethz.ch/publications/img/atomic-bench.pdf

In this paper, the authors show that locked operations (SWP) lose only a
little performance on modern CPUs, but they only tested systems with fewer
cores (the Xeon Phi shows a larger loss, and it is a single-socket CPU). On a
multi-socket system, the cache coherency maintenance will be very expensive.

However, the intrinsics may get more benefit from the compiler, which can
decide which method is the best choice on the target platform.

>
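As a rough illustration of that last point (this is not part of the patch and
the function names are hypothetical), the same bit operations could be written
with GCC's portable __atomic builtins, leaving the compiler free to pick the
best sequence for the target, e.g. a single "lock bts" on x86 or a
compare-and-swap loop elsewhere:

    #include <stdint.h>

    /* Sketch only: bit test-and-set/reset via builtins.
     * __atomic_fetch_or / __atomic_fetch_and return the previous value,
     * so masking that value yields the previous state of the bit. */
    static inline bool atomic_bts_intrinsic(uint32_t* ptr, uint32_t bit)
    {
        const uint32_t mask = 1u << bit;
        return (__atomic_fetch_or(ptr, mask, __ATOMIC_SEQ_CST) & mask) != 0;
    }

    static inline bool atomic_btr_intrinsic(uint32_t* ptr, uint32_t bit)
    {
        const uint32_t mask = 1u << bit;
        return (__atomic_fetch_and(ptr, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
    }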
>> At 2018-01-10 23:30:06, "Andrey Semashev" <andrey.semashev@gmail.com> wrote:
>>>Any feedback on this one?
>>>
>>>I've been using it for quite some time locally. It does seem to work
>>>slightly faster on my Sandy Bridge machine (it should be a few percent
>>>of gain in fps, although I didn't save the benchmark numbers).
>>>
>>>On 01/01/18 15:28, Andrey Semashev wrote:
>>>> # HG changeset patch
>>>> # User Andrey Semashev <andrey.semashev@gmail.com>
>>>> # Date 1514809583 -10800
>>>> # Mon Jan 01 15:26:23 2018 +0300
>>>> # Branch atomic_bit_opsv2
>>>> # Node ID 81529b6bd6adc8eb31162daeee44399dc1f95999
>>>> # Parent ff02513b92c000c3bb3dcc51deb79af57f5358d5
>>>> Use atomic bit test and set/reset operations on x86.
>>>>
>>>> The 'lock bts/btr' instructions are potentially more efficient than the
>>>> 'lock cmpxchg' loops which are emitted to implement ATOMIC_AND and ATOMIC_OR
>>>> on x86. The commit adds new macros ATOMIC_BTS and ATOMIC_BTR which atomically
>>>> set/reset the specified bit in the integer and return the previous value of
>>>> the modified bit.
>>>>
>>>> Since in many places of the code the result is not needed, two more macros are
>>>> provided as well: ATOMIC_BTS_VOID and ATOMIC_BTR_VOID. The effect of these
>>>> macros is the same except that they don't return the previous value. These
>>>> macros may generate a slightly more efficient code.
>>>>
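(Illustration only, not part of the patch: a hypothetical snippet contrasting
the old mask-based idiom with the new bit-index macros, mirroring the call-site
changes further down in the diff.)

    uint32_t flags = 0;

    /* old idiom: build a mask, OR it in atomically, test the returned old value */
    uint32_t mask = 1u << 5;
    bool wasSet = !!(ATOMIC_OR(&flags, mask) & mask);

    /* new idiom: pass the bit index, get back the previous value of that bit */
    bool wasSetNew = ATOMIC_BTS(&flags, 5);
    ATOMIC_BTR_VOID(&flags, 5); /* clear bit 5, previous value not needed */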
>>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/threading.h
>>>> --- a/source/common/threading.h Fri Dec 22 18:23:24 2017 +0530
>>>> +++ b/source/common/threading.h Mon Jan 01 15:26:23 2018 +0300
>>>> @@ -80,6 +80,91 @@
>>>> #define ATOMIC_ADD(ptr, val) __sync_fetch_and_add((volatile int32_t*)ptr, val)
>>>> #define GIVE_UP_TIME() usleep(0)
>>>>
>>>> +#if defined(__x86_64__) || defined(__i386__)
>>>> +
>>>> +namespace X265_NS {
>>>> +
>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_set_void(uint32_t* ptr, uint32_t bit)
>>>> +{
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btsl %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_reset_void(uint32_t* ptr, uint32_t bit)
>>>> +{
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btrl %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_set(uint32_t* ptr, uint32_t bit)
>>>> +{
>>>> + bool res;
>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btsl %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#else
>>>> + res = false; // to avoid false dependency on the higher part of the result register
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btsl %[bit], %[mem]\n\t"
>>>> + "setc %[res]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "+q" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#endif
>>>> + return res;
>>>> +}
>>>> +
>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_reset(uint32_t* ptr, uint32_t bit)
>>>> +{
>>>> + bool res;
>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btrl %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#else
>>>> + res = false; // to avoid false dependency on the higher part of the result register
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btrl %[bit], %[mem]\n\t"
>>>> + "setc %[res]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "+q" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#endif
>>>> + return res;
>>>> +}
>>>> +
>>>> +}
>>>> +
>>>> +#define ATOMIC_BTS_VOID(ptr, bit) atomic_bit_test_and_set_void((uint32_t*)(ptr), (bit))
>>>> +#define ATOMIC_BTR_VOID(ptr, bit) atomic_bit_test_and_reset_void((uint32_t*)(ptr), (bit))
>>>> +#define ATOMIC_BTS(ptr, bit) atomic_bit_test_and_set((uint32_t*)(ptr), (bit))
>>>> +#define ATOMIC_BTR(ptr, bit) atomic_bit_test_and_reset((uint32_t*)(ptr), (bit))
>>>> +
>>>> +#endif // defined(__x86_64__) || defined(__i386__)
>>>> +
>>>> #elif defined(_MSC_VER) /* Windows atomic intrinsics */
>>>>
>>>> #include <intrin.h>
>>>> @@ -93,8 +178,26 @@
>>>> #define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
>>>> #define GIVE_UP_TIME() Sleep(0)
>>>>
>>>> +#if defined(_M_IX86) || defined(_M_X64)
>>>> +#define ATOMIC_BTS(ptr, bit) (!!_interlockedbittestandset((long*)(ptr), (bit)))
>>>> +#define ATOMIC_BTR(ptr, bit) (!!_interlockedbittestandreset((long*)(ptr), (bit)))
>>>> +#endif // defined(_M_IX86) || defined(_M_X64)
>>>> +
>>>> #endif // ifdef __GNUC__
>>>>
>>>> +#if !defined(ATOMIC_BTS)
>>>> +#define ATOMIC_BTS(ptr, bit) (!!(ATOMIC_OR(ptr, (1u << (bit))) & (1u << (bit))))
>>>> +#endif
>>>> +#if !defined(ATOMIC_BTR)
>>>> +#define ATOMIC_BTR(ptr, bit) (!!(ATOMIC_AND(ptr, ~(1u << (bit))) & (1u << (bit))))
>>>> +#endif
>>>> +#if !defined(ATOMIC_BTS_VOID)
>>>> +#define ATOMIC_BTS_VOID ATOMIC_BTS
>>>> +#endif
>>>> +#if !defined(ATOMIC_BTR_VOID)
>>>> +#define ATOMIC_BTR_VOID ATOMIC_BTR
>>>> +#endif
>>>> +
>>>> namespace X265_NS {
>>>> // x265 private namespace
>>>>
>>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/threadpool.cpp
>>>> --- a/source/common/threadpool.cpp Fri Dec 22 18:23:24 2017 +0530
>>>> +++ b/source/common/threadpool.cpp Mon Jan 01 15:26:23 2018 +0300
>>>> @@ -37,14 +37,95 @@
>>>> #ifdef __GNUC__
>>>>
>>>> #define SLEEPBITMAP_CTZ(id, x) id = (unsigned long)__builtin_ctzll(x)
>>>> -#define SLEEPBITMAP_OR(ptr, mask) __sync_fetch_and_or(ptr, mask)
>>>> -#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
>>>> +
>>>> +namespace X265_NS {
>>>> +
>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_set64_void(uint64_t* ptr, uint64_t bit)
>>>> +{
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btsq %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_reset64_void(uint64_t* ptr, uint64_t bit)
>>>> +{
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btrq %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_set64(uint64_t* ptr, uint64_t bit)
>>>> +{
>>>> + bool res;
>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btsq %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#else
>>>> + res = false; // to avoid false dependency on the higher part of the result register
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btsq %[bit], %[mem]\n\t"
>>>> + "setc %[res]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "+q" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#endif
>>>> + return res;
>>>> +}
>>>> +
>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_reset64(uint64_t* ptr, uint64_t bit)
>>>> +{
>>>> + bool res;
>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btrq %[bit], %[mem]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "=@ccc" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#else
>>>> + res = false; // to avoid false dependency on the higher part of the result register
>>>> + __asm__ __volatile__
>>>> + (
>>>> + "lock; btrq %[bit], %[mem]\n\t"
>>>> + "setc %[res]\n\t"
>>>> + : [mem] "+m" (*ptr), [res] "+q" (res)
>>>> + : [bit] "Kq" (bit)
>>>> + : "memory"
>>>> + );
>>>> +#endif
>>>> + return res;
>>>> +}
>>>> +
>>>> +}
>>>> +
>>>> +#define SLEEPBITMAP_BTS(ptr, bit) atomic_bit_test_and_set64((uint64_t*)(ptr), (bit))
>>>> +#define SLEEPBITMAP_BTR(ptr, bit) atomic_bit_test_and_reset64((uint64_t*)(ptr), (bit))
>>>> +#define SLEEPBITMAP_BTS_VOID(ptr, bit) atomic_bit_test_and_set64_void((uint64_t*)(ptr), (bit))
>>>> +#define SLEEPBITMAP_BTR_VOID(ptr, bit) atomic_bit_test_and_reset64_void((uint64_t*)(ptr), (bit))
>>>>
>>>> #elif defined(_MSC_VER)
>>>>
>>>> #define SLEEPBITMAP_CTZ(id, x) _BitScanForward64(&id, x)
>>>> -#define SLEEPBITMAP_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)
>>>> -#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)
>>>> +#define SLEEPBITMAP_BTS(ptr, bit) (!!_interlockedbittestandset64((__int64*)(ptr), (bit)))
>>>> +#define SLEEPBITMAP_BTR(ptr, bit) (!!_interlockedbittestandreset64((__int64*)(ptr), (bit)))
>>>> +#define SLEEPBITMAP_BTS_VOID SLEEPBITMAP_BTS
>>>> +#define SLEEPBITMAP_BTR_VOID SLEEPBITMAP_BTR
>>>>
>>>> #endif // ifdef __GNUC__
>>>>
>>>> @@ -52,8 +133,10 @@
>>>>
>>>> /* use 32-bit primitives defined in threading.h */
>>>> #define SLEEPBITMAP_CTZ CTZ
>>>> -#define SLEEPBITMAP_OR ATOMIC_OR
>>>> -#define SLEEPBITMAP_AND ATOMIC_AND
>>>> +#define SLEEPBITMAP_BTS ATOMIC_BTS
>>>> +#define SLEEPBITMAP_BTR ATOMIC_BTR
>>>> +#define SLEEPBITMAP_BTS_VOID ATOMIC_BTS_VOID
>>>> +#define SLEEPBITMAP_BTR_VOID ATOMIC_BTR_VOID
>>>>
>>>> #endif
>>>>
>>>> @@ -123,12 +206,12 @@
>>>>
>>>> m_pool.setCurrentThreadAffinity();
>>>>
>>>> - sleepbitmap_t idBit = (sleepbitmap_t)1 << m_id;
>>>> + int idBit = m_id;
>>>> m_curJobProvider = m_pool.m_jpTable[0];
>>>> m_bondMaster = NULL;
>>>>
>>>> - SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
>>>> - SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
>>>> + SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
>>>> + SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
>>>> m_wakeEvent.wait();
>>>>
>>>> while (m_pool.m_isActive)
>>>> @@ -162,21 +245,21 @@
>>>> }
>>>> if (nextProvider != -1 && m_curJobProvider != m_pool.m_jpTable[nextProvider])
>>>> {
>>>> - SLEEPBITMAP_AND(&m_curJobProvider->m_ownerBitmap, ~idBit);
>>>> + SLEEPBITMAP_BTR_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
>>>> m_curJobProvider = m_pool.m_jpTable[nextProvider];
>>>> - SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
>>>> + SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);
>>>> }
>>>> }
>>>> while (m_curJobProvider->m_helpWanted);
>>>>
>>>> /* While the worker sleeps, a job-provider or bond-group may acquire this
>>>> - * worker's sleep bitmap bit. Once acquired, that thread may modify
>>>> + * worker's sleep bitmap bit. Once acquired, that thread may modify
>>>> * m_bondMaster or m_curJobProvider, then waken the thread */
>>>> - SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
>>>> + SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
>>>> m_wakeEvent.wait();
>>>> }
>>>>
>>>> - SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
>>>> + SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);
>>>> }
>>>>
>>>> void JobProvider::tryWakeOne()
>>>> @@ -191,10 +274,9 @@
>>>> WorkerThread& worker = m_pool->m_workers[id];
>>>> if (worker.m_curJobProvider != this) /* poaching */
>>>> {
>>>> - sleepbitmap_t bit = (sleepbitmap_t)1 << id;
>>>> - SLEEPBITMAP_AND(&worker.m_curJobProvider->m_ownerBitmap, ~bit);
>>>> + SLEEPBITMAP_BTR_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);
>>>> worker.m_curJobProvider = this;
>>>> - SLEEPBITMAP_OR(&worker.m_curJobProvider->m_ownerBitmap, bit);
>>>> + SLEEPBITMAP_BTS_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);
>>>> }
>>>> worker.awaken();
>>>> }
>>>> @@ -208,8 +290,7 @@
>>>> {
>>>> SLEEPBITMAP_CTZ(id, masked);
>>>>
>>>> - sleepbitmap_t bit = (sleepbitmap_t)1 << id;
>>>> - if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
>>>> + if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))
>>>> return (int)id;
>>>>
>>>> masked = m_sleepBitmap & firstTryBitmap;
>>>> @@ -220,8 +301,7 @@
>>>> {
>>>> SLEEPBITMAP_CTZ(id, masked);
>>>>
>>>> - sleepbitmap_t bit = (sleepbitmap_t)1 << id;
>>>> - if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
>>>> + if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))
>>>> return (int)id;
>>>>
>>>> masked = m_sleepBitmap & secondTryBitmap;
>>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/wavefront.cpp
>>>> --- a/source/common/wavefront.cpp Fri Dec 22 18:23:24 2017 +0530
>>>> +++ b/source/common/wavefront.cpp Mon Jan 01 15:26:23 2018 +0300
>>>> @@ -66,14 +66,14 @@
>>>>
>>>> void WaveFront::enqueueRow(int row)
>>>> {
>>>> - uint32_t bit = 1 << (row & 31);
>>>> - ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
>>>> + uint32_t bit = row & 31;
>>>> + ATOMIC_BTS_VOID(&m_internalDependencyBitmap[row >> 5], bit);
>>>> }
>>>>
>>>> void WaveFront::enableRow(int row)
>>>> {
>>>> - uint32_t bit = 1 << (row & 31);
>>>> - ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
>>>> + uint32_t bit = row & 31;
>>>> + ATOMIC_BTS_VOID(&m_externalDependencyBitmap[row >> 5], bit);
>>>> }
>>>>
>>>> void WaveFront::enableAllRows()
>>>> @@ -83,8 +83,8 @@
>>>>
>>>> bool WaveFront::dequeueRow(int row)
>>>> {
>>>> - uint32_t bit = 1 << (row & 31);
>>>> - return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
>>>> + uint32_t bit = row & 31;
>>>> + return ATOMIC_BTR(&m_internalDependencyBitmap[row >> 5], bit);
>>>> }
>>>>
>>>> void WaveFront::findJob(int threadId)
>>>> @@ -99,8 +99,7 @@
>>>> {
>>>> CTZ(id, oldval);
>>>>
>>>> - uint32_t bit = 1 << id;
>>>> - if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
>>>> + if (ATOMIC_BTR(&m_internalDependencyBitmap[w], id))
>>>> {
>>>> /* we cleared the bit, we get to process the row */
>>>> processRow(w * 32 + id, threadId);
>>>>
>>>
>>
>>
>>
>>
>
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel