<div style="line-height:1.7;color:#000000;font-size:14px;font-family:Arial"><pre><br>At 2018-01-11 00:06:29, "Andrey Semashev" <andrey.semashev@gmail.com> wrote:

>On 01/10/18 18:53, chen wrote:

>> Hi Andrey,

>> 

>> Our code rule prohibit inline assembly, especially the patch used GCC 

>> extension syntax.

>

>Ok, I see.

>

>> the "lock" prefix will lock the CPU bus, it will be greater penalty on 

>> the multi-core system.

>

>Just for the record, the lock prefix is implemented much more 

>efficiently nowdays and involves CPU cache management rather bus 

>locking. It used to lock the memory bus on early CPUs (I want to say 

>before Pentium, but I'm not sure which exact architecture changed this). 

>In any case, the patch does not introduce new lock instructions but it 

>replaces "lock; cmpxchg" loops that are normally generated for the 

<div>>atomic AND and OR operations with a single instruction.</div><div><br></div><pre>https://htor.inf.ethz.ch/publications/img/atomic-bench.pdf</pre><div>In this paper, the author explain toat lock (SWP) just performance drop</div><div>a little in modern CPUs, but they just try less cores system (Xeon Phi</div><div>have more lost and it is single socket CPU), on multi-socket system,</div><div>the cache coherency maintenance will be very expensive.</div><pre>However, the intrinsic may get more benefit from compiler, it may decide</pre><div>which method is best choice on target platform.</div><div><br></div>>

>> At 2018-01-10 23:30:06, "Andrey Semashev" <andrey.semashev@gmail.com <mailto:andrey.semashev@gmail.com>> wrote:

>>>Any feedback on this one?

>>>

>>>I've been using it for quite some time locally. It does seem to work 

>>>slightly faster on my Sandy Bridge machine (it should be a few percents 

>>>of gain in fps, although I didn't save the benchmark numbers).

>>>

>>>On 01/01/18 15:28, Andrey Semashev wrote:

>>>> # HG changeset patch

>>>> # User Andrey Semashev <andrey.semashev@gmail.com <mailto:andrey.semashev@gmail.com>>

>>>> # Date 1514809583 -10800

>>>> #      Mon Jan 01 15:26:23 2018 +0300

>>>> # Branch atomic_bit_opsv2

>>>> # Node ID 81529b6bd6adc8eb31162daeee44399dc1f95999

>>>> # Parent  ff02513b92c000c3bb3dcc51deb79af57f5358d5

>>>> Use atomic bit test and set/reset operations on x86.

>>>> 

>>>> The 'lock bts/btr' instructions are potentially more efficient than the

>>>> 'lock cmpxchg' loops which are emitted to implement ATOMIC_AND and ATOMIC_OR

>>>> on x86. The commit adds new macros ATOMIC_BTS and ATOMIC_BTR which atomically

>>>> set/reset the specified bit in the integer and return the previous value of

>>>> the modified bit.

>>>> 

>>>> Since in many places of the code the result is not needed, two more macros are

>>>> provided as well: ATOMIC_BTS_VOID and ATOMIC_BTR_VOID. The effect of these

>>>> macros is the same except that they don't return the previous value. These

>>>> macros may generate a slightly more efficient code.

>>>> 

>>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/threading.h

>>>> --- a/source/common/threading.h        Fri Dec 22 18:23:24 2017 +0530

>>>> +++ b/source/common/threading.h        Mon Jan 01 15:26:23 2018 +0300

>>>> @@ -80,6 +80,91 @@

>>>>   #define ATOMIC_ADD(ptr, val)  __sync_fetch_and_add((volatile int32_t*)ptr, val)

>>>>   #define GIVE_UP_TIME()        usleep(0)

>>>>   

>>>> +#if defined(__x86_64__) || defined(__i386__)

>>>> +

>>>> +namespace X265_NS {

>>>> +

>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_set_void(uint32_t* ptr, uint32_t bit)

>>>> +{

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btsl %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +}

>>>> +

>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_reset_void(uint32_t* ptr, uint32_t bit)

>>>> +{

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btrl %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +}

>>>> +

>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_set(uint32_t* ptr, uint32_t bit)

>>>> +{

>>>> +    bool res;

>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btsl %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#else

>>>> +    res = false; // to avoid false dependency on the higher part of the result register

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btsl %[bit], %[mem]\n\t"

>>>> +        "setc %[res]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#endif

>>>> +    return res;

>>>> +}

>>>> +

>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_reset(uint32_t* ptr, uint32_t bit)

>>>> +{

>>>> +    bool res;

>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btrl %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#else

>>>> +    res = false; // to avoid false dependency on the higher part of the result register

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btrl %[bit], %[mem]\n\t"

>>>> +        "setc %[res]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#endif

>>>> +    return res;

>>>> +}

>>>> +

>>>> +}

>>>> +

>>>> +#define ATOMIC_BTS_VOID(ptr, bit)  atomic_bit_test_and_set_void((uint32_t*)(ptr), (bit))

>>>> +#define ATOMIC_BTR_VOID(ptr, bit)  atomic_bit_test_and_reset_void((uint32_t*)(ptr), (bit))

>>>> +#define ATOMIC_BTS(ptr, bit)  atomic_bit_test_and_set((uint32_t*)(ptr), (bit))

>>>> +#define ATOMIC_BTR(ptr, bit)  atomic_bit_test_and_reset((uint32_t*)(ptr), (bit))

>>>> +

>>>> +#endif // defined(__x86_64__) || defined(__i386__)

>>>> +

>>>>   #elif defined(_MSC_VER)       /* Windows atomic intrinsics */

>>>>   

>>>>   #include <intrin.h>

>>>> @@ -93,8 +178,26 @@

>>>>   #define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)

>>>>   #define GIVE_UP_TIME()        Sleep(0)

>>>>   

>>>> +#if defined(_M_IX86) || defined(_M_X64)

>>>> +#define ATOMIC_BTS(ptr, bit)  (!!_interlockedbittestandset((long*)(ptr), (bit)))

>>>> +#define ATOMIC_BTR(ptr, bit)  (!!_interlockedbittestandreset((long*)(ptr), (bit)))

>>>> +#endif // defined(_M_IX86) || defined(_M_X64)

>>>> +

>>>>   #endif // ifdef __GNUC__

>>>>   

>>>> +#if !defined(ATOMIC_BTS)

>>>> +#define ATOMIC_BTS(ptr, bit)  (!!(ATOMIC_OR(ptr, (1u << (bit)))) & (1u << (bit)))

>>>> +#endif

>>>> +#if !defined(ATOMIC_BTR)

>>>> +#define ATOMIC_BTR(ptr, bit)  (!!(ATOMIC_AND(ptr, ~(1u << (bit)))) & (1u << (bit)))

>>>> +#endif

>>>> +#if !defined(ATOMIC_BTS_VOID)

>>>> +#define ATOMIC_BTS_VOID ATOMIC_BTS

>>>> +#endif

>>>> +#if !defined(ATOMIC_BTR_VOID)

>>>> +#define ATOMIC_BTR_VOID ATOMIC_BTR

>>>> +#endif

>>>> +

>>>>   namespace X265_NS {

>>>>   // x265 private namespace

>>>>   

>>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/threadpool.cpp

>>>> --- a/source/common/threadpool.cpp     Fri Dec 22 18:23:24 2017 +0530

>>>> +++ b/source/common/threadpool.cpp     Mon Jan 01 15:26:23 2018 +0300

>>>> @@ -37,14 +37,95 @@

>>>>   #ifdef __GNUC__

>>>>   

>>>>   #define SLEEPBITMAP_CTZ(id, x)     id = (unsigned long)__builtin_ctzll(x)

>>>> -#define SLEEPBITMAP_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)

>>>> -#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)

>>>> +

>>>> +namespace X265_NS {

>>>> +

>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_set64_void(uint64_t* ptr, uint64_t bit)

>>>> +{

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btsq %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +}

>>>> +

>>>> +inline __attribute__((always_inline)) void atomic_bit_test_and_reset64_void(uint64_t* ptr, uint64_t bit)

>>>> +{

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btrq %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +}

>>>> +

>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_set64(uint64_t* ptr, uint64_t bit)

>>>> +{

>>>> +    bool res;

>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btsq %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#else

>>>> +    res = false; // to avoid false dependency on the higher part of the result register

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btsq %[bit], %[mem]\n\t"

>>>> +        "setc %[res]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#endif

>>>> +    return res;

>>>> +}

>>>> +

>>>> +inline __attribute__((always_inline)) bool atomic_bit_test_and_reset64(uint64_t* ptr, uint64_t bit)

>>>> +{

>>>> +    bool res;

>>>> +#if defined(__GCC_ASM_FLAG_OUTPUTS__)

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btrq %[bit], %[mem]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "=@ccc" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#else

>>>> +    res = false; // to avoid false dependency on the higher part of the result register

>>>> +    __asm__ __volatile__

>>>> +    (

>>>> +        "lock; btrq %[bit], %[mem]\n\t"

>>>> +        "setc %[res]\n\t"

>>>> +        : [mem] "+m" (*ptr), [res] "+q" (res)

>>>> +        : [bit] "Kq" (bit)

>>>> +        : "memory"

>>>> +    );

>>>> +#endif

>>>> +    return res;

>>>> +}

>>>> +

>>>> +}

>>>> +

>>>> +#define SLEEPBITMAP_BTS(ptr, bit)  atomic_bit_test_and_set64((uint64_t*)(ptr), (bit))

>>>> +#define SLEEPBITMAP_BTR(ptr, bit)  atomic_bit_test_and_reset64((uint64_t*)(ptr), (bit))

>>>> +#define SLEEPBITMAP_BTS_VOID(ptr, bit)  atomic_bit_test_and_set64_void((uint64_t*)(ptr), (bit))

>>>> +#define SLEEPBITMAP_BTR_VOID(ptr, bit)  atomic_bit_test_and_reset64_void((uint64_t*)(ptr), (bit))

>>>>   

>>>>   #elif defined(_MSC_VER)

>>>>   

>>>>   #define SLEEPBITMAP_CTZ(id, x)     _BitScanForward64(&id, x)

>>>> -#define SLEEPBITMAP_OR(ptr, mask)  InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)

>>>> -#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)

>>>> +#define SLEEPBITMAP_BTS(ptr, bit)  (!!_interlockedbittestandset64((__int64*)(ptr), (bit)))

>>>> +#define SLEEPBITMAP_BTR(ptr, bit)  (!!_interlockedbittestandreset64((__int64*)(ptr), (bit)))

>>>> +#define SLEEPBITMAP_BTS_VOID SLEEPBITMAP_BTS

>>>> +#define SLEEPBITMAP_BTR_VOID SLEEPBITMAP_BTR

>>>>   

>>>>   #endif // ifdef __GNUC__

>>>>   

>>>> @@ -52,8 +133,10 @@

>>>>   

>>>>   /* use 32-bit primitives defined in threading.h */

>>>>   #define SLEEPBITMAP_CTZ CTZ

>>>> -#define SLEEPBITMAP_OR  ATOMIC_OR

>>>> -#define SLEEPBITMAP_AND ATOMIC_AND

>>>> +#define SLEEPBITMAP_BTS ATOMIC_BTS

>>>> +#define SLEEPBITMAP_BTR ATOMIC_BTR

>>>> +#define SLEEPBITMAP_BTS_VOID ATOMIC_BTS_VOID

>>>> +#define SLEEPBITMAP_BTR_VOID ATOMIC_BTR_VOID

>>>>   

>>>>   #endif

>>>>   

>>>> @@ -123,12 +206,12 @@

>>>>   

>>>>       m_pool.setCurrentThreadAffinity();

>>>>   

>>>> -    sleepbitmap_t idBit = (sleepbitmap_t)1 << m_id;

>>>> +    int idBit = m_id;

>>>>       m_curJobProvider = m_pool.m_jpTable[0];

>>>>       m_bondMaster = NULL;

>>>>   

>>>> -    SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);

>>>> -    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);

>>>> +    SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);

>>>> +    SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);

>>>>       m_wakeEvent.wait();

>>>>   

>>>>       while (m_pool.m_isActive)

>>>> @@ -162,21 +245,21 @@

>>>>               }

>>>>               if (nextProvider != -1 && m_curJobProvider != m_pool.m_jpTable[nextProvider])

>>>>               {

>>>> -                SLEEPBITMAP_AND(&m_curJobProvider->m_ownerBitmap, ~idBit);

>>>> +                SLEEPBITMAP_BTR_VOID(&m_curJobProvider->m_ownerBitmap, idBit);

>>>>                   m_curJobProvider = m_pool.m_jpTable[nextProvider];

>>>> -                SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);

>>>> +                SLEEPBITMAP_BTS_VOID(&m_curJobProvider->m_ownerBitmap, idBit);

>>>>               }

>>>>           }

>>>>           while (m_curJobProvider->m_helpWanted);

>>>>   

>>>>           /* While the worker sleeps, a job-provider or bond-group may acquire this

>>>> -         * worker's sleep bitmap bit. Once acquired, that thread may modify

>>>> +         * worker's sleep bitmap bit. Once acquired, that thread may modify

>>>>            * m_bondMaster or m_curJobProvider, then waken the thread */

>>>> -        SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);

>>>> +        SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);

>>>>           m_wakeEvent.wait();

>>>>       }

>>>>   

>>>> -    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);

>>>> +    SLEEPBITMAP_BTS_VOID(&m_pool.m_sleepBitmap, idBit);

>>>>   }

>>>>   

>>>>   void JobProvider::tryWakeOne()

>>>> @@ -191,10 +274,9 @@

>>>>       WorkerThread& worker = m_pool->m_workers[id];

>>>>       if (worker.m_curJobProvider != this) /* poaching */

>>>>       {

>>>> -        sleepbitmap_t bit = (sleepbitmap_t)1 << id;

>>>> -        SLEEPBITMAP_AND(&worker.m_curJobProvider->m_ownerBitmap, ~bit);

>>>> +        SLEEPBITMAP_BTR_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);

>>>>           worker.m_curJobProvider = this;

>>>> -        SLEEPBITMAP_OR(&worker.m_curJobProvider->m_ownerBitmap, bit);

>>>> +        SLEEPBITMAP_BTS_VOID(&worker.m_curJobProvider->m_ownerBitmap, id);

>>>>       }

>>>>       worker.awaken();

>>>>   }

>>>> @@ -208,8 +290,7 @@

>>>>       {

>>>>           SLEEPBITMAP_CTZ(id, masked);

>>>>   

>>>> -        sleepbitmap_t bit = (sleepbitmap_t)1 << id;

>>>> -        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)

>>>> +        if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))

>>>>               return (int)id;

>>>>   

>>>>           masked = m_sleepBitmap & firstTryBitmap;

>>>> @@ -220,8 +301,7 @@

>>>>       {

>>>>           SLEEPBITMAP_CTZ(id, masked);

>>>>   

>>>> -        sleepbitmap_t bit = (sleepbitmap_t)1 << id;

>>>> -        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)

>>>> +        if (SLEEPBITMAP_BTR(&m_sleepBitmap, id))

>>>>               return (int)id;

>>>>   

>>>>           masked = m_sleepBitmap & secondTryBitmap;

>>>> diff -r ff02513b92c0 -r 81529b6bd6ad source/common/wavefront.cpp

>>>> --- a/source/common/wavefront.cpp      Fri Dec 22 18:23:24 2017 +0530

>>>> +++ b/source/common/wavefront.cpp      Mon Jan 01 15:26:23 2018 +0300

>>>> @@ -66,14 +66,14 @@

>>>>   

>>>>   void WaveFront::enqueueRow(int row)

>>>>   {

>>>> -    uint32_t bit = 1 << (row & 31);

>>>> -    ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);

>>>> +    uint32_t bit = row & 31;

>>>> +    ATOMIC_BTS_VOID(&m_internalDependencyBitmap[row >> 5], bit);

>>>>   }

>>>>   

>>>>   void WaveFront::enableRow(int row)

>>>>   {

>>>> -    uint32_t bit = 1 << (row & 31);

>>>> -    ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);

>>>> +    uint32_t bit = row & 31;

>>>> +    ATOMIC_BTS_VOID(&m_externalDependencyBitmap[row >> 5], bit);

>>>>   }

>>>>   

>>>>   void WaveFront::enableAllRows()

>>>> @@ -83,8 +83,8 @@

>>>>   

>>>>   bool WaveFront::dequeueRow(int row)

>>>>   {

>>>> -    uint32_t bit = 1 << (row & 31);

>>>> -    return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);

>>>> +    uint32_t bit = row & 31;

>>>> +    return ATOMIC_BTR(&m_internalDependencyBitmap[row >> 5], bit);

>>>>   }

>>>>   

>>>>   void WaveFront::findJob(int threadId)

>>>> @@ -99,8 +99,7 @@

>>>>           {

>>>>               CTZ(id, oldval);

>>>>   

>>>> -            uint32_t bit = 1 << id;

>>>> -            if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)

>>>> +            if (ATOMIC_BTR(&m_internalDependencyBitmap[w], id))

>>>>               {

>>>>                   /* we cleared the bit, we get to process the row */

>>>>                   processRow(w * 32 + id, threadId);

>>>> 

>>>

>>>_______________________________________________

>>>x265-devel mailing list

>>>x265-devel@videolan.org <mailto:x265-devel@videolan.org>

>>>https://mailman.videolan.org/listinfo/x265-devel

>> 

>> 

>> 

>> _______________________________________________

>> x265-devel mailing list

>> x265-devel@videolan.org

>> https://mailman.videolan.org/listinfo/x265-devel

>> 

>

>_______________________________________________

>x265-devel mailing list

>x265-devel@videolan.org

>https://mailman.videolan.org/listinfo/x265-devel

</pre></div>