[x265-commits] [x265] threading: use atomic increment when building semaphore o...

Steve Borho steve at borho.org
Wed Feb 26 18:34:00 CET 2014


details:   http://hg.videolan.org/x265/rev/948626475a46
branches:  
changeset: 6303:948626475a46
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 25 23:53:43 2014 -0600
description:
threading: use atomic increment when building semaphore object

Required moving atomic defines to the top of threading.h, removing the need
for potentially redundant includes of unistd.h
Subject: [x265] encoder: tweak slice stats to more closely match x264 outputs

details:   http://hg.videolan.org/x265/rev/46207f6f5c8c
branches:  
changeset: 6304:46207f6f5c8c
user:      Steve Borho <steve at borho.org>
date:      Wed Feb 26 00:43:46 2014 -0600
description:
encoder: tweak slice stats to more closely match x264 outputs
Subject: [x265] fix: uninitialized variable

details:   http://hg.videolan.org/x265/rev/483e699a9527
branches:  
changeset: 6305:483e699a9527
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Wed Feb 26 21:06:22 2014 +0900
description:
fix: uninitialized variable
Subject: [x265] all_angs_pred_32x32, asm code improvement

details:   http://hg.videolan.org/x265/rev/0b9c77b41599
branches:  
changeset: 6306:0b9c77b41599
user:      Praveen Tiwari
date:      Wed Feb 26 17:58:24 2014 +0530
description:
all_angs_pred_32x32, asm code improvement
Subject: [x265] rc: bug fix - store average QP as decided by AQ only if AQ is enabled

details:   http://hg.videolan.org/x265/rev/9b0c9b76d902
branches:  
changeset: 6307:9b0c9b76d902
user:      Santhoshini Sekar <santhoshini at multicorewareinc.com>
date:      Wed Feb 26 18:01:05 2014 +0530
description:
rc: bug fix - store average QP as decided by AQ only if AQ is enabled

diffstat:

 source/common/threading.cpp      |    2 +-
 source/common/threading.h        |  192 ++++++-----
 source/common/x86/intrapred8.asm |  622 ++++++++++++++------------------------
 source/encoder/encoder.cpp       |    8 +-
 source/encoder/frameencoder.cpp  |    3 +-
 source/encoder/slicetype.cpp     |    2 +
 6 files changed, 341 insertions(+), 488 deletions(-)

diffs (truncated from 1654 to 300 lines):

diff -r b47fc23c75df -r 9b0c9b76d902 source/common/threading.cpp
--- a/source/common/threading.cpp	Wed Feb 26 03:26:00 2014 +0530
+++ b/source/common/threading.cpp	Wed Feb 26 18:01:05 2014 +0530
@@ -65,7 +65,7 @@ Thread::~Thread()
 
 #else /* POSIX / pthreads */
 
-int Event::s_incr /* = 0 */;
+volatile int Event::s_incr /* = 0 */;
 
 static void *ThreadShim(void *opaque)
 {
diff -r b47fc23c75df -r 9b0c9b76d902 source/common/threading.h
--- a/source/common/threading.h	Wed Feb 26 03:26:00 2014 +0530
+++ b/source/common/threading.h	Wed Feb 26 18:01:05 2014 +0530
@@ -35,9 +35,102 @@
 #include <stdio.h>
 #include <errno.h>
 #include <fcntl.h>
+#endif
+
+#include <stdint.h>
+
+#if MACOS
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#endif
+
+#ifdef __GNUC__                         /* GCCs builtin atomics */
+
 #include <unistd.h>
+#include <limits.h>
+
+#define CTZ64(id, x)                        id = (unsigned long)__builtin_ctzll(x)
+#define ATOMIC_OR(ptr, mask)                __sync_or_and_fetch(ptr, mask)
+#define ATOMIC_CAS(ptr, oldval, newval)     __sync_val_compare_and_swap(ptr, oldval, newval)
+#define ATOMIC_CAS32(ptr, oldval, newval)   __sync_val_compare_and_swap(ptr, oldval, newval)
+#define ATOMIC_INC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, 1)
+#define ATOMIC_DEC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, -1)
+#define GIVE_UP_TIME()                      usleep(0)
+
+#elif defined(_MSC_VER)                 /* Windows atomic intrinsics */
+
+#include <intrin.h>
+
+#if !_WIN64
+inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ
+{
+    uint32_t high32 = (uint32_t)(x64 >> 32);
+    uint32_t low32 = (uint32_t)x64;
+
+    if (high32)
+    {
+        _BitScanReverse(id, high32);
+        *id += 32;
+        return 1;
+    }
+    else if (low32)
+        return _BitScanReverse(id, low32);
+    else
+        return *id = 0;
+}
+
+inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ
+{
+    uint32_t high32 = (uint32_t)(x64 >> 32);
+    uint32_t low32 = (uint32_t)x64;
+
+    if (high32)
+    {
+        _BitScanForward(id, high32);
+        *id += 32;
+        return 1;
+    }
+    else if (low32)
+        return _BitScanForward(id, low32);
+    else
+        return *id = 0;
+}
+
+#endif // if !_WIN64
+
+#if _WIN32_WINNT <= _WIN32_WINNT_WINXP
+/* Windows XP did not define this intrinsic */
+FORCEINLINE LONGLONG x265_interlocked_OR64(__inout LONGLONG volatile *Destination,
+                                           __in    LONGLONG           Value)
+{
+    LONGLONG Old;
+
+    do
+    {
+        Old = *Destination;
+    }
+    while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old);
+
+    return Old;
+}
+
+#define ATOMIC_OR(ptr, mask)            x265_interlocked_OR64((volatile LONG64*)ptr, mask)
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma intrinsic(_InterlockedCompareExchange64)
 #endif
-#include <stdint.h>
+#else // if _WIN32_WINNT <= _WIN32_WINNT_WINXP
+#define ATOMIC_OR(ptr, mask)            InterlockedOr64((volatile LONG64*)ptr, mask)
+#endif // if _WIN32_WINNT <= _WIN32_WINNT_WINXP
+
+#define CTZ64(id, x)                        _BitScanForward64(&id, x)
+#define ATOMIC_CAS(ptr, oldval, newval)     (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
+#define ATOMIC_CAS32(ptr, oldval, newval)   (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
+#define ATOMIC_INC(ptr)                     InterlockedIncrement((volatile LONG*)ptr)
+#define ATOMIC_DEC(ptr)                     InterlockedDecrement((volatile LONG*)ptr)
+#define GIVE_UP_TIME()                      Sleep(0)
+
+#endif // ifdef __GNUC__
+
 
 namespace x265 {
 // x265 private namespace
@@ -146,7 +239,8 @@ public:
         int pid = (int)getpid();
         do
         {
-            snprintf(name, sizeof(name), "/x265_%d_%d", pid, s_incr++);
+            int num = ATOMIC_INC(&s_incr);
+            snprintf(name, sizeof(name), "/x265_%d_%d", pid, num);
             this->semaphore = sem_open(name, O_CREAT | O_EXCL, 0777, 0);
         }
         while (this->semaphore == SEM_FAILED);
@@ -175,7 +269,7 @@ public:
 
 protected:
 
-    static int s_incr;
+    static volatile int s_incr;
     char name[64];
 
     /* the POSIX version uses a counting semaphore */
@@ -229,96 +323,4 @@ public:
 };
 } // end namespace x265
 
-#if MACOS
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#endif
-
-#ifdef __GNUC__                         /* GCCs builtin atomics */
-
-#include <unistd.h>
-#include <limits.h>
-
-#define CTZ64(id, x)                        id = (unsigned long)__builtin_ctzll(x)
-#define ATOMIC_OR(ptr, mask)                __sync_or_and_fetch(ptr, mask)
-#define ATOMIC_CAS(ptr, oldval, newval)     __sync_val_compare_and_swap(ptr, oldval, newval)
-#define ATOMIC_CAS32(ptr, oldval, newval)   __sync_val_compare_and_swap(ptr, oldval, newval)
-#define ATOMIC_INC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, 1)
-#define ATOMIC_DEC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, -1)
-#define GIVE_UP_TIME()                      usleep(0)
-
-#elif defined(_MSC_VER)                 /* Windows atomic intrinsics */
-
-#include <intrin.h>
-
-#if !_WIN64
-inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
-    uint32_t high32 = (uint32_t)(x64 >> 32);
-    uint32_t low32 = (uint32_t)x64;
-
-    if (high32)
-    {
-        _BitScanReverse(id, high32);
-        *id += 32;
-        return 1;
-    }
-    else if (low32)
-        return _BitScanReverse(id, low32);
-    else
-        return *id = 0;
-}
-
-inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ
-{
-    uint32_t high32 = (uint32_t)(x64 >> 32);
-    uint32_t low32 = (uint32_t)x64;
-
-    if (high32)
-    {
-        _BitScanForward(id, high32);
-        *id += 32;
-        return 1;
-    }
-    else if (low32)
-        return _BitScanForward(id, low32);
-    else
-        return *id = 0;
-}
-
-#endif // if !_WIN64
-
-#if _WIN32_WINNT <= _WIN32_WINNT_WINXP
-/* Windows XP did not define this intrinsic */
-FORCEINLINE LONGLONG x265_interlocked_OR64(__inout LONGLONG volatile *Destination,
-                                           __in    LONGLONG           Value)
-{
-    LONGLONG Old;
-
-    do
-    {
-        Old = *Destination;
-    }
-    while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old);
-
-    return Old;
-}
-
-#define ATOMIC_OR(ptr, mask)            x265_interlocked_OR64((volatile LONG64*)ptr, mask)
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#pragma intrinsic(_InterlockedCompareExchange64)
-#endif
-#else // if _WIN32_WINNT <= _WIN32_WINNT_WINXP
-#define ATOMIC_OR(ptr, mask)            InterlockedOr64((volatile LONG64*)ptr, mask)
-#endif // if _WIN32_WINNT <= _WIN32_WINNT_WINXP
-
-#define CTZ64(id, x)                        _BitScanForward64(&id, x)
-#define ATOMIC_CAS(ptr, oldval, newval)     (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
-#define ATOMIC_CAS32(ptr, oldval, newval)   (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
-#define ATOMIC_INC(ptr)                     InterlockedIncrement((volatile LONG*)ptr)
-#define ATOMIC_DEC(ptr)                     InterlockedDecrement((volatile LONG*)ptr)
-#define GIVE_UP_TIME()                      Sleep(0)
-
-#endif // ifdef __GNUC__
-
 #endif // ifndef X265_THREADING_H
diff -r b47fc23c75df -r 9b0c9b76d902 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Wed Feb 26 03:26:00 2014 +0530
+++ b/source/common/x86/intrapred8.asm	Wed Feb 26 18:01:05 2014 +0530
@@ -21028,20 +21028,17 @@ pinsrb        m6,    [r3 + 7], 0
 pmaddubsw     m3,    m6,        [r5 + 24 * 16]
 pmulhrsw      m3,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r4 + 6],  1
-pinsrb        m2,    [r4 + 5],  0
+pinsrw         m2,    [r4 + 5], 0
 pmaddubsw     m5,    m2,        [r5 + 24 * 16]
 pmulhrsw      m5,    m7
 packuswb      m3,    m5
 movu          [r0 + 782 * 16],  m3
 pslldq        m1,    2
-pinsrb        m1,    [r4 + 14], 1
-pinsrb        m1,    [r4 + 13], 0
+pinsrw        m1,    [r4 + 13], 0
 pmaddubsw     m3,    m1,        [r5 + 24 * 16]
 pmulhrsw      m3,    m7
 pslldq        m4,    2
-pinsrb        m4,    [r4 + 22], 1
-pinsrb        m4,    [r4 + 21], 0
+pinsrw        m4,    [r4 + 21], 0
 pmaddubsw     m5,    m4,        [r5 + 24 * 16]
 pmulhrsw      m5,    m7
 packuswb      m3,    m5
@@ -21242,15 +21239,13 @@ pinsrb        m7,    [r3 + 10], 0
 pmaddubsw     m3,    m7,        [r5 + 30 * 16]
 pmulhrsw      m3,    [pw_1024]
 pslldq        m2,    2
-pinsrb        m2,    [r4 + 5],  1
-pinsrb        m2,    [r4 + 4],  0
+pinsrw        m2,     [r4 + 4],  0
 pmaddubsw     m5,    m2,        [r5 + 30 * 16]
 pmulhrsw      m5,    [pw_1024]
 packuswb      m3,    m5
 movu          [r0 + 786 * 16],  m3
 pslldq        m1,    2
-pinsrb        m1,    [r4 + 13], 1
-pinsrb        m1,    [r4 + 12], 0
+pinsrw        m1,    [r4 + 12], 0
 pmaddubsw     m3,    m1,        [r5 + 30 * 16]
 pmulhrsw      m3,    [pw_1024]
 pslldq        m4,    2
@@ -21459,20 +21454,17 @@ pinsrb        m6,    [r3 + 18],  0
 pmaddubsw     m3,    m6,         [r5 + 30 * 16]
 pmulhrsw      m3,    [pw_1024]
 pslldq        m2,    2
-pinsrb        m2,    [r4 + 4],  1
-pinsrb        m2,    [r4 + 3],  0
+pinsrw        m2,     [r4 + 3],  0
 pmaddubsw     m5,    m2,        [r5 + 30 * 16]
 pmulhrsw      m5,    [pw_1024]
 packuswb      m3,    m5
 movu          [r0 + 738 * 16],  m3
 pslldq        m1,    2
-pinsrb        m1,    [r4 + 12], 1
-pinsrb        m1,    [r4 + 11], 0
+pinsrw        m1,    [r4 + 11], 0
 pmaddubsw     m3,    m1,        [r5 + 30 * 16]
 pmulhrsw      m3,    [pw_1024]
 pslldq        m4,    2
-pinsrb        m4,    [r4 + 20], 1
-pinsrb        m4,    [r4 + 19], 0


More information about the x265-commits mailing list