[x265-commits] [x265] docs: cleanup restructured text for --pools

Tue Feb 24 22:45:51 CET 2015

details:   http://hg.videolan.org/x265/rev/7252c10278a1
branches:  
changeset: 9405:7252c10278a1
user:      Steve Borho <steve at borho.org>
date:      Mon Feb 23 23:11:49 2015 -0600
description:
docs: cleanup restructured text for --pools
Subject: [x265] entropy: nit - initialize m_pad

details:   http://hg.videolan.org/x265/rev/ba32f5d14a8b
branches:  
changeset: 9406:ba32f5d14a8b
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 24 12:45:56 2015 -0600
description:
entropy: nit - initialize m_pad

The variable is never used, but it looks conspicuous in a debugger when it is
left uninitialized.
Subject: [x265] threadpool: only mention NUMA node in pool message if multiple are present

details:   http://hg.videolan.org/x265/rev/282903f583a3
branches:  
changeset: 9407:282903f583a3
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 24 13:33:40 2015 -0600
description:
threadpool: only mention NUMA node in pool message if multiple are present

We don't want to confuse the large number of users with single-socket systems.
Subject: [x265] slicetype: fencIntra[] alignment is implied by 8x8 size of prediction[]

details:   http://hg.videolan.org/x265/rev/0d2ebfdc61e9
branches:  
changeset: 9408:0d2ebfdc61e9
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 24 13:36:32 2015 -0600
description:
slicetype: fencIntra[] alignment is implied by 8x8 size of prediction[]
Subject: [x265] slicetype: nit

details:   http://hg.videolan.org/x265/rev/0cea19ff76d2
branches:  
changeset: 9409:0cea19ff76d2
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 24 13:50:36 2015 -0600
description:
slicetype: nit
Subject: [x265] slicetype: fix I slice cost estimates

details:   http://hg.videolan.org/x265/rev/55713d567fc2
branches:  
changeset: 9410:55713d567fc2
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 24 13:50:29 2015 -0600
description:
slicetype: fix I slice cost estimates
Subject: [x265] slicetype: zero row-satds prior to cooperative (slice based) cost estimate

details:   http://hg.videolan.org/x265/rev/8ba297f59e48
branches:  
changeset: 9411:8ba297f59e48
user:      Steve Borho <steve at borho.org>
date:      Tue Feb 24 14:00:37 2015 -0600
description:
slicetype: zero row-satds prior to cooperative (slice based) cost estimate

diffstat:

 doc/reST/cli.rst             |  30 +++++++++++++++---------------
 source/common/threadpool.cpp |   2 +-
 source/encoder/entropy.cpp   |   1 +
 source/encoder/slicetype.cpp |  26 +++++++++++++++++++++-----
 4 files changed, 38 insertions(+), 21 deletions(-)

diffs (141 lines):

diff -r 8986b6e427b2 -r 8ba297f59e48 doc/reST/cli.rst

--- a/doc/reST/cli.rst	Mon Feb 23 22:42:02 2015 -0600
+++ b/doc/reST/cli.rst	Tue Feb 24 14:00:37 2015 -0600
@@ -184,16 +184,17 @@ Performance Options
 	'-'  is a special value indicating no cores on the node, same as '0'
 
 	example strings for a 4-node system::
-		""        - default, unspecified, all numa nodes are used for thread pools
-		"*"       - same as default
-		"none"    - no thread pools are created, only frame parallelism possible
-		"-"       - same as "none"
-		"10"      - allocate one pool, using up to 10 cores on node 0
-		"-,+"     - allocate one pool, using all cores on node 1
-		"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-		"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-		"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
-		"8,8,8,8" - allocate four pools with up to 8 threads in each pool
+
+	""        - default, unspecified, all numa nodes are used for thread pools
+	"*"       - same as default
+	"none"    - no thread pools are created, only frame parallelism possible
+	"-"       - same as "none"
+	"10"      - allocate one pool, using up to 10 cores on node 0
+	"-,+"     - allocate one pool, using all cores on node 1
+	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
+	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
+	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
 
 	The total number of threads will be determined by the number of threads
 	assigned to all nodes. The worker threads will each be given affinity for
@@ -211,11 +212,10 @@ Performance Options
 	64 logical CPU cores. But any given thread pool will always use at most
 	one NUMA node.
 
-	Frame encoders are distributed between the available thread pools, and
-	the encoder will never generate more thread pools than frameNumThreads
-
-	Number of threads to allocate for the worker thread pool  This pool
-	is used for WPP and for distributed analysis and motion search:
+	Frame encoders are distributed between the available thread pools,
+	and the encoder will never generate more thread pools than
+	:option:`--frame-threads`.  The pools are used for WPP and for
+	distributed analysis and motion search.
 
 	Default "", one thread is allocated per detected hardware thread
 	(logical CPU cores) and one thread pool per NUMA node.
diff -r 8986b6e427b2 -r 8ba297f59e48 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Mon Feb 23 22:42:02 2015 -0600
+++ b/source/common/threadpool.cpp	Tue Feb 24 14:00:37 2015 -0600
@@ -321,7 +321,7 @@ ThreadPool* ThreadPool::allocThreadPools
                 numPools = 0;
                 return NULL;
             }
-            if (bNumaSupport)
+            if (numNumaNodes > 1)
                 x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
             else
                 x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
diff -r 8986b6e427b2 -r 8ba297f59e48 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Mon Feb 23 22:42:02 2015 -0600
+++ b/source/encoder/entropy.cpp	Tue Feb 24 14:00:37 2015 -0600
@@ -43,6 +43,7 @@ Entropy::Entropy()
 {
     markValid();
     m_fracBits = 0;
+    m_pad = 0;
     X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n");
 }
 
diff -r 8986b6e427b2 -r 8ba297f59e48 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Feb 23 22:42:02 2015 -0600
+++ b/source/encoder/slicetype.cpp	Tue Feb 24 14:00:37 2015 -0600
@@ -210,7 +210,7 @@ void LookaheadTLD::calcAdaptiveQuantFram
 void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
 {
     ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-    ALIGN_VAR_32(pixel, fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+    pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
     pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
 
     const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
@@ -224,6 +224,9 @@ void LookaheadTLD::lowresIntraEstimate(L
     pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
     pixelcmp_t satd = primitives.pu[sizeIdx].satd;
 
+    fenc.costEst[0][0] = 0;
+    fenc.costEstAq[0][0] = 0;
+
     for (int cuY = 0; cuY < heightInCU; cuY++)
     {
         fenc.rowSatds[0][0][cuY] = 0;
@@ -239,7 +242,7 @@ void LookaheadTLD::lowresIntraEstimate(L
 
             memcpy(neighbours[0], pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
             for (int i = 1; i < cuSize + 1; i++)
-                neighbours[0][i + cuSize2] = pixCur[-1 - fenc.lumaStride + i * fenc.lumaStride]; /* todo: fixme */
+                neighbours[0][i + cuSize2] = pixCur[-1 - fenc.lumaStride + i * fenc.lumaStride]; /* TODO: gcc warning */
 
             for (int i = 0; i < cuSize; i++)
             {
@@ -264,7 +267,7 @@ void LookaheadTLD::lowresIntraEstimate(L
             uint32_t ilowmode = 0;
 
             /* DC and planar */
-            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
+            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, cuSize <= 16);
             cost = satd(fencIntra, cuSize, prediction, cuSize);
             COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
 
@@ -306,8 +309,20 @@ void LookaheadTLD::lowresIntraEstimate(L
             fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
             fenc.intraCost[cuXY] = icost;
             fenc.intraMode[cuXY] = (uint8_t)ilowmode;
-            fenc.rowSatds[0][0][cuY] += icost;
-            fenc.costEst[0][0] += icost;
+
+            /* do not include edge blocks in the frame cost estimates, they are not very accurate */
+            const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
+                                        cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
+
+            int icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] + 128) >> 8) : icost;
+
+            if (bFrameScoreCU)
+            {
+                fenc.costEst[0][0] += icost;
+                fenc.costEstAq[0][0] += icostAq;
+            }
+
+            fenc.rowSatds[0][0][cuY] += icostAq;
         }
     }
 }
@@ -1888,6 +1903,7 @@ int64_t CostEstimateGroup::estimateFrame
              * going to need motion searches or bidir measurements */
 
             memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);
+            memset(fenc->rowSatds, 0, sizeof(fenc->rowSatds[0]) * m_lookahead.m_heightInCU);
 
             m_lock.acquire();
             X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");