[x265-commits] [x265] Merge default into stable, prep for 1.6

Thu Mar 26 21:10:34 CET 2015

details:   http://hg.videolan.org/x265/rev/3d0f23cb0e58
branches:  stable
changeset: 9923:3d0f23cb0e58
user:      Steve Borho <steve at borho.org>
date:      Thu Mar 26 15:09:51 2015 -0500
description:
Merge default into stable, prep for 1.6

diffstat:

 doc/reST/api.rst                         |     10 +-
 doc/reST/cli.rst                         |    326 +-
 doc/reST/presets.rst                     |     17 +-
 doc/reST/threading.rst                   |    101 +-
 readme.rst                               |     14 +
 source/CMakeLists.txt                    |     73 +-
 source/cmake/FindNuma.cmake              |     43 +
 source/cmake/version.cmake               |     18 +-
 source/common/CMakeLists.txt             |      2 +-
 source/common/bitstream.cpp              |      8 +-
 source/common/common.cpp                 |      4 +
 source/common/common.h                   |      7 +-
 source/common/constants.cpp              |      3 +-
 source/common/constants.h                |      4 +-
 source/common/cudata.cpp                 |    623 +-
 source/common/cudata.h                   |     54 +-
 source/common/dct.cpp                    |     48 +-
 source/common/deblock.cpp                |      4 +-
 source/common/framedata.h                |      2 +
 source/common/intrapred.cpp              |     28 +
 source/common/ipfilter.cpp               |     45 +-
 source/common/lowres.cpp                 |     23 +-
 source/common/lowres.h                   |      3 +-
 source/common/mv.h                       |      9 +-
 source/common/param.cpp                  |    211 +-
 source/common/picyuv.cpp                 |      8 +-
 source/common/predict.cpp                |    343 +-
 source/common/predict.h                  |     60 +-
 source/common/primitives.cpp             |      3 +-
 source/common/primitives.h               |     23 +-
 source/common/quant.cpp                  |    104 +-
 source/common/quant.h                    |      4 +-
 source/common/scalinglist.cpp            |      2 +-
 source/common/shortyuv.cpp               |      6 +-
 source/common/slice.cpp                  |     12 +-
 source/common/slice.h                    |     19 +-
 source/common/threading.h                |     63 +-
 source/common/threadpool.cpp             |    708 +-
 source/common/threadpool.h               |    167 +-
 source/common/wavefront.cpp              |     13 +-
 source/common/wavefront.h                |     11 +-
 source/common/x86/asm-primitives.cpp     |    685 +-
 source/common/x86/blockcopy8.asm         |   1548 +-
 source/common/x86/blockcopy8.h           |     42 +
 source/common/x86/const-a.asm            |     21 +-
 source/common/x86/dct8.asm               |    362 +
 source/common/x86/dct8.h                 |      1 +
 source/common/x86/intrapred.h            |     59 +-
 source/common/x86/intrapred16.asm        |    645 +-
 source/common/x86/intrapred8.asm         |  27455 ++++------------------------
 source/common/x86/intrapred8_allangs.asm |  23008 ++++++++++++++++++++++++
 source/common/x86/ipfilter16.asm         |   2697 ++-
 source/common/x86/ipfilter8.asm          |  11394 +++++++++++-
 source/common/x86/ipfilter8.h            |     31 +-
 source/common/x86/mc-a.asm               |    635 +-
 source/common/x86/pixel-a.asm            |   1440 +-
 source/common/x86/pixel-util.h           |     14 +-
 source/common/x86/pixel-util8.asm        |    688 +-
 source/common/x86/pixel.h                |     46 +-
 source/common/x86/pixeladd8.asm          |    161 +
 source/common/x86/sad-a.asm              |    746 +
 source/common/x86/ssd-a.asm              |     39 +-
 source/encoder/analysis.cpp              |    959 +-
 source/encoder/analysis.h                |     48 +-
 source/encoder/api.cpp                   |      5 +-
 source/encoder/dpb.cpp                   |     17 +-
 source/encoder/dpb.h                     |      4 +-
 source/encoder/encoder.cpp               |    491 +-
 source/encoder/encoder.h                 |     10 +-
 source/encoder/entropy.cpp               |    378 +-
 source/encoder/entropy.h                 |      6 +-
 source/encoder/frameencoder.cpp          |    449 +-
 source/encoder/frameencoder.h            |     30 +-
 source/encoder/framefilter.cpp           |      7 +-
 source/encoder/level.cpp                 |     25 +-
 source/encoder/motion.cpp                |     69 +-
 source/encoder/motion.h                  |      1 +
 source/encoder/nal.cpp                   |      2 +-
 source/encoder/ratecontrol.cpp           |    304 +-
 source/encoder/ratecontrol.h             |    216 +-
 source/encoder/sao.cpp                   |      2 +
 source/encoder/search.cpp                |    942 +-
 source/encoder/search.h                  |    197 +-
 source/encoder/slicetype.cpp             |   1715 +-
 source/encoder/slicetype.h               |    250 +-
 source/encoder/weightPrediction.cpp      |     20 +-
 source/input/y4m.cpp                     |     58 +-
 source/output/y4m.cpp                    |      8 -
 source/output/yuv.cpp                    |      4 -
 source/profile/cpuEvents.h               |      3 +-
 source/test/CMakeLists.txt               |      3 +
 source/test/ipfilterharness.cpp          |     73 +-
 source/test/ipfilterharness.h            |      4 +-
 source/test/mbdstharness.cpp             |     64 +-
 source/test/pixelharness.cpp             |    104 +-
 source/test/pixelharness.h               |      1 +
 source/test/rate-control-tests.txt       |     16 +-
 source/test/regression-tests.txt         |     50 +-
 source/test/smoke-tests.txt              |     34 +-
 source/test/testbench.cpp                |      6 +
 source/test/testharness.h                |      2 +-
 source/x265.cpp                          |      6 +-
 source/x265.h                            |    612 +-
 source/x265cli.h                         |     31 +-
 104 files changed, 54213 insertions(+), 27956 deletions(-)

diffs (truncated from 91221 to 300 lines):

diff -r f6a15e605973 -r 3d0f23cb0e58 doc/reST/api.rst

--- a/doc/reST/api.rst	Mon Mar 23 19:16:40 2015 -0500
+++ b/doc/reST/api.rst	Thu Mar 26 15:09:51 2015 -0500
@@ -72,6 +72,8 @@ blocking and thus this would be less eff
 	process. All of the encoders must use the same maximum CTU size
 	because many global variables are configured based on this size.
 	Encoder allocation will fail if a mis-matched CTU size is attempted.
+	If no encoders are open, **x265_cleanup()** can be called to reset
+	the configured CTU size so a new size can be used.
 
 An encoder is allocated by calling **x265_encoder_open()**::
 
@@ -337,10 +339,12 @@ handle must be discarded::
 	void x265_encoder_close(x265_encoder *);
 
 When the application has completed all encodes, it should call
-**x265_cleanup()** to free process global resources like the thread pool;
-particularly if a memory-leak detection tool is being used::
+**x265_cleanup()** to free process global resources, particularly if a memory-leak
+detection tool is being used. **x265_cleanup()** also resets the saved
+CTU size so it will be possible to create a new encoder with a different
+CTU size::
 
 	/***
-	 * Release library static allocations
+	 * Release library static allocations, reset configured CTU size
 	 */
 	void x265_cleanup(void);
diff -r f6a15e605973 -r 3d0f23cb0e58 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Mon Mar 23 19:16:40 2015 -0500
+++ b/doc/reST/cli.rst	Thu Mar 26 15:09:51 2015 -0500
@@ -171,19 +171,54 @@ Performance Options
 	Over-allocation of frame threads will not improve performance, it
 	will generally just increase memory use.
 
-.. option:: --threads <integer>
+	**Values:** any value between 0 and 16. Default is 0, auto-detect
 
-	Number of threads to allocate for the worker thread pool  This pool
-	is used for WPP and for distributed analysis and motion search:
-	:option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
+.. option:: --pools <string>, --numa-pools <string>
 
-	If :option:`--threads` 1 is specified, then no thread pool is
-	created. When no thread pool is created, all the thread pool
-	features are implicitly disabled. If all the pool features are
-	disabled by the user, then the pool is implicitly disabled.
+	Comma separated list of threads per NUMA node. If "none", then no worker
+	pools are created and only frame parallelism is possible. If NULL or ""
+	(default) x265 will use all available threads on each NUMA node::
 
-	Default 0, one thread is allocated per detected hardware thread
-	(logical CPU cores)
+	'+'  is a special value indicating all cores detected on the node
+	'*'  is a special value indicating all cores detected on the node and all remaining nodes
+	'-'  is a special value indicating no cores on the node, same as '0'
+
+	example strings for a 4-node system::
+
+	""        - default, unspecified, all numa nodes are used for thread pools
+	"*"       - same as default
+	"none"    - no thread pools are created, only frame parallelism possible
+	"-"       - same as "none"
+	"10"      - allocate one pool, using up to 10 cores on node 0
+	"-,+"     - allocate one pool, using all cores on node 1
+	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
+	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
+	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
+
+	The total number of threads will be determined by the number of threads
+	assigned to all nodes. The worker threads will each be given affinity for
+	their node, they will not be allowed to migrate between nodes, but they
+	will be allowed to move between CPU cores within their node.
+
+	If the three pool features: :option:`--wpp` :option:`--pmode` and
+	:option:`--pme` are all disabled, then :option:`--pools` is ignored
+	and no thread pools are created.
+
+	If "none" is specified, then all three of the thread pool features are
+	implicitly disabled.
+
+	Multiple thread pools will be allocated for any NUMA node with more than
+	64 logical CPU cores. But any given thread pool will always use at most
+	one NUMA node.
+
+	Frame encoders are distributed between the available thread pools,
+	and the encoder will never generate more thread pools than
+	:option:`--frame-threads`.  The pools are used for WPP and for
+	distributed analysis and motion search.
+
+	Default "", one thread is allocated per detected hardware thread
+	(logical CPU cores) and one thread pool per NUMA node.
 
 .. option:: --wpp, --no-wpp
 
@@ -409,7 +444,17 @@ Profile, Level, Tier
 	If :option:`--level-idc` has been specified, the option adds the
 	intention to support the High tier of that level. If your specified
 	level does not support a High tier, a warning is issued and this
-	modifier flag is ignored.
+	modifier flag is ignored. If :option:`--level-idc` has been specified,
+	but not --high-tier, then the encoder will attempt to encode at the 
+	specified level, main tier first, turning on high tier only if 
+	necessary and available at that level.
+
+.. option:: --ref <1..16>
+
+	Max number of L0 references to be allowed. This number has a linear
+	multiplier effect on the amount of work performed in motion search,
+	but will generally have a beneficial effect on compression and
+	distortion. Default 3
 
 .. note::
 	:option:`--profile`, :option:`--level-idc`, and
@@ -444,7 +489,7 @@ Mode decision / Analysis
 	+-------+---------------------------------------------------------------+
 	| 3     | RDO mode and split decisions, chroma residual used for sa8d   |
 	+-------+---------------------------------------------------------------+
-	| 4     | Adds RDO Quant                                                |
+	| 4     | Currently same as 3                                           |
 	+-------+---------------------------------------------------------------+
 	| 5     | Adds RDO prediction decisions                                 |
 	+-------+---------------------------------------------------------------+
@@ -465,6 +510,23 @@ the prediction quad-tree.
 	and less frame parallelism as well. Because of this the faster
 	presets use a CU size of 32. Default: 64
 
+.. option:: --min-cu-size <64|32|16|8>
+
+	Minimum CU size (width and height). By using 16 or 32 the encoder
+	will not analyze the cost of CUs below that minimum threshold,
+	saving considerable amounts of compute with a predictable increase
+	in bitrate. This setting has a large effect on performance on the
+	faster presets.
+
+	Default: 8 (minimum 8x8 CU for HEVC, best compression efficiency)
+
+.. note::
+
+	All encoders within a single process must use the same settings for
+	the CU size range. :option:`--ctu` and :option:`--min-cu-size` must
+	be consistent for all of them since the encoder configures several
+	key global data structures based on this range.
+
 .. option:: --rect, --no-rect
 
 	Enable analysis of rectangular motion partitions Nx2N and 2NxN
@@ -494,14 +556,6 @@ the prediction quad-tree.
 	Measure full CU size (2Nx2N) merge candidates first; if no residual
 	is found the analysis is short circuited. Default disabled
 
-.. option:: --fast-cbf, --no-fast-cbf
-
-	Short circuit analysis if a prediction is found that does not set
-	the coded block flag (aka: no residual was encoded).  It prevents
-	the encoder from perhaps finding other predictions that also have no
-	residual but require less signaling bits or have less distortion.
-	Only applicable for RD levels 5 and 6. Default disabled
-
 .. option:: --fast-intra, --no-fast-intra
 
 	Perform an initial scan of every fifth intra angular mode, then
@@ -526,14 +580,6 @@ the prediction quad-tree.
 	Only effective at RD levels 3 and above, which perform RDO mode
 	decisions.
 
-.. option:: --tskip, --no-tskip
-
-	Enable evaluation of transform skip (bypass DCT but still use
-	quantization) coding for 4x4 TU coded blocks.
-
-	Only effective at RD levels 3 and above, which perform RDO mode
-	decisions. Default disabled
-
 .. option:: --tskip-fast, --no-tskip-fast
 
 	Only evaluate transform skip for NxN intra predictions (4x4 blocks).
@@ -567,6 +613,30 @@ not match.
 Options which affect the transform unit quad-tree, sometimes referred to
 as the residual quad-tree (RQT).
 
+.. option:: --rdoq-level <0|1|2>, --no-rdoq-level
+
+	Specify the amount of rate-distortion analysis to use within
+	quantization:
+
+	At level 0 rate-distortion cost is not considered in quant
+	
+	At level 1 rate-distortion cost is used to find optimal rounding
+	values for each level (and allows psy-rdoq to be effective). It
+	trades-off the signaling cost of the coefficient vs its post-inverse
+	quant distortion from the pre-quant coefficient. When
+	:option:`--psy-rdoq` is enabled, this formula is biased in favor of
+	more energy in the residual (larger coefficient absolute levels)
+	
+	At level 2 rate-distortion cost is used to make decimate decisions
+	on each 4x4 coding group, including the cost of signaling the group
+	within the group bitmap. If the total distortion of not signaling
+	the entire coding group is less than the rate cost, the block is
+	decimated. Next, it applies rate-distortion cost analysis to the
+	last non-zero coefficient, which can result in many (or all) of the
+	coding groups being decimated. Psy-rdoq is less effective at
+	preserving energy when RDOQ is at level 2, since it only has
+	influence over the level distortion costs.
+
 .. option:: --tu-intra-depth <1..4>
 
 	The transform unit (residual) quad-tree begins with the same depth
@@ -593,9 +663,76 @@ as the residual quad-tree (RQT).
 	partitions, in which case a TU split is implied and thus the
 	residual quad-tree begins one layer below the CU quad-tree.
 
+.. option:: --nr-intra <integer>, --nr-inter <integer>
+
+	Noise reduction - an adaptive deadzone applied after DCT
+	(subtracting from DCT coefficients), before quantization.  It does
+	no pixel-level filtering, doesn't cross DCT block boundaries, has no
+	overlap. The higher the strength value parameter, the more
+	aggressively it will reduce noise.
+
+	Enabling noise reduction will make outputs diverge between different
+	numbers of frame threads. Outputs will be deterministic but the
+	outputs of -F2 will no longer match the outputs of -F3, etc.
+
+	**Values:** any value in range of 0 to 2000. Default 0 (disabled).
+
+.. option:: --tskip, --no-tskip
+
+	Enable evaluation of transform skip (bypass DCT but still use
+	quantization) coding for 4x4 TU coded blocks.
+
+	Only effective at RD levels 3 and above, which perform RDO mode
+	decisions. Default disabled
+
+.. option:: --rdpenalty <0..2>
+
+	When set to 1, transform units of size 32x32 are given a 4x bit cost
+	penalty compared to smaller transform units, in intra coded CUs in P
+	or B slices.
+
+	When set to 2, transform units of size 32x32 are not even attempted,
+	unless otherwise required by the maximum recursion depth.  For this
+	option to be effective with 32x32 intra CUs,
+	:option:`--tu-intra-depth` must be at least 2.  For it to be
+	effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be
+	at least 3.
+
+	Note that in HEVC an intra transform unit (a block of the residual
+	quad-tree) is also a prediction unit, meaning that the intra
+	prediction signal is generated for each TU block, the residual
+	subtracted and then coded. The coding unit simply provides the
+	prediction modes that will be used when predicting all of the
+	transform units within the CU. This means that when you prevent
+	32x32 intra transform units, you are preventing 32x32 intra
+	predictions.
+
+	Default 0, disabled.
+
+	**Values:** 0:disabled 1:4x cost penalty 2:force splits
+
+.. option:: --max-tu-size <32|16|8|4>
+
+	Maximum TU size (width and height). The residual can be more
+	efficiently compressed by the DCT transform when the max TU size
+	is larger, but at the expense of more computation. Transform unit
+	quad-tree begins at the same depth of the coded tree unit, but if the
+	maximum TU size is smaller than the CU size then transform QT begins 
+	at the depth of the max-tu-size. Default: 32.
+
 Temporal / motion search options
 ================================
 
+.. option:: --max-merge <1..5>
+
+	Maximum number of neighbor (spatial and temporal) candidate blocks
+	that the encoder may consider for merging motion predictions. If a
+	merge candidate results in no residual, it is immediately selected
+	as a "skip".  Otherwise the merge candidates are tested as part of
+	motion estimation when searching for the least cost inter option.
+	The max candidate number is encoded in the SPS and determines the
+	bit cost of signaling merge CUs. Default 2
+
 .. option:: --me <integer|string>
 
 	Motion search method. Generally, the higher the number the harder
@@ -658,16 +795,6 @@ Temporal / motion search options
 
 	**Range of values:** an integer from 0 to 32768
 
-.. option:: --max-merge <1..5>
-
-	Maximum number of neighbor (spatial and temporal) candidate blocks
-	that the encoder may consider for merging motion predictions. If a
-	merge candidate results in no residual, it is immediately selected
-	as a "skip".  Otherwise the merge candidates are tested as part of
-	motion estimation when searching for the least cost inter option.
-	The max candidate number is encoded in the SPS and determines the
-	bit cost of signaling merge CUs. Default 2
-