[x265-commits] [x265] analysis: avoid motion references not used by split block...

Wed Oct 7 18:47:38 CEST 2015

details:   http://hg.videolan.org/x265/rev/e1269ce2d71a
branches:  stable
changeset: 11032:e1269ce2d71a
user:      Ashok Kumar Mishra <ashok at multicorewareinc.com>
date:      Tue Sep 29 16:40:30 2015 +0530
description:
analysis: avoid motion references not used by split blocks in pme mode
Subject: [x265] Fix: Provide width and height of input file to dither, param may have padded that.

details:   http://hg.videolan.org/x265/rev/98ac099a766f
branches:  stable
changeset: 11033:98ac099a766f
user:      Sagar Kotecha <sagar at multicorewareinc.com>
date:      Tue Sep 29 14:14:24 2015 +0530
description:
Fix: Provide width and height of input file to dither, param may have padded that.

Fixes Issue #195
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/350c21c514ff
branches:  
changeset: 11034:350c21c514ff
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Wed Sep 30 13:44:31 2015 +0530
description:
Merge with stable
Subject: [x265] remove unnecessary function argument

details:   http://hg.videolan.org/x265/rev/6e7761bdfe23
branches:  
changeset: 11035:6e7761bdfe23
user:      Sagar Kotecha <sagar at multicorewareinc.com>
date:      Wed Sep 30 14:57:15 2015 +0530
description:
remove unnecessary function argument
Subject: [x265] cleanup: align NR buffer for asm, rearrange member variables to avoid padding

details:   http://hg.videolan.org/x265/rev/f8b8ebdc5457
branches:  
changeset: 11036:f8b8ebdc5457
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon Sep 28 14:34:41 2015 +0530
description:
cleanup: align NR buffer for asm, rearrange member variables to avoid padding

diffstat:

 doc/reST/cli.rst                         |     72 +-
 source/CMakeLists.txt                    |     30 +-
 source/common/bitstream.h                |      1 +
 source/common/common.h                   |      4 +
 source/common/constants.cpp              |      1 +
 source/common/constants.h                |      1 +
 source/common/contexts.h                 |      1 +
 source/common/cudata.cpp                 |     11 +-
 source/common/cudata.h                   |      1 -
 source/common/dct.cpp                    |      5 +-
 source/common/deblock.cpp                |      9 +-
 source/common/deblock.h                  |      1 +
 source/common/frame.cpp                  |     29 +-
 source/common/frame.h                    |      4 +-
 source/common/framedata.cpp              |      8 +-
 source/common/framedata.h                |     11 +-
 source/common/ipfilter.cpp               |      1 +
 source/common/loopfilter.cpp             |      1 +
 source/common/param.cpp                  |      8 +-
 source/common/picyuv.cpp                 |     71 +-
 source/common/picyuv.h                   |      5 +-
 source/common/pixel.cpp                  |     26 +
 source/common/predict.cpp                |     43 +-
 source/common/predict.h                  |      1 +
 source/common/primitives.h               |      2 +
 source/common/quant.cpp                  |    182 +-
 source/common/quant.h                    |     22 +-
 source/common/slice.cpp                  |     12 +-
 source/common/slice.h                    |    106 +-
 source/common/threading.h                |      1 +
 source/common/threadpool.cpp             |    128 +-
 source/common/threadpool.h               |      9 +-
 source/common/version.cpp                |      1 +
 source/common/wavefront.cpp              |      1 +
 source/common/wavefront.h                |      1 +
 source/common/x86/asm-primitives.cpp     |     93 +-
 source/common/x86/blockcopy8.asm         |      1 +
 source/common/x86/blockcopy8.h           |      1 +
 source/common/x86/const-a.asm            |      6 +
 source/common/x86/cpu-a.asm              |      1 +
 source/common/x86/dct8.h                 |      1 +
 source/common/x86/intrapred8.asm         |  11358 ++++++++++++++++++++--------
 source/common/x86/intrapred8_allangs.asm |    333 +-
 source/common/x86/ipfilter8.asm          |    559 +-
 source/common/x86/loopfilter.h           |      1 +
 source/common/x86/mc-a.asm               |     32 +
 source/common/x86/mc-a2.asm              |      2 +
 source/common/x86/pixel-a.asm            |     79 +
 source/common/x86/pixel-util.h           |      1 +
 source/common/x86/pixel-util8.asm        |     79 +-
 source/common/x86/pixel.h                |      3 +
 source/common/x86/pixeladd8.asm          |      1 +
 source/common/x86/sad-a.asm              |   1568 ++++
 source/common/x86/ssd-a.asm              |     16 +-
 source/common/x86/x86util.asm            |      1 +
 source/common/yuv.cpp                    |      1 +
 source/encoder/analysis.cpp              |    224 +-
 source/encoder/analysis.h                |      5 +-
 source/encoder/api.cpp                   |      5 +-
 source/encoder/bitcost.cpp               |      1 +
 source/encoder/dpb.cpp                   |     20 +-
 source/encoder/dpb.h                     |      4 +-
 source/encoder/encoder.cpp               |    180 +-
 source/encoder/encoder.h                 |     64 +-
 source/encoder/entropy.cpp               |      1 +
 source/encoder/entropy.h                 |      1 +
 source/encoder/frameencoder.cpp          |    140 +-
 source/encoder/framefilter.cpp           |      6 +-
 source/encoder/level.cpp                 |      1 +
 source/encoder/motion.cpp                |      1 +
 source/encoder/motion.h                  |      1 +
 source/encoder/nal.cpp                   |      1 +
 source/encoder/ratecontrol.cpp           |     20 +-
 source/encoder/rdcost.h                  |      1 +
 source/encoder/sao.cpp                   |     30 +-
 source/encoder/search.cpp                |    117 +-
 source/encoder/search.h                  |     13 +
 source/encoder/sei.h                     |      6 -
 source/encoder/slicetype.cpp             |     28 +-
 source/encoder/slicetype.h               |     22 +-
 source/encoder/weightPrediction.cpp      |      3 +-
 source/input/input.h                     |      4 +
 source/input/y4m.h                       |      4 +
 source/input/yuv.h                       |      4 +
 source/test/checkasm-a.asm               |      2 +
 source/test/intrapredharness.cpp         |      2 +
 source/test/ipfilterharness.h            |      1 +
 source/test/pixelharness.cpp             |    136 +-
 source/test/pixelharness.h               |      3 +
 source/test/testbench.cpp                |      1 +
 source/test/testharness.h                |      1 +
 source/x265-extras.cpp                   |    134 +-
 source/x265.cpp                          |      2 +-
 source/x265.h                            |     41 +-
 source/x265cli.h                         |      4 +
 95 files changed, 11696 insertions(+), 4485 deletions(-)

diffs (truncated from 19490 to 300 lines):

diff -r 6eb0b9cf8885 -r f8b8ebdc5457 doc/reST/cli.rst

--- a/doc/reST/cli.rst	Mon Sep 14 09:28:07 2015 +0530
+++ b/doc/reST/cli.rst	Mon Sep 28 14:34:41 2015 +0530
@@ -84,8 +84,8 @@ Logging/Statistic Options
 	it adds one line per run. If :option:`--csv-log-level` is greater than
 	0, it writes one line per frame. Default none
 
-	When frame level logging is enabled, several frame performance
-	statistics are listed:
+	Several frame performance statistics are available when 
+	:option:`--csv-log-level` is greater than or equal to 2:
 
 	**DecideWait ms** number of milliseconds the frame encoder had to
 	wait, since the previous frame was retrieved by the API thread,
@@ -202,15 +202,29 @@ Performance Options
 	"-"       - same as "none"
 	"10"      - allocate one pool, using up to 10 cores on node 0
 	"-,+"     - allocate one pool, using all cores on node 1
-	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
+	"+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
+	"-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
 	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
+	"8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on node 1,2,3
 
-	The total number of threads will be determined by the number of threads
-	assigned to all nodes. The worker threads will each be given affinity for
-	their node, they will not be allowed to migrate between nodes, but they
-	will be allowed to move between CPU cores within their node.
+	A thread pool dedicated to a given NUMA node is enabled only when the
+	number of threads to be created on that NUMA node is explicitly mentioned
+	in that corresponding position with the --pools option. Else, all threads
+	are spawned from a single pool. The total number of threads will be
+	determined by the number of threads assigned to the enabled NUMA nodes for
+	that pool. The worker threads are given affinity to all the enabled
+	NUMA nodes for that pool and may migrate between them, unless explicitly
+	specified as described above.
+
+	In the case that any threadpool has more than 64 threads, the threadpool
+	may be broken down into multiple pools of 64 threads each; on 32-bit
+	machines, this number is 32. All pools are given affinity to the NUMA
+	nodes on which the original pool had affinity. For performance reasons,
+	the last thread pool is spawned only if it has more than 32 threads for
+	64-bit machines, or 16 for 32-bit machines. If the total number of threads
+	in the system doesn't obey this constraint, we may spawn fewer threads
+	than cores which has been empirically shown to be better for performance.
 
 	If the four pool features: :option:`--wpp`, :option:`--pmode`,
 	:option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -219,10 +233,6 @@ Performance Options
 	If "none" is specified, then all four of the thread pool features are
 	implicitly disabled.
 
-	Multiple thread pools will be allocated for any NUMA node with more than
-	64 logical CPU cores. But any given thread pool will always use at most
-	one NUMA node.
-
 	Frame encoders are distributed between the available thread pools,
 	and the encoder will never generate more thread pools than
 	:option:`--frame-threads`.  The pools are used for WPP and for
@@ -238,8 +248,12 @@ Performance Options
 	system, a POSIX build of libx265 without libnuma will be less work
 	efficient. See :ref:`thread pools <pools>` for more detail.
 
-	Default "", one thread is allocated per detected hardware thread
-	(logical CPU cores) and one thread pool per NUMA node.
+	Default "", one pool is created across all available NUMA nodes, with
+	one thread allocated per detected hardware thread
+	(logical CPU cores). In the case that the total number of threads is more
+	than the maximum size that ATOMIC operations can handle (32 for 32-bit
+	compiles, and 64 for 64-bit compiles), multiple thread pools may be
+	spawned subject to the performance constraint described above.
 
 	Note that the string value will need to be escaped or quoted to
 	protect against shell expansion on many platforms
@@ -436,8 +450,8 @@ frame counts) are only applicable to the
 	depth of the encoder. If the requested bit depth is not the bit
 	depth of the linked libx265, it will attempt to bind libx265_main
 	for an 8bit encoder, libx265_main10 for a 10bit encoder, or
-	libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
-	same API version as the linked libx265.
+	libx265_main12 for a 12bit encoder, with the same API version as the
+	linked libx265.
 
 	If the output depth is not specified but :option:`--profile` is
 	specified, the output depth will be derived from the profile name.
@@ -490,13 +504,6 @@ Profile, Level, Tier
 	The CLI application will derive the output bit depth from the
 	profile name if :option:`--output-depth` is not specified.
 
-.. note::
-
-	All 12bit presets are extremely unstable, do not use them yet.
-	16bit is not supported at all, but those profiles are included
-	because it is possible for libx265 to make bitstreams compatible
-	with them.
-
 .. option:: --level-idc <integer|float>
 
 	Minimum decoder requirement level. Defaults to 0, which implies
@@ -1202,6 +1209,13 @@ Quality, rate control and rate distortio
 	is also non-zero. Both vbv-bufsize and vbv-maxrate are required to
 	enable VBV in CRF mode. Default 0 (disabled)
 
+	Note that when VBV is enabled (with a valid :option:`--vbv-bufsize`),
+	VBV emergency denoising is turned on. This will turn on aggressive 
+	denoising at the frame level when frame QP > QP_MAX_SPEC (51), drastically
+	reducing bitrate and allowing ratecontrol to assign lower QPs for
+	the following frames. The visual effect is blurring, but removes 
+	significant blocking/displacement artifacts.
+
 .. option:: --vbv-init <float>
 
 	Initial buffer occupancy. The portion of the decode buffer which
@@ -1644,6 +1658,16 @@ VUI fields must be manually specified.
 	Note that this string value will need to be escaped or quoted to
 	protect against shell expansion on many platforms. No default.
 
+.. option:: --min-luma <integer>
+
+	Minimum luma value allowed for input pictures. Any values below min-luma
+	are clipped. Experimental. No default.
+
+.. option:: --max-luma <integer>
+
+	Maximum luma value allowed for input pictures. Any values above max-luma
+	are clipped. Experimental. No default.
+
 Bitstream options
 =================
 
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/CMakeLists.txt	Mon Sep 28 14:34:41 2015 +0530
@@ -30,7 +30,7 @@ option(STATIC_LINK_CRT "Statically link 
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 68)
+set(X265_BUILD 75)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -71,23 +71,27 @@ if(UNIX)
     if(LIBRT)
         list(APPEND PLATFORM_LIBS rt)
     endif()
+    mark_as_advanced(LIBRT)
     find_library(LIBDL dl)
     if(LIBDL)
         list(APPEND PLATFORM_LIBS dl)
     endif()
-    find_package(Numa)
-    if(NUMA_FOUND)
-        link_directories(${NUMA_LIBRARY_DIR})
-        list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
-        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
-        if(NUMA_V2)
-            add_definitions(-DHAVE_LIBNUMA)
-            message(STATUS "libnuma found, building with support for NUMA nodes")
-            list(APPEND PLATFORM_LIBS numa)
-            include_directories(${NUMA_INCLUDE_DIR})
+    option(ENABLE_LIBNUMA "Enable libnuma usage (Linux only)" ON)
+    if(ENABLE_LIBNUMA)
+        find_package(Numa)
+        if(NUMA_FOUND)
+            link_directories(${NUMA_LIBRARY_DIR})
+            list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
+            if(NUMA_V2)
+                add_definitions(-DHAVE_LIBNUMA)
+                message(STATUS "libnuma found, building with support for NUMA nodes")
+                list(APPEND PLATFORM_LIBS numa)
+                include_directories(${NUMA_INCLUDE_DIR})
+            endif()
         endif()
-    endif()
-    mark_as_advanced(LIBRT NUMA_FOUND)
+        mark_as_advanced(NUMA_FOUND)
+    endif(ENABLE_LIBNUMA)
     option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
     if(NO_ATOMICS)
         add_definitions(-DNO_ATOMICS=1)
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/common/bitstream.h
--- a/source/common/bitstream.h	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/common/bitstream.h	Mon Sep 28 14:34:41 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Author: Steve Borho <steve at borho.org>
+ *         Min Chen <chenm003 at 163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/common/common.h
--- a/source/common/common.h	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/common/common.h	Mon Sep 28 14:34:41 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar <deepthi at multicorewareinc.com>
+ *          Min Chen <chenm003 at 163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -317,6 +318,9 @@ typedef int16_t  coeff_t;      // transf
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
 
+#define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
+#define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
+
 namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/common/constants.cpp
--- a/source/common/constants.cpp	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/common/constants.cpp	Mon Sep 28 14:34:41 2015 +0530
@@ -2,6 +2,7 @@
 * Copyright (C) 2015 x265 project
 *
 * Authors: Steve Borho <steve at borho.org>
+*          Min Chen <chenm003 at 163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/common/constants.h
--- a/source/common/constants.h	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/common/constants.h	Mon Sep 28 14:34:41 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve at borho.org>
+ *          Min Chen <chenm003 at 163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/common/contexts.h
--- a/source/common/contexts.h	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/common/contexts.h	Mon Sep 28 14:34:41 2015 +0530
@@ -2,6 +2,7 @@
 * Copyright (C) 2015 x265 project
 *
 * Authors: Steve Borho <steve at borho.org>
+*          Min Chen <chenm003 at 163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
diff -r 6eb0b9cf8885 -r f8b8ebdc5457 source/common/cudata.cpp
--- a/source/common/cudata.cpp	Mon Sep 14 09:28:07 2015 +0530
+++ b/source/common/cudata.cpp	Mon Sep 28 14:34:41 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve at borho.org>
+ *          Min Chen <chenm003 at 163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -1676,7 +1677,7 @@ int CUData::getPMV(InterNeighbourMV *nei
         if (tempRefIdx != -1)
         {
             uint32_t cuAddr = neighbours[MD_COLLOCATED].cuAddr[picList];
-            const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+            const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
             const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
 
             // Scale the vector
@@ -1857,7 +1858,7 @@ bool CUData::getIndirectPMV(MV& outMV, I
 
 bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const
 {
-    const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+    const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
     const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
 
     uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
@@ -1892,7 +1893,7 @@ bool CUData::getColMVP(MV& outMV, int& o
 // Cache the collocated MV.
 bool CUData::getCollocatedMV(int cuAddr, int partUnitIdx, InterNeighbourMV *neighbour) const
 {
-    const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
+    const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
     const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
 
     uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
@@ -1951,7 +1952,7 @@ void CUData::getTUEntropyCodingParameter
     bool bIsIntra = isIntra(absPartIdx);
 
     // set the group layout
-    result.log2TrSizeCG = log2TrSize - 2;
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
 
     // set the scan orders
     if (bIsIntra)
@@ -1979,7 +1980,7 @@ void CUData::getTUEntropyCodingParameter
         result.scanType = SCAN_DIAG;