[x265-commits] [x265] search: use chroma reconQt buffer instead of recon pictur...
Steve Borho
steve at borho.org
Tue Feb 10 22:50:30 CET 2015
details: http://hg.videolan.org/x265/rev/9f0324125f53
branches: stable
changeset: 9336:9f0324125f53
user: Steve Borho <steve at borho.org>
date: Tue Feb 10 15:15:13 2015 -0600
description:
search: use chroma reconQt buffer instead of recon picture (not yet updated)
This fixes non-determinism when psy-rd is enabled
Subject: [x265] Merge with stable
details: http://hg.videolan.org/x265/rev/1ed7dd760d0f
branches:
changeset: 9337:1ed7dd760d0f
user: Steve Borho <steve at borho.org>
date: Tue Feb 10 15:15:25 2015 -0600
description:
Merge with stable
diffstat:
doc/reST/cli.rst | 19 +-
doc/reST/threading.rst | 11 +-
source/CMakeLists.txt | 11 +-
source/common/bitstream.cpp | 2 +-
source/common/ipfilter.cpp | 45 +-
source/common/param.cpp | 7 +-
source/common/picyuv.cpp | 4 -
source/common/pixel.cpp | 2 +-
source/common/primitives.cpp | 1 +
source/common/primitives.h | 10 +-
source/common/quant.cpp | 78 ++++-
source/common/scalinglist.cpp | 2 +-
source/common/shortyuv.cpp | 6 +-
source/common/slice.cpp | 4 +-
source/common/slice.h | 11 +-
source/common/threading.h | 19 +-
source/common/x86/blockcopy8.asm | 677 +++++++++++++++++++++++---------------
source/encoder/analysis.cpp | 45 ++-
source/encoder/dpb.cpp | 44 +-
source/encoder/dpb.h | 4 +-
source/encoder/encoder.cpp | 173 ++++++++-
source/encoder/encoder.h | 2 +-
source/encoder/entropy.cpp | 81 ++--
source/encoder/entropy.h | 6 +-
source/encoder/frameencoder.cpp | 17 +-
source/encoder/frameencoder.h | 11 +-
source/encoder/framefilter.cpp | 5 +
source/encoder/level.cpp | 25 +-
source/encoder/nal.cpp | 2 +-
source/encoder/search.cpp | 289 ++++++++-------
source/encoder/search.h | 89 +++++-
source/encoder/slicetype.cpp | 33 +-
source/encoder/slicetype.h | 17 +-
source/output/y4m.cpp | 8 -
source/output/yuv.cpp | 4 -
source/test/ipfilterharness.cpp | 73 ++++-
source/test/ipfilterharness.h | 4 +-
source/x265.h | 29 +-
source/x265cli.h | 9 +-
39 files changed, 1287 insertions(+), 592 deletions(-)
diffs (truncated from 3199 to 300 lines):
diff -r 0d30d2641875 -r 1ed7dd760d0f doc/reST/cli.rst
--- a/doc/reST/cli.rst Mon Feb 09 16:45:18 2015 -0600
+++ b/doc/reST/cli.rst Tue Feb 10 15:15:25 2015 -0600
@@ -171,6 +171,8 @@ Performance Options
Over-allocation of frame threads will not improve performance, it
will generally just increase memory use.
+ **Values:** any value between 0 and 16. Default is 0, auto-detect
+
.. option:: --threads <integer>
Number of threads to allocate for the worker thread pool. This pool
@@ -409,7 +411,10 @@ Profile, Level, Tier
If :option:`--level-idc` has been specified, the option adds the
intention to support the High tier of that level. If your specified
level does not support a High tier, a warning is issued and this
- modifier flag is ignored.
+ modifier flag is ignored. If :option:`--level-idc` has been specified,
+ but not --high-tier, then the encoder will attempt to encode at the
+ specified level, main tier first, turning on high tier only if
+ necessary and available at that level.
.. note::
:option:`--profile`, :option:`--level-idc`, and
@@ -1357,6 +1362,18 @@ Bitstream options
Picture Timing SEI messages providing timing information to the
decoder. Default disabled
+.. option:: --temporal-layers,--no-temporal-layers
+
+ Enable a temporal sub layer. All referenced I/P/B frames are in the
+ base layer and all unreferenced B frames are placed in a temporal
+ sublayer. A decoder may choose to drop the sublayer and only decode
+ and display the base layer slices.
+
+ If used with a fixed GOP (:option:`b-adapt` 0) and :option:`bframes`
+ 3 then the two layers evenly split the frame rate, with a cadence of
+ PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
+ interval that is a multiple of 4.
+
.. option:: --aud, --no-aud
Emit an access unit delimiter NAL at the start of each slice access
diff -r 0d30d2641875 -r 1ed7dd760d0f doc/reST/threading.rst
--- a/doc/reST/threading.rst Mon Feb 09 16:45:18 2015 -0600
+++ b/doc/reST/threading.rst Tue Feb 10 15:15:25 2015 -0600
@@ -125,9 +125,14 @@ The second extenuating circumstance is t
for motion reference must be processed by the loop filters and the loop
filters cannot run until a full row has been encoded, and it must run a
full row behind the encode process so that the pixels below the row
-being filtered are available. When you add up all the row lags each
-frame ends up being 3 CTU rows behind its reference frames (the
-equivalent of 12 macroblock rows for x264)
+being filtered are available. On top of this, HEVC has two loop filters:
+deblocking and SAO, which must be run in series with a row lag between
+them. When you add up all the row lags each frame ends up being 3 CTU
+rows behind its reference frames (the equivalent of 12 macroblock rows
+for x264). And keep in mind the wave-front progression pattern; by the
+time the reference frame finishes the third row of CTUs, nearly half of
+the CTUs in the frame may be compressed (depending on the display aspect
+ratio).
The third extenuating circumstance is that when a frame being encoded
becomes blocked waiting for a reference frame row to become available, that frame's
diff -r 0d30d2641875 -r 1ed7dd760d0f source/CMakeLists.txt
--- a/source/CMakeLists.txt Mon Feb 09 16:45:18 2015 -0600
+++ b/source/CMakeLists.txt Tue Feb 10 15:15:25 2015 -0600
@@ -21,7 +21,7 @@ include(CheckSymbolExists)
include(CheckCXXCompilerFlag)
# X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 43)
+set(X265_BUILD 44)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -240,6 +240,11 @@ if(ENABLE_VTUNE)
add_subdirectory(profile/vtune)
endif(ENABLE_VTUNE)
+option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF)
+if(DETAILED_CU_STATS)
+ add_definitions(-DDETAILED_CU_STATS)
+endif(DETAILED_CU_STATS)
+
add_subdirectory(encoder)
add_subdirectory(common)
@@ -375,10 +380,10 @@ if(ENABLE_CLI)
if(XCODE)
# Xcode seems unable to link the CLI with libs, so link as one target
- add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h
+ add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h x265cli.h
$<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
else()
- add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE} x265.cpp x265.h)
+ add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE} x265.cpp x265.h x265cli.h)
if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
# The CLI cannot link to the shared library on Windows, it
# requires internal APIs not exported from the DLL
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/bitstream.cpp
--- a/source/common/bitstream.cpp Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/bitstream.cpp Tue Feb 10 15:15:25 2015 -0600
@@ -27,7 +27,7 @@ void Bitstream::push_back(uint8_t val)
uint8_t *temp = X265_MALLOC(uint8_t, m_byteAlloc * 2);
if (temp)
{
- ::memcpy(temp, m_fifo, m_byteOccupancy);
+ memcpy(temp, m_fifo, m_byteOccupancy);
X265_FREE(m_fifo);
m_fifo = temp;
m_byteAlloc *= 2;
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/ipfilter.cpp Tue Feb 10 15:15:25 2015 -0600
@@ -34,8 +34,27 @@ using namespace x265;
#endif
namespace {
+template<int dstStride, int width, int height>
+void pixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst)
+{
+ int shift = IF_INTERNAL_PREC - X265_DEPTH;
+ int row, col;
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col++)
+ {
+ int16_t val = src[col] << shift;
+ dst[col] = val - (int16_t)IF_INTERNAL_OFFS;
+ }
+
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
template<int dstStride>
-void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
+void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
{
int shift = IF_INTERNAL_PREC - X265_DEPTH;
int row, col;
@@ -65,8 +84,8 @@ void extendCURowColBorder(pixel* txt, in
}
#else
- ::memset(txt - marginX, txt[0], marginX);
- ::memset(txt + width, txt[width - 1], marginX);
+ memset(txt - marginX, txt[0], marginX);
+ memset(txt + width, txt[width - 1], marginX);
#endif
txt += stride;
@@ -378,7 +397,8 @@ namespace x265 {
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE / 2, W, H>;
#define CHROMA_422(W, H) \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
@@ -386,7 +406,8 @@ namespace x265 {
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE / 2, W, H>;
#define CHROMA_444(W, H) \
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
@@ -394,7 +415,8 @@ namespace x265 {
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
- p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE, W, H>;
#define LUMA(W, H) \
p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \
@@ -403,7 +425,8 @@ namespace x265 {
p.pu[LUMA_ ## W ## x ## H].luma_vps = interp_vert_ps_c<8, W, H>; \
p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \
p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \
- p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>;
+ p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].filter_p2s = pixelToShort_c<MAX_CU_SIZE, W, H>
void setupFilterPrimitives_c(EncoderPrimitives& p)
{
@@ -507,11 +530,11 @@ void setupFilterPrimitives_c(EncoderPrim
CHROMA_444(48, 64);
CHROMA_444(64, 16);
CHROMA_444(16, 64);
- p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
+ p.luma_p2s = filterPixelToShort_c<MAX_CU_SIZE>;
- p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
- p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
- p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
+ p.chroma[X265_CSP_I444].p2s = filterPixelToShort_c<MAX_CU_SIZE>;
+ p.chroma[X265_CSP_I420].p2s = filterPixelToShort_c<MAX_CU_SIZE / 2>;
+ p.chroma[X265_CSP_I422].p2s = filterPixelToShort_c<MAX_CU_SIZE / 2>;
p.extendRowBorder = extendCURowColBorder;
}
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/param.cpp
--- a/source/common/param.cpp Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/param.cpp Tue Feb 10 15:15:25 2015 -0600
@@ -181,6 +181,7 @@ void x265_param_default(x265_param *para
param->bIntraInBFrames = 0;
param->bLossless = 0;
param->bCULossless = 0;
+ param->bEnableTemporalSubLayers = 0;
/* Rate control options */
param->rc.vbvMaxBitrate = 0;
@@ -598,6 +599,7 @@ int x265_param_parse(x265_param *p, cons
p->scenecutThreshold = atoi(value);
}
}
+ OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
OPT("keyint") p->keyframeMax = atoi(value);
OPT("min-keyint") p->keyframeMin = atoi(value);
OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
@@ -992,8 +994,8 @@ int x265_check_params(x265_param *param)
"subme must be less than or equal to X265_MAX_SUBPEL_LEVEL (7)");
CHECK(param->subpelRefine < 0,
"subme must be greater than or equal to 0");
- CHECK(param->frameNumThreads < 0,
- "frameNumThreads (--frame-threads) must be 0 or higher");
+ CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS,
+ "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)");
CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12");
CHECK(param->cbQpOffset > 12, "Max. Chroma Cb QP Offset is 12");
CHECK(param->crQpOffset < -12, "Min. Chroma Cr QP Offset is -12");
@@ -1309,6 +1311,7 @@ char *x265_param2string(x265_param *p)
BOOL(p->bEnableConstrainedIntra, "constrained-intra");
BOOL(p->bEnableFastIntra, "fast-intra");
BOOL(p->bOpenGOP, "open-gop");
+ BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
s += sprintf(s, " interlace=%d", p->interlaceMode);
s += sprintf(s, " keyint=%d", p->keyframeMax);
s += sprintf(s, " min-keyint=%d", p->keyframeMin);
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/picyuv.cpp
--- a/source/common/picyuv.cpp Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/picyuv.cpp Tue Feb 10 15:15:25 2015 -0600
@@ -229,9 +229,7 @@ void PicYuv::copyFromPicture(const x265_
for (int r = 0; r < height; r++)
{
for (int x = 0; x < padx; x++)
- {
Y[width + x] = Y[width - 1];
- }
Y += m_stride;
}
@@ -257,9 +255,7 @@ void PicYuv::copyFromPicture(const x265_
pixel *V = m_picOrg[2] + ((height >> m_vChromaShift) - 1) * m_strideC;
for (int i = 1; i <= pady; i++)
- {
memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));
- }
for (int j = 1; j <= pady >> m_vChromaShift; j++)
{
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/pixel.cpp Tue Feb 10 15:15:25 2015 -0600
@@ -527,7 +527,7 @@ void weight_sp_c(const int16_t* src, pix
X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");
- X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n");
+ X265_CHECK(!(round & ((1 << (correction - 1)) - 1)), "round must be include factor correction, please update ASM ABI\n");
for (y = 0; y <= height - 1; y++)
{
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/primitives.cpp
--- a/source/common/primitives.cpp Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/primitives.cpp Tue Feb 10 15:15:25 2015 -0600
@@ -98,6 +98,7 @@ void setupAliasPrimitives(EncoderPrimiti
p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg;
p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
+ p.chroma[X265_CSP_I444].pu[i].chroma_p2s = p.pu[i].filter_p2s;
}
for (int i = 0; i < NUM_CU_SIZES; i++)
diff -r 0d30d2641875 -r 1ed7dd760d0f source/common/primitives.h
--- a/source/common/primitives.h Mon Feb 09 16:45:18 2015 -0600
+++ b/source/common/primitives.h Tue Feb 10 15:15:25 2015 -0600
@@ -155,7 +155,8 @@ typedef void (*filter_ps_t) (const pixel
More information about the x265-commits
mailing list