[x265-commits] [x265] search: fix decoder intra crash with --cu-lossless
Min Chen
chenm003 at 163.com
Tue Aug 26 22:24:04 CEST 2014
details: http://hg.videolan.org/x265/rev/a0028e5b6177
branches:
changeset: 7884:a0028e5b6177
user: Min Chen <chenm003 at 163.com>
date: Mon Aug 25 16:41:36 2014 -0500
description:
search: fix decoder intra crash with --cu-lossless
Subject: [x265] fix m_initSliceContext (uninitialised m_sliceQp)
details: http://hg.videolan.org/x265/rev/863faab1a004
branches:
changeset: 7885:863faab1a004
user: Satoshi Nakagawa <nakagawa424 at oki.com>
date: Tue Aug 26 17:22:37 2014 +0900
description:
fix m_initSliceContext (uninitialised m_sliceQp)
Subject: [x265] asm: Minor pixel_ssim_end4 improvements
details: http://hg.videolan.org/x265/rev/391e1fbb92cf
branches:
changeset: 7886:391e1fbb92cf
user: Min Chen <chenm003 at 163.com>
date: Tue Aug 26 12:11:56 2014 -0700
description:
asm: Minor pixel_ssim_end4 improvements
Reduce the number of vector registers used from 7 to 5.
Eliminate some moves in the AVX implementation.
Avoid bypass delays for transitioning between int and
float domains.
Ported from Henrik Gramner's recent commit to x264
Subject: [x265] x86inc: Make INIT_CPUFLAGS support an arbitrary number of cpuflags
details: http://hg.videolan.org/x265/rev/090480360cb9
branches:
changeset: 7887:090480360cb9
user: Min Chen <chenm003 at 163.com>
date: Tue Aug 26 12:12:19 2014 -0700
description:
x86inc: Make INIT_CPUFLAGS support an arbitrary number of cpuflags
Ported from Henrik Gramner's recent commit to x264
Subject: [x265] framefilter: move SAO init logic into the frame filter
details: http://hg.videolan.org/x265/rev/78804e5e360c
branches:
changeset: 7888:78804e5e360c
user: Steve Borho <steve at borho.org>
date: Tue Aug 26 13:07:50 2014 -0500
description:
framefilter: move SAO init logic into the frame filter
Subject: [x265] sao: don't pass member variables to functions
details: http://hg.videolan.org/x265/rev/2d386372d543
branches:
changeset: 7889:2d386372d543
user: Steve Borho <steve at borho.org>
date: Mon Aug 25 18:15:00 2014 -0500
description:
sao: don't pass member variables to functions
Subject: [x265] pattern: use isLuma instead of restricted TextType range
details: http://hg.videolan.org/x265/rev/45359413afe6
branches:
changeset: 7890:45359413afe6
user: Steve Borho <steve at borho.org>
date: Tue Aug 26 14:53:53 2014 -0500
description:
pattern: use isLuma instead of restricted TextType range
Subject: [x265] types: remove generic TEXT_CHROMA enum, no longer used
details: http://hg.videolan.org/x265/rev/14fae9208078
branches:
changeset: 7891:14fae9208078
user: Steve Borho <steve at borho.org>
date: Tue Aug 26 14:54:21 2014 -0500
description:
types: remove generic TEXT_CHROMA enum, no longer used
Subject: [x265] common: rename QP range macros to be consistent with x264
details: http://hg.videolan.org/x265/rev/32891b95f669
branches:
changeset: 7892:32891b95f669
user: Steve Borho <steve at borho.org>
date: Tue Aug 26 15:03:38 2014 -0500
description:
common: rename QP range macros to be consistent with x264
I find QP_MAX_SPEC to be a lot more self-explanatory than MAX_QP
diffstat:
source/Lib/TLibCommon/CommonDef.h | 7 ---
source/Lib/TLibCommon/TComPattern.cpp | 8 ++--
source/Lib/TLibCommon/TComPattern.h | 2 +-
source/Lib/TLibCommon/TComRom.cpp | 4 +-
source/Lib/TLibCommon/TComRom.h | 4 +-
source/Lib/TLibCommon/TypeDef.h | 1 -
source/Lib/TLibEncoder/TEncSearch.cpp | 4 ++
source/common/common.h | 7 +++
source/common/deblock.cpp | 6 +-
source/common/param.cpp | 4 +-
source/common/x86/pixel-util8.asm | 62 ++++++++++++++++++++-------------
source/common/x86/x86inc.asm | 41 ++++++++++++----------
source/encoder/frameencoder.cpp | 38 ++++++---------------
source/encoder/framefilter.cpp | 26 ++++++++++++--
source/encoder/framefilter.h | 3 +-
source/encoder/ratecontrol.cpp | 22 ++++++------
source/encoder/sao.cpp | 63 +++++++++++++++++-----------------
source/encoder/sao.h | 9 ++--
18 files changed, 166 insertions(+), 145 deletions(-)
diffs (truncated from 856 to 300 lines):
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/CommonDef.h Tue Aug 26 15:03:38 2014 -0500
@@ -94,13 +94,6 @@
#define REF_PIC_LIST_X 100
#define NOT_VALID -1
-#define MIN_QP 0
-#define MAX_QP 51
-#define MAX_MAX_QP 69
-
-#define MIN_QPSCALE 0.21249999999999999
-#define MAX_MAX_QPSCALE 615.46574234477100
-
#define AMVP_NUM_CANDS 2 // number of AMVP candidates
#define MRG_MAX_NUM_CANDS 5 // max number of final merge candidates
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComPattern.cpp
--- a/source/Lib/TLibCommon/TComPattern.cpp Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComPattern.cpp Tue Aug 26 15:03:38 2014 -0500
@@ -59,7 +59,7 @@ void TComPattern::initAdiPattern(TComDat
IntraNeighbors intraNeighbors;
- initIntraNeighbors(cu, zOrderIdxInPart, partDepth, TEXT_LUMA, &intraNeighbors);
+ initIntraNeighbors(cu, zOrderIdxInPart, partDepth, true, &intraNeighbors);
uint32_t tuSize = intraNeighbors.tuSize;
uint32_t tuSize2 = tuSize << 1;
@@ -172,7 +172,7 @@ void TComPattern::initAdiPatternChroma(T
IntraNeighbors intraNeighbors;
- initIntraNeighbors(cu, zOrderIdxInPart, partDepth, TEXT_CHROMA, &intraNeighbors);
+ initIntraNeighbors(cu, zOrderIdxInPart, partDepth, false, &intraNeighbors);
uint32_t tuSize = intraNeighbors.tuSize;
roiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
@@ -181,13 +181,13 @@ void TComPattern::initAdiPatternChroma(T
fillReferenceSamples(roiOrigin, picStride, adiTemp, intraNeighbors);
}
-void TComPattern::initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, TextType cType, IntraNeighbors *intraNeighbors)
+void TComPattern::initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
{
uint32_t log2TrSize = cu->getLog2CUSize(0) - partDepth;
int log2UnitWidth = LOG2_UNIT_SIZE;
int log2UnitHeight = LOG2_UNIT_SIZE;
- if (cType != TEXT_LUMA)
+ if (!isLuma)
{
log2TrSize -= cu->getHorzChromaShift();
log2UnitWidth -= cu->getHorzChromaShift();
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComPattern.h
--- a/source/Lib/TLibCommon/TComPattern.h Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComPattern.h Tue Aug 26 15:03:38 2014 -0500
@@ -90,7 +90,7 @@ public:
static void initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth,
pixel* adiBuf, uint32_t chromaId);
- static void initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, TextType cType, IntraNeighbors *IntraNeighbors);
+ static void initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
private:
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.cpp Tue Aug 26 15:03:38 2014 -0500
@@ -44,7 +44,7 @@ namespace x265 {
//! \{
// lambda = pow(2, (double)q / 6 - 2);
-double x265_lambda_tab[MAX_MAX_QP + 1] =
+double x265_lambda_tab[QP_MAX_MAX + 1] =
{
0.2500, 0.2806, 0.3150, 0.3536, 0.3969,
0.4454, 0.5000, 0.5612, 0.6300, 0.7071,
@@ -63,7 +63,7 @@ double x265_lambda_tab[MAX_MAX_QP + 1] =
};
// lambda2 = pow(lambda, 2) * scale (0.85);
-double x265_lambda2_tab[MAX_MAX_QP + 1] =
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
{
0.0531, 0.0669, 0.0843, 0.1063, 0.1339,
0.1687, 0.2125, 0.2677, 0.3373, 0.4250,
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.h Tue Aug 26 15:03:38 2014 -0500
@@ -144,8 +144,8 @@ extern const uint8_t g_log2Size[MAX_CU_S
extern const int g_winUnitX[MAX_CHROMA_FORMAT_IDC + 1];
extern const int g_winUnitY[MAX_CHROMA_FORMAT_IDC + 1];
-extern double x265_lambda_tab[MAX_MAX_QP + 1];
-extern double x265_lambda2_tab[MAX_MAX_QP + 1];
+extern double x265_lambda_tab[QP_MAX_MAX + 1];
+extern double x265_lambda2_tab[QP_MAX_MAX + 1];
extern const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1];
// CABAC tables
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TypeDef.h
--- a/source/Lib/TLibCommon/TypeDef.h Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TypeDef.h Tue Aug 26 15:03:38 2014 -0500
@@ -73,7 +73,6 @@ enum PredMode
enum TextType
{
TEXT_LUMA = 0, // luma
- TEXT_CHROMA = 1, // chroma (U+V)
TEXT_CHROMA_U = 1, // chroma U
TEXT_CHROMA_V = 2, // chroma V
MAX_NUM_COMPONENT = 3
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 15:03:38 2014 -0500
@@ -572,6 +572,10 @@ void TEncSearch::xRecurIntraCodingQT(TCo
bool checkTQbypass = cu->m_slice->m_pps->bTransquantBypassEnabled && !m_param->bLossless;
+ // NOTE: transform_quant_bypass just at cu level
+ if ((cu->m_slice->m_pps->bTransquantBypassEnabled) && cu->getCUTransquantBypass(0) != checkTQbypass)
+ checkTQbypass = cu->getCUTransquantBypass(0) && !m_param->bLossless;
+
uint32_t stride = fencYuv->getStride();
pixel* pred = predYuv->getLumaAddr(absPartIdx);
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/common.h
--- a/source/common/common.h Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/common.h Tue Aug 26 15:03:38 2014 -0500
@@ -103,6 +103,13 @@ typedef uint32_t pixel4;
#define X265_DEPTH 8 // compile time configurable bit depth
#endif // if HIGH_BIT_DEPTH
+#define QP_MIN 0
+#define QP_MAX_SPEC 51 /* max allowed signaled QP in HEVC */
+#define QP_MAX_MAX 69 /* max allowed QP to be output by rate control */
+
+#define MIN_QPSCALE 0.21249999999999999
+#define MAX_MAX_QPSCALE 615.46574234477100
+
#define BITS_FOR_POC 8
template<typename T>
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/deblock.cpp
--- a/source/common/deblock.cpp Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/deblock.cpp Tue Aug 26 15:03:38 2014 -0500
@@ -501,8 +501,8 @@ void Deblock::edgeFilterLuma(TComDataCU*
int32_t qp = (qpP + qpQ + 1) >> 1;
int32_t bitdepthScale = 1 << (X265_DEPTH - 8);
- int32_t indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
- int32_t indexB = Clip3(0, MAX_QP, qp + betaOffset);
+ int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
+ int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
int32_t tc = s_tcTable[indexTC] * bitdepthScale;
int32_t beta = s_betaTable[indexB] * bitdepthScale;
@@ -641,7 +641,7 @@ void Deblock::edgeFilterChroma(TComDataC
}
int32_t bitdepthScale = 1 << (X265_DEPTH - 8);
- int32_t indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset);
+ int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset);
int32_t tc = s_tcTable[indexTC] * bitdepthScale;
for (uint32_t step = 0; step < loopLength; step++)
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/param.cpp
--- a/source/common/param.cpp Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/param.cpp Tue Aug 26 15:03:38 2014 -0500
@@ -1245,7 +1245,7 @@ char *x265_param2string(x265_param *p)
s += sprintf(s, " bitrate=%d ratetol=%.1f",
p->rc.bitrate, p->rc.rateTolerance);
s += sprintf(s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
- p->rc.qCompress, MIN_QP, MAX_QP, p->rc.qpStep);
+ p->rc.qCompress, QP_MIN, QP_MAX_SPEC, p->rc.qpStep);
if (p->rc.bStatRead)
s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
p->rc.complexityBlur, p->rc.qblur);
@@ -1288,7 +1288,7 @@ bool parseLambdaFile(x265_param *param)
{
double *table = t ? x265_lambda2_tab : x265_lambda_tab;
- for (int i = 0; i < MAX_MAX_QP + 1; i++)
+ for (int i = 0; i < QP_MAX_MAX + 1; i++)
{
double value;
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/x86/pixel-util8.asm Tue Aug 26 15:03:38 2014 -0500
@@ -2007,13 +2007,13 @@ cglobal pixel_ssim_4x4x2_core, 4,4,8
;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 2,3,7
+cglobal pixel_ssim_end4, 2,3
mov r2d, r2m
- movdqa m0, [r0+ 0]
- movdqa m1, [r0+16]
- movdqa m2, [r0+32]
- movdqa m3, [r0+48]
- movdqa m4, [r0+64]
+ mova m0, [r0+ 0]
+ mova m1, [r0+16]
+ mova m2, [r0+32]
+ mova m3, [r0+48]
+ mova m4, [r0+64]
paddd m0, [r1+ 0]
paddd m1, [r1+16]
paddd m2, [r1+32]
@@ -2023,8 +2023,6 @@ cglobal pixel_ssim_end4, 2,3,7
paddd m1, m2
paddd m2, m3
paddd m3, m4
- movdqa m5, [ssim_c1]
- movdqa m6, [ssim_c2]
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
@@ -2033,20 +2031,21 @@ cglobal pixel_ssim_end4, 2,3,7
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
+ mulps m4, m0, m1 ; s1*s2
+ mulps m0, m0 ; s1*s1
+ mulps m1, m1 ; s2*s2
mulps m2, [pf_64] ; ss*64
mulps m3, [pf_128] ; s12*128
- movdqa m4, m1
- mulps m4, m0 ; s1*s2
- mulps m1, m1 ; s2*s2
- mulps m0, m0 ; s1*s1
addps m4, m4 ; s1*s2*2
addps m0, m1 ; s1*s1 + s2*s2
subps m2, m0 ; vars
subps m3, m4 ; covar*2
- addps m4, m5 ; s1*s2*2 + ssim_c1
- addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1
- addps m2, m6 ; vars + ssim_c2
- addps m3, m6 ; covar*2 + ssim_c2
+ movaps m1, [ssim_c1]
+ addps m4, m1 ; s1*s2*2 + ssim_c1
+ addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
+ movaps m1, [ssim_c2]
+ addps m2, m1 ; vars + ssim_c2
+ addps m3, m1 ; covar*2 + ssim_c2
%else
pmaddwd m4, m1, m0 ; s1*s2
pslld m1, 16
@@ -2057,10 +2056,12 @@ cglobal pixel_ssim_end4, 2,3,7
pslld m2, 6
psubd m3, m4 ; covar*2
psubd m2, m0 ; vars
- paddd m0, m5
- paddd m4, m5
- paddd m3, m6
- paddd m2, m6
+ mova m1, [ssim_c1]
+ paddd m0, m1
+ paddd m4, m1
+ mova m1, [ssim_c2]
+ paddd m3, m1
+ paddd m2, m1
cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
@@ -2073,20 +2074,31 @@ cglobal pixel_ssim_end4, 2,3,7
cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
+
%ifdef PIC
lea r3, [mask_ff + 16]
- movdqu m1, [r3 + r2*4]
+ %xdefine %%mask r3
%else
- movdqu m1, [mask_ff + r2*4 + 16]
+ %xdefine %%mask mask_ff + 16
%endif
- pand m4, m1
+%if cpuflag(avx)
+ andps m4, [%%mask + r2*4]
+%else
+ movups m0, [%%mask + r2*4]
+ andps m4, m0
+%endif
+
.skip:
movhlps m0, m4
addps m0, m4
- pshuflw m4, m0, q0032
+%if cpuflag(ssse3)
+ movshdup m4, m0
+%else
+ pshuflw m4, m0, q0032
+%endif
addss m0, m4
%if ARCH_X86_64 == 0
- movd r0m, m0
+ movss r0m, m0
More information about the x265-commits mailing list