[x265-commits] [x265] search: fix decoder intra crash with --cu-lossless

Tue Aug 26 22:24:04 CEST 2014

details:   http://hg.videolan.org/x265/rev/a0028e5b6177
branches:  
changeset: 7884:a0028e5b6177
user:      Min Chen <chenm003 at 163.com>
date:      Mon Aug 25 16:41:36 2014 -0500
description:
search: fix decoder intra crash with --cu-lossless
Subject: [x265] fix m_initSliceContext (uninitialised m_sliceQp)

details:   http://hg.videolan.org/x265/rev/863faab1a004
branches:  
changeset: 7885:863faab1a004
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Tue Aug 26 17:22:37 2014 +0900
description:
fix m_initSliceContext (uninitialised m_sliceQp)
Subject: [x265] asm: Minor pixel_ssim_end4 improvements

details:   http://hg.videolan.org/x265/rev/391e1fbb92cf
branches:  
changeset: 7886:391e1fbb92cf
user:      Min Chen <chenm003 at 163.com>
date:      Tue Aug 26 12:11:56 2014 -0700
description:
asm: Minor pixel_ssim_end4 improvements

Reduce the number of vector registers used from 7 to 5.
Eliminate some moves in the AVX implementation.
Avoid bypass delays for transitioning between int and
float domains.

Ported from Henrik Gramner's recent commit to x264
Subject: [x265] x86inc: Make INIT_CPUFLAGS support an arbitrary number of cpuflags

details:   http://hg.videolan.org/x265/rev/090480360cb9
branches:  
changeset: 7887:090480360cb9
user:      Min Chen <chenm003 at 163.com>
date:      Tue Aug 26 12:12:19 2014 -0700
description:
x86inc: Make INIT_CPUFLAGS support an arbitrary number of cpuflags

Ported from Henrik Gramner's recent commit to x264
Subject: [x265] framefilter: move SAO init logic into the frame filter

details:   http://hg.videolan.org/x265/rev/78804e5e360c
branches:  
changeset: 7888:78804e5e360c
user:      Steve Borho <steve at borho.org>
date:      Tue Aug 26 13:07:50 2014 -0500
description:
framefilter: move SAO init logic into the frame filter
Subject: [x265] sao: don't pass member variables to functions

details:   http://hg.videolan.org/x265/rev/2d386372d543
branches:  
changeset: 7889:2d386372d543
user:      Steve Borho <steve at borho.org>
date:      Mon Aug 25 18:15:00 2014 -0500
description:
sao: don't pass member variables to functions
Subject: [x265] pattern: use isLuma instead of restricted TextType range

details:   http://hg.videolan.org/x265/rev/45359413afe6
branches:  
changeset: 7890:45359413afe6
user:      Steve Borho <steve at borho.org>
date:      Tue Aug 26 14:53:53 2014 -0500
description:
pattern: use isLuma instead of restricted TextType range
Subject: [x265] types: remove generic TEXT_CHROMA enum, no longer used

details:   http://hg.videolan.org/x265/rev/14fae9208078
branches:  
changeset: 7891:14fae9208078
user:      Steve Borho <steve at borho.org>
date:      Tue Aug 26 14:54:21 2014 -0500
description:
types: remove generic TEXT_CHROMA enum, no longer used
Subject: [x265] common: rename QP range macros to be consistent with x264

details:   http://hg.videolan.org/x265/rev/32891b95f669
branches:  
changeset: 7892:32891b95f669
user:      Steve Borho <steve at borho.org>
date:      Tue Aug 26 15:03:38 2014 -0500
description:
common: rename QP range macros to be consistent with x264

I find QP_MAX_SPEC to be a lot more self-explanatory than MAX_QP

diffstat:

 source/Lib/TLibCommon/CommonDef.h     |   7 ---
 source/Lib/TLibCommon/TComPattern.cpp |   8 ++--
 source/Lib/TLibCommon/TComPattern.h   |   2 +-
 source/Lib/TLibCommon/TComRom.cpp     |   4 +-
 source/Lib/TLibCommon/TComRom.h       |   4 +-
 source/Lib/TLibCommon/TypeDef.h       |   1 -
 source/Lib/TLibEncoder/TEncSearch.cpp |   4 ++
 source/common/common.h                |   7 +++
 source/common/deblock.cpp             |   6 +-
 source/common/param.cpp               |   4 +-
 source/common/x86/pixel-util8.asm     |  62 ++++++++++++++++++++-------------
 source/common/x86/x86inc.asm          |  41 ++++++++++++----------
 source/encoder/frameencoder.cpp       |  38 ++++++---------------
 source/encoder/framefilter.cpp        |  26 ++++++++++++--
 source/encoder/framefilter.h          |   3 +-
 source/encoder/ratecontrol.cpp        |  22 ++++++------
 source/encoder/sao.cpp                |  63 +++++++++++++++++-----------------
 source/encoder/sao.h                  |   9 ++--
 18 files changed, 166 insertions(+), 145 deletions(-)

diffs (truncated from 856 to 300 lines):

diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/CommonDef.h

--- a/source/Lib/TLibCommon/CommonDef.h	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/CommonDef.h	Tue Aug 26 15:03:38 2014 -0500
@@ -94,13 +94,6 @@
 #define REF_PIC_LIST_X              100
 #define NOT_VALID                   -1
 
-#define MIN_QP                      0
-#define MAX_QP                      51
-#define MAX_MAX_QP                  69
-
-#define MIN_QPSCALE                 0.21249999999999999
-#define MAX_MAX_QPSCALE             615.46574234477100
-
 #define AMVP_NUM_CANDS              2 // number of AMVP candidates
 #define MRG_MAX_NUM_CANDS           5 // max number of final merge candidates
 
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComPattern.cpp
--- a/source/Lib/TLibCommon/TComPattern.cpp	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComPattern.cpp	Tue Aug 26 15:03:38 2014 -0500
@@ -59,7 +59,7 @@ void TComPattern::initAdiPattern(TComDat
 
     IntraNeighbors intraNeighbors;
 
-    initIntraNeighbors(cu, zOrderIdxInPart, partDepth, TEXT_LUMA, &intraNeighbors);
+    initIntraNeighbors(cu, zOrderIdxInPart, partDepth, true, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
     uint32_t tuSize2 = tuSize << 1;
 
@@ -172,7 +172,7 @@ void TComPattern::initAdiPatternChroma(T
 
     IntraNeighbors intraNeighbors;
 
-    initIntraNeighbors(cu, zOrderIdxInPart, partDepth, TEXT_CHROMA, &intraNeighbors);
+    initIntraNeighbors(cu, zOrderIdxInPart, partDepth, false, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
 
     roiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
@@ -181,13 +181,13 @@ void TComPattern::initAdiPatternChroma(T
     fillReferenceSamples(roiOrigin, picStride, adiTemp, intraNeighbors);
 }
 
-void TComPattern::initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, TextType cType, IntraNeighbors *intraNeighbors)
+void TComPattern::initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
 {
     uint32_t log2TrSize = cu->getLog2CUSize(0) - partDepth;
     int log2UnitWidth  = LOG2_UNIT_SIZE;
     int log2UnitHeight = LOG2_UNIT_SIZE;
 
-    if (cType != TEXT_LUMA)
+    if (!isLuma)
     {
         log2TrSize     -= cu->getHorzChromaShift();
         log2UnitWidth  -= cu->getHorzChromaShift();
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComPattern.h
--- a/source/Lib/TLibCommon/TComPattern.h	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComPattern.h	Tue Aug 26 15:03:38 2014 -0500
@@ -90,7 +90,7 @@ public:
     static void initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth,
                                      pixel* adiBuf, uint32_t chromaId);
 
-    static void initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, TextType cType, IntraNeighbors *IntraNeighbors);
+    static void initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
 
 private:
 
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.cpp	Tue Aug 26 15:03:38 2014 -0500
@@ -44,7 +44,7 @@ namespace x265 {
 //! \{
 
 // lambda = pow(2, (double)q / 6 - 2);
-double x265_lambda_tab[MAX_MAX_QP + 1] =
+double x265_lambda_tab[QP_MAX_MAX + 1] =
 {
     0.2500, 0.2806, 0.3150, 0.3536, 0.3969,
     0.4454, 0.5000, 0.5612, 0.6300, 0.7071,
@@ -63,7 +63,7 @@ double x265_lambda_tab[MAX_MAX_QP + 1] =
 };
 
 // lambda2 = pow(lambda, 2) * scale (0.85);
-double x265_lambda2_tab[MAX_MAX_QP + 1] =
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
 {
     0.0531, 0.0669, 0.0843, 0.1063, 0.1339,
     0.1687, 0.2125, 0.2677, 0.3373, 0.4250,
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.h	Tue Aug 26 15:03:38 2014 -0500
@@ -144,8 +144,8 @@ extern const uint8_t g_log2Size[MAX_CU_S
 extern const int g_winUnitX[MAX_CHROMA_FORMAT_IDC + 1];
 extern const int g_winUnitY[MAX_CHROMA_FORMAT_IDC + 1];
 
-extern double x265_lambda_tab[MAX_MAX_QP + 1];
-extern double x265_lambda2_tab[MAX_MAX_QP + 1];
+extern double x265_lambda_tab[QP_MAX_MAX + 1];
+extern double x265_lambda2_tab[QP_MAX_MAX + 1];
 extern const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1];
 
 // CABAC tables
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibCommon/TypeDef.h
--- a/source/Lib/TLibCommon/TypeDef.h	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibCommon/TypeDef.h	Tue Aug 26 15:03:38 2014 -0500
@@ -73,7 +73,6 @@ enum PredMode
 enum TextType
 {
     TEXT_LUMA     = 0,  // luma
-    TEXT_CHROMA   = 1,  // chroma (U+V)
     TEXT_CHROMA_U = 1,  // chroma U
     TEXT_CHROMA_V = 2,  // chroma V
     MAX_NUM_COMPONENT = 3
diff -r 5acfb12ec5d1 -r 32891b95f669 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Aug 26 15:03:38 2014 -0500
@@ -572,6 +572,10 @@ void TEncSearch::xRecurIntraCodingQT(TCo
 
         bool checkTQbypass = cu->m_slice->m_pps->bTransquantBypassEnabled && !m_param->bLossless;
 
+        // NOTE: transform_quant_bypass just at cu level
+        if ((cu->m_slice->m_pps->bTransquantBypassEnabled) && cu->getCUTransquantBypass(0) != checkTQbypass)
+            checkTQbypass = cu->getCUTransquantBypass(0) && !m_param->bLossless;
+
         uint32_t stride = fencYuv->getStride();
         pixel*   pred   = predYuv->getLumaAddr(absPartIdx);
 
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/common.h
--- a/source/common/common.h	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/common.h	Tue Aug 26 15:03:38 2014 -0500
@@ -103,6 +103,13 @@ typedef uint32_t pixel4;
 #define X265_DEPTH 8           // compile time configurable bit depth
 #endif // if HIGH_BIT_DEPTH
 
+#define QP_MIN      0
+#define QP_MAX_SPEC 51 /* max allowed signaled QP in HEVC */
+#define QP_MAX_MAX  69 /* max allowed QP to be output by rate control */
+
+#define MIN_QPSCALE     0.21249999999999999
+#define MAX_MAX_QPSCALE 615.46574234477100
+
 #define BITS_FOR_POC 8
 
 template<typename T>
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/deblock.cpp
--- a/source/common/deblock.cpp	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/deblock.cpp	Tue Aug 26 15:03:38 2014 -0500
@@ -501,8 +501,8 @@ void Deblock::edgeFilterLuma(TComDataCU*
             int32_t qp = (qpP + qpQ + 1) >> 1;
             int32_t bitdepthScale = 1 << (X265_DEPTH - 8);
 
-            int32_t indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
-            int32_t indexB = Clip3(0, MAX_QP, qp + betaOffset);
+            int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
+            int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);
 
             int32_t tc = s_tcTable[indexTC] * bitdepthScale;
             int32_t beta = s_betaTable[indexB] * bitdepthScale;
@@ -641,7 +641,7 @@ void Deblock::edgeFilterChroma(TComDataC
                 }
 
                 int32_t bitdepthScale = 1 << (X265_DEPTH - 8);
-                int32_t indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset);
+                int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset);
                 int32_t tc = s_tcTable[indexTC] * bitdepthScale;
 
                 for (uint32_t step = 0; step < loopLength; step++)
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/param.cpp
--- a/source/common/param.cpp	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/param.cpp	Tue Aug 26 15:03:38 2014 -0500
@@ -1245,7 +1245,7 @@ char *x265_param2string(x265_param *p)
             s += sprintf(s, " bitrate=%d ratetol=%.1f",
                          p->rc.bitrate, p->rc.rateTolerance);
         s += sprintf(s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
-                     p->rc.qCompress, MIN_QP, MAX_QP, p->rc.qpStep);
+                     p->rc.qCompress, QP_MIN, QP_MAX_SPEC, p->rc.qpStep);
         if (p->rc.bStatRead)
             s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
                           p->rc.complexityBlur, p->rc.qblur);
@@ -1288,7 +1288,7 @@ bool parseLambdaFile(x265_param *param)
     {
         double *table = t ? x265_lambda2_tab : x265_lambda_tab;
 
-        for (int i = 0; i < MAX_MAX_QP + 1; i++)
+        for (int i = 0; i < QP_MAX_MAX + 1; i++)
         {
             double value;
 
diff -r 5acfb12ec5d1 -r 32891b95f669 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Aug 25 17:53:12 2014 +0900
+++ b/source/common/x86/pixel-util8.asm	Tue Aug 26 15:03:38 2014 -0500
@@ -2007,13 +2007,13 @@ cglobal pixel_ssim_4x4x2_core, 4,4,8
 ;-----------------------------------------------------------------------------
 ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
 ;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 2,3,7
+cglobal pixel_ssim_end4, 2,3
     mov       r2d, r2m
-    movdqa    m0, [r0+ 0]
-    movdqa    m1, [r0+16]
-    movdqa    m2, [r0+32]
-    movdqa    m3, [r0+48]
-    movdqa    m4, [r0+64]
+    mova      m0, [r0+ 0]
+    mova      m1, [r0+16]
+    mova      m2, [r0+32]
+    mova      m3, [r0+48]
+    mova      m4, [r0+64]
     paddd     m0, [r1+ 0]
     paddd     m1, [r1+16]
     paddd     m2, [r1+32]
@@ -2023,8 +2023,6 @@ cglobal pixel_ssim_end4, 2,3,7
     paddd     m1, m2
     paddd     m2, m3
     paddd     m3, m4
-    movdqa    m5, [ssim_c1]
-    movdqa    m6, [ssim_c2]
     TRANSPOSE4x4D  0, 1, 2, 3, 4
 
 ;   s1=m0, s2=m1, ss=m2, s12=m3
@@ -2033,20 +2031,21 @@ cglobal pixel_ssim_end4, 2,3,7
     cvtdq2ps  m1, m1
     cvtdq2ps  m2, m2
     cvtdq2ps  m3, m3
+    mulps     m4, m0, m1  ; s1*s2
+    mulps     m0, m0      ; s1*s1
+    mulps     m1, m1      ; s2*s2
     mulps     m2, [pf_64] ; ss*64
     mulps     m3, [pf_128] ; s12*128
-    movdqa    m4, m1
-    mulps     m4, m0      ; s1*s2
-    mulps     m1, m1      ; s2*s2
-    mulps     m0, m0      ; s1*s1
     addps     m4, m4      ; s1*s2*2
     addps     m0, m1      ; s1*s1 + s2*s2
     subps     m2, m0      ; vars
     subps     m3, m4      ; covar*2
-    addps     m4, m5      ; s1*s2*2 + ssim_c1
-    addps     m0, m5      ; s1*s1 + s2*s2 + ssim_c1
-    addps     m2, m6      ; vars + ssim_c2
-    addps     m3, m6      ; covar*2 + ssim_c2
+    movaps    m1, [ssim_c1]
+    addps     m4, m1      ; s1*s2*2 + ssim_c1
+    addps     m0, m1      ; s1*s1 + s2*s2 + ssim_c1
+    movaps    m1, [ssim_c2]
+    addps     m2, m1      ; vars + ssim_c2
+    addps     m3, m1      ; covar*2 + ssim_c2
 %else
     pmaddwd   m4, m1, m0  ; s1*s2
     pslld     m1, 16
@@ -2057,10 +2056,12 @@ cglobal pixel_ssim_end4, 2,3,7
     pslld     m2, 6
     psubd     m3, m4  ; covar*2
     psubd     m2, m0  ; vars
-    paddd     m0, m5
-    paddd     m4, m5
-    paddd     m3, m6
-    paddd     m2, m6
+    mova      m1, [ssim_c1]
+    paddd     m0, m1
+    paddd     m4, m1
+    mova      m1, [ssim_c2]
+    paddd     m3, m1
+    paddd     m2, m1
     cvtdq2ps  m0, m0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
     cvtdq2ps  m4, m4  ; (float)(s1*s2*2 + ssim_c1)
     cvtdq2ps  m3, m3  ; (float)(covar*2 + ssim_c2)
@@ -2073,20 +2074,31 @@ cglobal pixel_ssim_end4, 2,3,7
     cmp       r2d, 4
     je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
     neg       r2
+
 %ifdef PIC
     lea       r3, [mask_ff + 16]
-    movdqu    m1, [r3 + r2*4]
+    %xdefine %%mask r3
 %else
-    movdqu    m1, [mask_ff + r2*4 + 16]
+    %xdefine %%mask mask_ff + 16
 %endif
-    pand      m4, m1
+%if cpuflag(avx)
+    andps     m4, [%%mask + r2*4]
+%else
+    movups    m0, [%%mask + r2*4]
+    andps     m4, m0
+%endif
+
 .skip:
     movhlps   m0, m4
     addps     m0, m4
-    pshuflw   m4, m0, q0032
+%if cpuflag(ssse3)
+    movshdup  m4, m0
+%else
+    pshuflw   m4, m0, q0032
+%endif
     addss     m0, m4
 %if ARCH_X86_64 == 0
-    movd     r0m, m0
+    movss    r0m, m0