[x265-commits] [x265] tune: ensure lookahead is disabled for zero-latency

Sat Mar 1 06:35:33 CET 2014

details:   http://hg.videolan.org/x265/rev/61c752e11424
branches:  
changeset: 6329:61c752e11424
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 27 20:32:18 2014 -0600
description:
tune: ensure lookahead is disabled for zero-latency
Subject: [x265] asm: enable count_nonzero for HIGH_BIT_DEPTH

details:   http://hg.videolan.org/x265/rev/df831b319c08
branches:  
changeset: 6330:df831b319c08
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Fri Feb 28 10:40:17 2014 +0900
description:
asm: enable count_nonzero for HIGH_BIT_DEPTH
Subject: [x265] square transform only

details:   http://hg.videolan.org/x265/rev/9b43c262124e
branches:  
changeset: 6331:9b43c262124e
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Fri Feb 28 11:12:57 2014 +0900
description:
square transform only
Subject: [x265] api: make log-level 4 semi-official and expose in public API

details:   http://hg.videolan.org/x265/rev/994f046a8111
branches:  stable
changeset: 6332:994f046a8111
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 27 21:40:52 2014 -0600
description:
api: make log-level 4 semi-official and expose in public API
Subject: [x265] tcompicyuv: initialize NULL to cu and bu offset buffers.

details:   http://hg.videolan.org/x265/rev/f6d079ad85bc
branches:  stable
changeset: 6333:f6d079ad85bc
user:      Gopu Govindaswamy
date:      Fri Feb 28 13:00:01 2014 -0800
description:
tcompicyuv: initialize NULL to cu and bu offset buffers.
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/5e9559d366b3
branches:  
changeset: 6334:5e9559d366b3
user:      Steve Borho <steve at borho.org>
date:      Fri Feb 28 01:41:43 2014 -0600
description:
Merge with stable
Subject: [x265] weightp: use struct to cache data for reuse, refactor MC of reference planes

details:   http://hg.videolan.org/x265/rev/518313140b03
branches:  
changeset: 6335:518313140b03
user:      Kavitha Sampath <kavitha at multicorewareinc.com>
date:      Fri Feb 28 12:28:22 2014 +0530
description:
weightp: use struct to cache data for reuse, refactor MC of reference planes

* do not consider intra/mv cost during MC phase
* unconditionally motion-compensate luma and chroma blocks
* include slice header cost estimate in weight analysis
* weightCost() needed different paths for luma, chroma, and chroma444
* pass a single stride to weightCost()
Subject: [x265] asm: split SAO_EO_0 into separate primitive func

details:   http://hg.videolan.org/x265/rev/000f86d72337
branches:  
changeset: 6336:000f86d72337
user:      Praveen Tiwari
date:      Fri Feb 28 12:17:17 2014 +0530
description:
asm: split SAO_EO_0 into separate primitive func

added assembly code and testbench support
added loopfilter.cpp, loopfilter.h, loopfilter.asm files

diffstat:

 source/Lib/TLibCommon/TComPicYuv.cpp               |    5 +
 source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp |   66 +-
 source/Lib/TLibCommon/TComSampleAdaptiveOffset.h   |    4 +-
 source/Lib/TLibCommon/TComTrQuant.cpp              |   99 +-
 source/Lib/TLibCommon/TComTrQuant.h                |   12 +-
 source/Lib/TLibEncoder/TEncSearch.cpp              |   52 +-
 source/common/CMakeLists.txt                       |   11 +-
 source/common/bitstream.h                          |   70 ++
 source/common/common.cpp                           |    3 +
 source/common/loopfilter.cpp                       |   52 +
 source/common/param.cpp                            |    6 +-
 source/common/primitives.cpp                       |    4 +-
 source/common/primitives.h                         |    5 +-
 source/common/x86/asm-primitives.cpp               |    4 +
 source/common/x86/loopfilter.asm                   |   85 +++
 source/common/x86/loopfilter.h                     |   29 +
 source/encoder/encoder.cpp                         |    2 +-
 source/encoder/weightPrediction.cpp                |  578 +++++++++++---------
 source/test/pixelharness.cpp                       |   52 +-
 source/test/pixelharness.h                         |    3 +-
 source/x265.h                                      |    3 +-
 21 files changed, 765 insertions(+), 380 deletions(-)

diffs (truncated from 1998 to 300 lines):

diff -r 8189f9e9a39f -r 000f86d72337 source/Lib/TLibCommon/TComPicYuv.cpp

--- a/source/Lib/TLibCommon/TComPicYuv.cpp	Thu Feb 27 19:05:54 2014 -0600
+++ b/source/Lib/TLibCommon/TComPicYuv.cpp	Fri Feb 28 12:17:17 2014 +0530
@@ -57,6 +57,11 @@ TComPicYuv::TComPicYuv()
     m_picOrgY = NULL;  // m_apiPicBufY + m_iMarginLuma*getStride() + m_iMarginLuma
     m_picOrgU = NULL;
     m_picOrgV = NULL;
+
+    m_cuOffsetY = NULL;
+    m_cuOffsetC = NULL;
+    m_buOffsetY = NULL;
+    m_buOffsetC = NULL;
 }
 
 TComPicYuv::~TComPicYuv()
diff -r 8189f9e9a39f -r 000f86d72337 source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Thu Feb 27 19:05:54 2014 -0600
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Fri Feb 28 12:17:17 2014 +0530
@@ -44,7 +44,6 @@
 namespace x265 {
 //! \ingroup TLibCommon
 //! \{
-
 SAOParam::~SAOParam()
 {
     for (int i = 0; i < 3; i++)
@@ -535,12 +534,10 @@ void TComSampleAdaptiveOffset::processSa
     uint32_t tpely     = tmpCu->getCUPelY();
     uint32_t rpelx;
     uint32_t bpely;
-    int  signLeft;
-    int  signRight;
+    int  edgeType;
     int  signDown;
     int  signDown1;
     int  signDown2;
-    uint32_t edgeType;
     int picWidthTmp;
     int picHeightTmp;
     int startX;
@@ -614,23 +611,56 @@ void TComSampleAdaptiveOffset::processSa
     {
     case SAO_EO_0: // dir: -
     {
-        startX = (lpelx == 0) ? 1 : 0;
-        endX   = (rpelx == picWidthTmp) ? lcuWidth - 1 : lcuWidth;
-        for (y = 0; y < lcuHeight; y++)
-        {
-            signLeft = xSign(rec[startX] - tmpL[y]);
-            for (x = startX; x < endX; x++)
-            {
-                signRight =  xSign(rec[x] - rec[x + 1]);
-                edgeType =  signRight + signLeft + 2;
-                signLeft  = -signRight;
+      pixel firstPxl = 0, lastPxl = 0;
+      startX = (lpelx == 0) ? 1 : 0;
+      endX   = (rpelx == picWidthTmp) ? lcuWidth-1 : lcuWidth;
+      if (lcuWidth % 16)
+      {
+          int8_t signRight;
+          for (y = 0; y < lcuHeight; y++)
+          {
+              int8_t signLeft = xSign(rec[startX] - tmpL[y]);
+              for (x = startX; x < endX; x++)
+              {
+                  signRight = xSign(rec[x] - rec[x+1]);
+                  edgeType = signRight + signLeft + 2;
+                  signLeft  = -signRight;
 
-                rec[x] = clipTbl[rec[x] + m_offsetEo[edgeType]];
-            }
+                  rec[x] =  Clip3(0, (1 << X265_DEPTH) - 1, rec[x] + m_offsetEo[edgeType]);
+              }
+              rec += stride;
+          }
+      }
+      else
+      {
+          for (y = 0; y < lcuHeight; y++)
+          {
+              int8_t signLeft = xSign(rec[startX] - tmpL[y]);
 
-            rec += stride;
-        }
+              if (lpelx == 0)
+              {
+                  firstPxl = rec[0];
+              }
 
+              if (rpelx == picWidthTmp)
+              {
+                  lastPxl = rec[lcuWidth - 1];
+              }
+
+              primitives.saoCuOrgE0(rec, m_offsetEo, lcuWidth, signLeft);
+
+              if (lpelx == 0)
+              {
+                  rec[0] = firstPxl;
+              }
+
+              if (rpelx == picWidthTmp)
+              {
+                  rec[lcuWidth - 1] = lastPxl;
+              }
+              rec += stride;
+          }
+      }
         break;
     }
     case SAO_EO_1: // dir: |
diff -r 8189f9e9a39f -r 000f86d72337 source/Lib/TLibCommon/TComSampleAdaptiveOffset.h
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h	Thu Feb 27 19:05:54 2014 -0600
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h	Fri Feb 28 12:17:17 2014 +0530
@@ -143,11 +143,9 @@ protected:
     static const int m_numCulPartsLevel[5];
     static const uint32_t m_eoTable[9];
     static const int m_numClass[MAX_NUM_SAO_TYPE];
-
     int32_t *m_offsetBo;
     int32_t *m_chromaOffsetBo;
-    int m_offsetEo[LUMA_GROUP_NUM];
-
+    int8_t m_offsetEo[LUMA_GROUP_NUM];
     int  m_picWidth;
     int  m_picHeight;
     uint32_t m_maxSplitLevel;
diff -r 8189f9e9a39f -r 000f86d72337 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Thu Feb 27 19:05:54 2014 -0600
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Feb 28 12:17:17 2014 +0530
@@ -131,14 +131,13 @@ void TComTrQuant::setQPforQuant(int qpy,
 // To minimize the distortion only. No rate is considered.
 void TComTrQuant::signBitHidingHDQ(TCoeff* qCoef, TCoeff* coef, int32_t* deltaU, const TUEntropyCodingParameters &codingParameters)
 {
-    const uint32_t width     = codingParameters.widthInGroups << MLS_CG_LOG2_WIDTH;
-    const uint32_t height    = codingParameters.heightInGroups << MLS_CG_LOG2_HEIGHT;
+    const uint32_t trSize = codingParameters.widthInGroups << MLS_CG_LOG2_WIDTH;
 
     int lastCG = -1;
     int absSum = 0;
     int n;
 
-    for (int subSet = (width * height - 1) >> LOG2_SCAN_SET_SIZE; subSet >= 0; subSet--)
+    for (int subSet = (trSize * trSize - 1) >> LOG2_SCAN_SET_SIZE; subSet >= 0; subSet--)
     {
         int  subPos = subSet << LOG2_SCAN_SET_SIZE;
         int  firstNZPosInCG = SCAN_SET_SIZE, lastNZPosInCG = -1;
@@ -255,29 +254,27 @@ void TComTrQuant::signBitHidingHDQ(TCoef
     } // TU loop
 }
 
-uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, TCoeff* qCoef, int width, int height,
+uint32_t TComTrQuant::xQuant(TComDataCU* cu, int32_t* coef, TCoeff* qCoef, int trSize,
                              TextType ttype, uint32_t absPartIdx, int32_t *lastPos, bool curUseRDOQ)
 {
     uint32_t acSum = 0;
     int add = 0;
     bool useRDOQ = (cu->getTransformSkip(absPartIdx, ttype) ? m_useRDOQTS : m_useRDOQ) && curUseRDOQ;
 
-    assert(width == height);
-
 #if _MSC_VER
 #pragma warning(disable: 4127) // conditional expression is constant
 #endif
     if (useRDOQ && (ttype == TEXT_LUMA || RDOQ_CHROMA))
     {
-        acSum = xRateDistOptQuant(cu, coef, qCoef, width, height, ttype, absPartIdx, lastPos);
+        acSum = xRateDistOptQuant(cu, coef, qCoef, trSize, ttype, absPartIdx, lastPos);
     }
     else
     {
         TUEntropyCodingParameters codingParameters;
-        getTUEntropyCodingParameters(cu, codingParameters, absPartIdx,  width, height, ttype);
+        getTUEntropyCodingParameters(cu, codingParameters, absPartIdx,  trSize, trSize, ttype);
         int deltaU[32 * 32];
 
-        uint32_t log2TrSize = g_convertToBit[width] + 2;
+        const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
         int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
         assert(scalingListType < 6);
         int32_t *quantCoeff = 0;
@@ -288,7 +285,7 @@ uint32_t TComTrQuant::xQuant(TComDataCU*
         int qbits = QUANT_SHIFT + m_qpParam.m_per + transformShift;
         add = (cu->getSlice()->getSliceType() == I_SLICE ? 171 : 85) << (qbits - 9);
 
-        int numCoeff = width * height;
+        int numCoeff = trSize * trSize;
         acSum += primitives.quant(coef, quantCoeff, deltaU, qCoef, qbits, add, numCoeff, lastPos);
 
         if (cu->getSlice()->getPPS()->getSignHideFlag() && acSum >= 2)
@@ -311,8 +308,7 @@ uint32_t TComTrQuant::transformNxN(TComD
                                    int16_t*    residual,
                                    uint32_t    stride,
                                    TCoeff*     coeff,
-                                   uint32_t    width,
-                                   uint32_t    height,
+                                   uint32_t    trSize,
                                    TextType    ttype,
                                    uint32_t    absPartIdx,
                                    int32_t*    lastPos,
@@ -322,11 +318,11 @@ uint32_t TComTrQuant::transformNxN(TComD
     if (cu->getCUTransquantBypass(absPartIdx))
     {
         uint32_t absSum = 0;
-        for (uint32_t k = 0; k < height; k++)
+        for (uint32_t k = 0; k < trSize; k++)
         {
-            for (uint32_t j = 0; j < width; j++)
+            for (uint32_t j = 0; j < trSize; j++)
             {
-                coeff[k * width + j] = ((int16_t)residual[k * stride + j]);
+                coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
                 absSum += abs(residual[k * stride + j]);
             }
         }
@@ -344,29 +340,29 @@ uint32_t TComTrQuant::transformNxN(TComD
         mode = REG_DCT;
     }
 
-    assert((cu->getSlice()->getSPS()->getMaxTrSize() >= width));
+    assert((cu->getSlice()->getSPS()->getMaxTrSize() >= trSize));
     if (useTransformSkip)
     {
-        xTransformSkip(residual, stride, m_tmpCoeff, width, height);
+        xTransformSkip(residual, stride, m_tmpCoeff, trSize);
     }
     else
     {
         // TODO: this may need larger data types for X265_DEPTH > 8
-        const uint32_t log2BlockSize = g_convertToBit[width];
-        primitives.dct[DCT_4x4 + log2BlockSize - ((width == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
+        const uint32_t log2BlockSize = g_convertToBit[trSize];
+        primitives.dct[DCT_4x4 + log2BlockSize - ((trSize == 4) && (mode != REG_DCT))](residual, m_tmpCoeff, stride);
     }
-    return xQuant(cu, m_tmpCoeff, coeff, width, height, ttype, absPartIdx, lastPos, curUseRDOQ);
+    return xQuant(cu, m_tmpCoeff, coeff, trSize, ttype, absPartIdx, lastPos, curUseRDOQ);
 }
 
-void TComTrQuant::invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, TCoeff* coeff, uint32_t width, uint32_t height, int scalingListType, bool useTransformSkip, int lastPos)
+void TComTrQuant::invtransformNxN(bool transQuantBypass, uint32_t mode, int16_t* residual, uint32_t stride, TCoeff* coeff, uint32_t trSize, int scalingListType, bool useTransformSkip, int lastPos)
 {
     if (transQuantBypass)
     {
-        for (uint32_t k = 0; k < height; k++)
+        for (uint32_t k = 0; k < trSize; k++)
         {
-            for (uint32_t j = 0; j < width; j++)
+            for (uint32_t j = 0; j < trSize; j++)
             {
-                residual[k * stride + j] = (int16_t)(coeff[k * width + j]);
+                residual[k * stride + j] = (int16_t)(coeff[k * trSize + j]);
             }
         }
 
@@ -377,7 +373,7 @@ void TComTrQuant::invtransformNxN(bool t
     int per = m_qpParam.m_per;
     int rem = m_qpParam.m_rem;
     bool useScalingList = getUseScalingList();
-    uint32_t log2TrSize = g_convertToBit[width] + 2;
+    const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
     int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
     int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
@@ -386,30 +382,30 @@ void TComTrQuant::invtransformNxN(bool t
     {
         static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
         int scale = invQuantScales[rem] << per;
-        primitives.dequant_normal(coeff, m_tmpCoeff, width * height, scale, shift);
+        primitives.dequant_normal(coeff, m_tmpCoeff, trSize * trSize, scale, shift);
     }
     else
     {
         // CHECK_ME: the code is not verify since this is DEAD path
-        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, width * height, per, shift);
+        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, trSize * trSize, per, shift);
     }
 
     if (useTransformSkip == true)
     {
-        xITransformSkip(m_tmpCoeff, residual, stride, width, height);
+        xITransformSkip(m_tmpCoeff, residual, stride, trSize);
     }
     else
     {
         // CHECK_ME: we can't here when no any coeff
         assert(lastPos >= 0);
 
-        const uint32_t log2BlockSize = g_convertToBit[width];
+        const uint32_t log2BlockSize = log2TrSize - 2;
 
 #if HIGH_BIT_DEPTH
         lastPos = !lastPos; // prevent warning
 #else
         // DC only
-        if (lastPos == 0 && !((width == 4) && (mode != REG_DCT)))
+        if (lastPos == 0 && !((trSize == 4) && (mode != REG_DCT)))
         {
             int dc_val = (((m_tmpCoeff[0] * 64 + 64) >> 7) * 64 + 2048) >> 12;
             primitives.blockfill_s[log2BlockSize](residual, stride, dc_val);
@@ -419,7 +415,7 @@ void TComTrQuant::invtransformNxN(bool t