[x265-commits] [x265] lookahead primitives: fix bug that caused wrong cost esti...

Fri Nov 22 19:18:50 CET 2013

details:   http://hg.videolan.org/x265/rev/28f42f1be951
branches:  
changeset: 5274:28f42f1be951
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Fri Nov 22 15:30:13 2013 +0530
description:
lookahead primitives: fix bug that caused wrong cost estimates in ducks_take_off.
Subject: [x265] no-rdo: implemented topskip algorithm

details:   http://hg.videolan.org/x265/rev/b6323f2be057
branches:  
changeset: 5275:b6323f2be057
user:      Sumalatha Polureddy
date:      Wed Nov 20 10:51:16 2013 +0530
description:
no-rdo: implemented topskip algorithm

This sets the starting depth from which partitioning should happen for a
particular CU. The starting depth for a CU in the present frame is selected
from the minimum depth of the previous frame's colocated CU.

The performance, bitrate increase and PSNR comparison are given below.
CLI: x265.exe input.y4m -o abc.hevc -r recon.y4m --rd 1 --ref 1
a. Early exit OFF and Top Skip OFF
b. Early exit OFF and Top Skip ON

BasketballDrive_1920x1080_50
Timetaken to encode: 681/639s
bitrate: 3650/3657 kb/s
PSNR: 36.7/36.703
perf improvement: 6.16% (compared to TopSkip OFF and ON)
Bitrate increase: 0.19% (compared to TopSkip OFF and ON)

Cactus_1920x1080_50
Timetaken to encode: 530/492s
bitrate: 2787/2795 kb/s
PSNR: 35.527/35.529
perf improvement: 7.16% (compared to TopSkip OFF and ON)
Bitrate increase: 0.28% (compared to TopSkip OFF and ON)

Kimono1_1920x1080_24
Timetaken to encode: 278/264s
bitrate: 1243/1246 kb/s
PSNR: 38.16/38.16
perf improvement: 5.03% (compared to TopSkip OFF and ON)
Bitrate increase: 0.24% (compared to TopSkip OFF and ON)

FourPeople_1280x720_60
Timetaken to encode: 173/163s
bitrate: 486/492 kb/s
PSNR: 39.097/39.094
perf improvement: 5.78% (compared to TopSkip OFF and ON)
Bitrate increase: 1.2% (compared to TopSkip OFF and ON)

PartyScene_832x480_50
Timetaken to encode: 134/119s
bitrate: 1652/1661 kb/s
PSNR: 31.374/31.377
perf improvement: 11.16% (compared to TopSkip OFF and ON)
Bitrate increase: 0.544% (compared to TopSkip OFF and ON)

big_buck_bunny_360p24
Timetaken to encode: 1772/1477s
bitrate: 174/175 kb/s
PSNR: 37.798/37.797
perf improvement: 16.6% (compared to TopSkip OFF and ON)
Bitrate increase: 0.5% (compared to TopSkip OFF and ON)
Subject: [x265] Merge

details:   http://hg.videolan.org/x265/rev/cc075ae1098f
branches:  
changeset: 5276:cc075ae1098f
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Fri Nov 22 23:25:19 2013 +0530
description:
Merge
Subject: [x265] crf: bug fix. regulate qp of first frame based on ABR_INIT_QP.

details:   http://hg.videolan.org/x265/rev/e6ec06cf5d3d
branches:  
changeset: 5277:e6ec06cf5d3d
user:      Aarthi Thirumalai
date:      Fri Nov 22 19:29:39 2013 +0530
description:
crf: bug fix. regulate qp of first frame based on ABR_INIT_QP.
Subject: [x265] asm: remove assignments to square block sa8d[] methods

details:   http://hg.videolan.org/x265/rev/e28d9b6b5d65
branches:  
changeset: 5278:e28d9b6b5d65
user:      Steve Borho <steve at borho.org>
date:      Fri Nov 22 12:02:55 2013 -0600
description:
asm: remove assignments to square block sa8d[] methods

These are handled specially in x265_setup_primitives()

diffstat:

 source/Lib/TLibCommon/TComTrQuant.cpp |   15 +-
 source/Lib/TLibCommon/TComYuv.cpp     |   43 +--
 source/Lib/TLibCommon/TComYuv.h       |    4 +-
 source/Lib/TLibEncoder/TEncCu.cpp     |    2 +-
 source/Lib/TLibEncoder/TEncCu.h       |    2 +-
 source/common/dct.cpp                 |   70 ++--
 source/common/primitives.h            |    7 +-
 source/common/vec/dct-sse41.cpp       |  170 ++++++------
 source/common/vec/pixel-sse41.cpp     |  220 ----------------
 source/common/x86/asm-primitives.cpp  |   36 +-
 source/common/x86/pixel-a.asm         |  453 +++++++++++++++++++++++++++++++++-
 source/common/x86/pixel.h             |    7 +
 source/common/x86/pixeladd8.asm       |  249 ++++++++++++++++++
 source/encoder/compress.cpp           |  314 +++++++++++++---------
 source/encoder/ratecontrol.cpp        |    6 +-
 source/encoder/slicetype.cpp          |    4 +-
 source/test/mbdstharness.cpp          |   82 ++++-
 source/test/mbdstharness.h            |    3 +-
 source/test/pixelharness.cpp          |    4 +-
 19 files changed, 1128 insertions(+), 563 deletions(-)

diffs (truncated from 2172 to 300 lines):

diff -r 5009254d3d3a -r e28d9b6b5d65 source/Lib/TLibCommon/TComTrQuant.cpp

--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Nov 22 12:02:55 2013 -0600
@@ -409,8 +409,21 @@ void TComTrQuant::invtransformNxN(bool t
     int rem = m_qpParam.m_rem;
     bool useScalingList = getUseScalingList();
     uint32_t log2TrSize = g_convertToBit[width] + 2;
+    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
+    int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
     int32_t *dequantCoef = getDequantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
-    primitives.dequant(coeff, m_tmpCoeff, width, height, per, rem, useScalingList, log2TrSize, dequantCoef);
+
+    if (!useScalingList)
+    {
+        static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
+        int scale = invQuantScales[rem] << per;
+        primitives.dequant_normal(coeff, m_tmpCoeff, width * height, scale, shift);
+    }
+    else
+    {
+        // CHECK_ME: the code is not verify since this is DEAD path
+        primitives.dequant_scaling(coeff, dequantCoef, m_tmpCoeff, width * height, per, shift);
+    }
 
     if (useTransformSkip == true)
     {
diff -r 5009254d3d3a -r e28d9b6b5d65 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Fri Nov 22 12:02:55 2013 -0600
@@ -395,14 +395,14 @@ void TComYuv::copyPartToPartChroma(TShor
 
 void TComYuv::addClip(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize)
 {
-    addClipLuma(srcYuv0, srcYuv1, trUnitIdx, partSize);
-    addClipChroma(srcYuv0, srcYuv1, trUnitIdx, partSize >> m_hChromaShift);
+    int part = partitionFromSizes(partSize, partSize);
+
+    addClipLuma(srcYuv0, srcYuv1, trUnitIdx, partSize, part);
+    addClipChroma(srcYuv0, srcYuv1, trUnitIdx, partSize >> m_hChromaShift, part);
 }
 
-void TComYuv::addClipLuma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize)
+void TComYuv::addClipLuma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize, uint32_t part)
 {
-    int x, y;
-
     Pel* src0 = srcYuv0->getLumaAddr(trUnitIdx, partSize);
     int16_t* src1 = srcYuv1->getLumaAddr(trUnitIdx, partSize);
     Pel* dst = getLumaAddr(trUnitIdx, partSize);
@@ -411,23 +411,11 @@ void TComYuv::addClipLuma(TComYuv* srcYu
     uint32_t src1Stride = srcYuv1->m_width;
     uint32_t dststride  = getStride();
 
-    for (y = partSize - 1; y >= 0; y--)
-    {
-        for (x = partSize - 1; x >= 0; x--)
-        {
-            dst[x] = ClipY(static_cast<int16_t>(src0[x]) + src1[x]);
-        }
-
-        src0 += src0Stride;
-        src1 += src1Stride;
-        dst  += dststride;
-    }
+    primitives.luma_add_ps[part](dst, dststride, src0, src1, src0Stride, src1Stride);
 }
 
-void TComYuv::addClipChroma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize)
+void TComYuv::addClipChroma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize, uint32_t part)
 {
-    int x, y;
-
     Pel* srcU0 = srcYuv0->getCbAddr(trUnitIdx, partSize);
     int16_t* srcU1 = srcYuv1->getCbAddr(trUnitIdx, partSize);
     Pel* srcV0 = srcYuv0->getCrAddr(trUnitIdx, partSize);
@@ -439,21 +427,8 @@ void TComYuv::addClipChroma(TComYuv* src
     uint32_t src1Stride = srcYuv1->m_cwidth;
     uint32_t dststride  = getCStride();
 
-    for (y = partSize - 1; y >= 0; y--)
-    {
-        for (x = partSize - 1; x >= 0; x--)
-        {
-            dstU[x] = ClipC(static_cast<int16_t>(srcU0[x]) + srcU1[x]);
-            dstV[x] = ClipC(static_cast<int16_t>(srcV0[x]) + srcV1[x]);
-        }
-
-        srcU0 += src0Stride;
-        srcU1 += src1Stride;
-        srcV0 += src0Stride;
-        srcV1 += src1Stride;
-        dstU  += dststride;
-        dstV  += dststride;
-    }
+   primitives.chroma[m_csp].add_ps[part](dstU, dststride, srcU0, srcU1, src0Stride, src1Stride);
+   primitives.chroma[m_csp].add_ps[part](dstV, dststride, srcV0, srcV1, src0Stride, src1Stride);
 }
 
 void TComYuv::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t trUnitIdx, uint32_t partSize)
diff -r 5009254d3d3a -r e28d9b6b5d65 source/Lib/TLibCommon/TComYuv.h
--- a/source/Lib/TLibCommon/TComYuv.h	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/Lib/TLibCommon/TComYuv.h	Fri Nov 22 12:02:55 2013 -0600
@@ -153,8 +153,8 @@ public:
 
     //  Clip(srcYuv0 + srcYuv1) -> m_apiBuf
     void    addClip(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize);
-    void    addClipLuma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize);
-    void    addClipChroma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize);
+    void    addClipLuma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize, uint32_t part);
+    void    addClipChroma(TComYuv* srcYuv0, TShortYUV* srcYuv1, uint32_t trUnitIdx, uint32_t partSize, uint32_t part);
 
     //  srcYuv0 - srcYuv1 -> m_apiBuf
     void    subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t trUnitIdx, uint32_t partSize);
diff -r 5009254d3d3a -r e28d9b6b5d65 source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Fri Nov 22 12:02:55 2013 -0600
@@ -367,7 +367,7 @@ void TEncCu::compressCU(TComDataCU* cu)
 
             /* At the start of analysis, the best CU is a null pointer
             On return, it points to the CU encode with best chosen mode*/
-            xCompressInterCU(outBestCU, m_tempCU[0], cu, 0, 0);
+            xCompressInterCU(outBestCU, m_tempCU[0], cu, 0, 0, 4);
         }
         else
             xCompressCU(m_bestCU[0], m_tempCU[0], 0);
diff -r 5009254d3d3a -r e28d9b6b5d65 source/Lib/TLibEncoder/TEncCu.h
--- a/source/Lib/TLibEncoder/TEncCu.h	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/Lib/TLibEncoder/TEncCu.h	Fri Nov 22 12:02:55 2013 -0600
@@ -161,7 +161,7 @@ protected:
     void finishCU(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth);
     void xCompressCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, PartSize parentSize = SIZE_NONE);
     void xCompressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth);
-    void xCompressInterCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComDataCU*& cu, uint32_t depth, uint32_t partitionIndex);
+    void xCompressInterCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComDataCU*& cu, uint32_t depth, uint32_t partitionIndex, UChar minDepth);
     void xEncodeCU(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth);
     int  xComputeQP(TComDataCU* cu);
     void xCheckBestMode(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth);
diff -r 5009254d3d3a -r e28d9b6b5d65 source/common/dct.cpp
--- a/source/common/dct.cpp	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/common/dct.cpp	Fri Nov 22 12:02:55 2013 -0600
@@ -718,57 +718,52 @@ void idct32_c(int32_t *src, int16_t *dst
     }
 }
 
-void dequant_c(const int32_t* quantCoef, int32_t* coef, int width, int height, int per, int rem, bool useScalingList, unsigned int log2TrSize, int32_t *dequantCoef)
+void dequant_normal_c(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 {
-    int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
-
-    if (width > 32)
-    {
-        width  = 32;
-        height = 32;
-    }
+    static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
+    assert(num <= 32 * 32);
 
     int add, coeffQ;
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
-    int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
 
     int clipQCoef;
 
-    if (useScalingList)
+    add = 1 << (shift - 1);
+
+    for (int n = 0; n < num; n++)
     {
-        shift += 4;
+        clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
+        coeffQ = (clipQCoef * scale + add) >> shift;
+        coef[n] = Clip3(-32768, 32767, coeffQ);
+    }
+}
 
-        if (shift > per)
+void dequant_scaling_c(const int32_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+{
+    assert(num <= 32 * 32);
+
+    int add, coeffQ;
+    int clipQCoef;
+
+    shift += 4;
+
+    if (shift > per)
+    {
+        add = 1 << (shift - per - 1);
+
+        for (int n = 0; n < num; n++)
         {
-            add = 1 << (shift - per - 1);
-
-            for (int n = 0; n < width * height; n++)
-            {
-                clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
-                coeffQ = ((clipQCoef * dequantCoef[n]) + add) >> (shift - per);
-                coef[n] = Clip3(-32768, 32767, coeffQ);
-            }
-        }
-        else
-        {
-            for (int n = 0; n < width * height; n++)
-            {
-                clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
-                coeffQ   = Clip3(-32768, 32767, clipQCoef * dequantCoef[n]);
-                coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
-            }
+            clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
+            coeffQ = ((clipQCoef * deQuantCoef[n]) + add) >> (shift - per);
+            coef[n] = Clip3(-32768, 32767, coeffQ);
         }
     }
     else
     {
-        add = 1 << (shift - 1);
-        int scale = invQuantScales[rem] << per;
-
-        for (int n = 0; n < width * height; n++)
+        for (int n = 0; n < num; n++)
         {
             clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
-            coeffQ = (clipQCoef * scale + add) >> shift;
-            coef[n] = Clip3(-32768, 32767, coeffQ);
+            coeffQ   = Clip3(-32768, 32767, clipQCoef * deQuantCoef[n]);
+            coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
         }
     }
 }
@@ -804,7 +799,8 @@ namespace x265 {
 
 void Setup_C_DCTPrimitives(EncoderPrimitives& p)
 {
-    p.dequant = dequant_c;
+    p.dequant_scaling = dequant_scaling_c;
+    p.dequant_normal = dequant_normal_c;
     p.quant = quant_c;
     p.dct[DST_4x4] = dst4_c;
     p.dct[DCT_4x4] = dct4_c;
diff -r 5009254d3d3a -r e28d9b6b5d65 source/common/primitives.h
--- a/source/common/primitives.h	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/common/primitives.h	Fri Nov 22 12:02:55 2013 -0600
@@ -178,8 +178,8 @@ typedef void (*calcresidual_t)(pixel *fe
 typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
 typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
-typedef void (*dequant_t)(const int32_t* src, int32_t* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList,
-                          unsigned int trSizeLog2, int32_t *dequantCoef);
+typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
+typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 
 typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
@@ -261,7 +261,8 @@ struct EncoderPrimitives
     dct_t           dct[NUM_DCTS];
     idct_t          idct[NUM_IDCTS];
     quant_t         quant;
-    dequant_t       dequant;
+    dequant_scaling_t dequant_scaling;
+    dequant_normal_t dequant_normal;
 
     calcresidual_t  calcresidual[NUM_SQUARE_BLOCKS];
     calcrecon_t     calcrecon[NUM_SQUARE_BLOCKS];
diff -r 5009254d3d3a -r e28d9b6b5d65 source/common/vec/dct-sse41.cpp
--- a/source/common/vec/dct-sse41.cpp	Fri Nov 22 00:17:46 2013 -0600
+++ b/source/common/vec/dct-sse41.cpp	Fri Nov 22 12:02:55 2013 -0600
@@ -40,114 +40,103 @@
 using namespace x265;
 
 namespace {
-void dequant(const int32_t* quantCoef, int32_t* coef, int width, int height, int per, int rem, bool useScalingList, unsigned int log2TrSize, int32_t *deQuantCoef)
+// TODO: normal and 8bpp dequant have only 16-bits dynamic rang, we can reduce 32-bits multiplication later
+void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 {
-    int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
+    int valueToAdd = 1 << (shift - 1);
+    __m128i vScale = _mm_set1_epi32(scale);
+    __m128i vAdd = _mm_set1_epi32(valueToAdd);
 
-    if (width > 32)
+    for (int n = 0; n < num; n = n + 8)
     {
-        width  = 32;
-        height = 32;
+        __m128i quantCoef1, quantCoef2, quantCoef12, sign;
+
+        quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+        quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
+
+        quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+        sign = _mm_srai_epi16(quantCoef12, 15);
+        quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+        quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+
+        quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, vScale), vAdd), _mm_cvtsi32_si128(shift));
+        quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, vScale), vAdd), _mm_cvtsi32_si128(shift));
+
+        quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+        sign = _mm_srai_epi16(quantCoef12, 15);
+        quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);