[x265-commits] [x265] primitives: fix build error in refactor of chroma p2s primitive

Fri Dec 5 01:46:42 CET 2014

details:   http://hg.videolan.org/x265/rev/511dde5ac1de
branches:  
changeset: 8942:511dde5ac1de
user:      Aarthi Thirumalai
date:      Thu Dec 04 12:43:06 2014 +0530
description:
primitives: fix build error in refactor of chroma p2s primitive.
Subject: [x265] noiseReduction: allow separate strengths to be specified for intra and inter CUs

details:   http://hg.videolan.org/x265/rev/ec06f5878e8b
branches:  
changeset: 8943:ec06f5878e8b
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Thu Dec 04 10:19:38 2014 +0530
description:
noiseReduction: allow separate strengths to be specified for intra and inter CUs
Subject: [x265] analysis: add chroma distortion to rdLevels 3 and 4

details:   http://hg.videolan.org/x265/rev/1d2a11f6a33f
branches:  
changeset: 8944:1d2a11f6a33f
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Thu Dec 04 16:00:12 2014 +0530
description:
analysis: add chroma distortion to rdLevels 3 and 4

At these rdLevels, inter/bidir and merge candidate decisions were being taken
based on luma sa8dCost only. Including chroma distortion in these decisions
will increase bitrate and lower SSIM slightly, but with better subjective quality.

Also fixed some naming nits.
Subject: [x265] analysis: cache m_bChromaSa8d and reduce redundant work

details:   http://hg.videolan.org/x265/rev/cc327e846dac
branches:  
changeset: 8945:cc327e846dac
user:      Steve Borho <steve at borho.org>
date:      Thu Dec 04 10:50:02 2014 -0600
description:
analysis: cache m_bChromaSa8d and reduce redundant work

Renamed some 'part' variables to 'puIdx' to avoid variable shadow warnings and
for consistency with search.cpp
Subject: [x265] asm: chroma_vpp[4x4] for colorspace i420 in avx2: improve 228c->184c

details:   http://hg.videolan.org/x265/rev/23e637065aec
branches:  
changeset: 8946:23e637065aec
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Thu Dec 04 10:57:35 2014 +0530
description:
asm: chroma_vpp[4x4] for colorspace i420 in avx2: improve 228c->184c

diffstat:

 doc/reST/cli.rst                     |   2 +-
 source/CMakeLists.txt                |   2 +-
 source/common/param.cpp              |  15 +++-
 source/common/x86/asm-primitives.cpp |   8 +-
 source/common/x86/ipfilter8.asm      |  77 +++++++++++++++++++++++++++++
 source/common/x86/ipfilter8.h        |   1 +
 source/encoder/analysis.cpp          |  95 ++++++++++++++++++++++++++---------
 source/encoder/analysis.h            |   1 +
 source/encoder/frameencoder.cpp      |   7 +-
 source/encoder/search.cpp            |   2 +-
 source/x265.cpp                      |   6 +-
 source/x265.h                        |  10 ++-
 12 files changed, 182 insertions(+), 44 deletions(-)

diffs (truncated from 505 to 300 lines):

diff -r b1b5f06fe9ce -r 23e637065aec doc/reST/cli.rst

--- a/doc/reST/cli.rst	Wed Dec 03 22:21:46 2014 -0600
+++ b/doc/reST/cli.rst	Thu Dec 04 10:57:35 2014 +0530
@@ -925,7 +925,7 @@ Quality, rate control and rate distortio
 	less bits. This tends to improve detail in the backgrounds of video
 	with less detail in areas of high motion. Default enabled
 
-.. option:: --nr <integer>
+.. option:: --nr-intra <integer>, --nr-inter <integer>
 
 	Noise reduction - an adaptive deadzone applied after DCT
 	(subtracting from DCT coefficients), before quantization.  It does
diff -r b1b5f06fe9ce -r 23e637065aec source/CMakeLists.txt
--- a/source/CMakeLists.txt	Wed Dec 03 22:21:46 2014 -0600
+++ b/source/CMakeLists.txt	Thu Dec 04 10:57:35 2014 +0530
@@ -21,7 +21,7 @@ include(CheckSymbolExists)
 include(CheckCXXCompilerFlag)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 39)
+set(X265_BUILD 40)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r b1b5f06fe9ce -r 23e637065aec source/common/param.cpp
--- a/source/common/param.cpp	Wed Dec 03 22:21:46 2014 -0600
+++ b/source/common/param.cpp	Thu Dec 04 10:57:35 2014 +0530
@@ -766,7 +766,8 @@ int x265_param_parse(x265_param *p, cons
                          &p->vui.defDispWinRightOffset,
                          &p->vui.defDispWinBottomOffset) != 4;
     }
-    OPT("nr") p->noiseReduction = atoi(value);
+    OPT("nr-intra") p->noiseReductionIntra = atoi(value);
+    OPT("nr-inter") p->noiseReductionInter = atoi(value);
     OPT("pass")
     {
         int pass = Clip3(0, 3, atoi(value));
@@ -1078,8 +1079,10 @@ int x265_check_params(x265_param *param)
           "Target bitrate can not be less than zero");
     CHECK(param->rc.qCompress < 0.5 || param->rc.qCompress > 1.0,
           "qCompress must be between 0.5 and 1.0");
-    if (param->noiseReduction)
-        CHECK(0 > param->noiseReduction || param->noiseReduction > 2000, "Valid noise reduction range 0 - 2000");
+    if (param->noiseReductionIntra)
+        CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
+    if (param->noiseReductionInter)
+        CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
     CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead,
           "Constant rate-factor is incompatible with 2pass");
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
@@ -1201,8 +1204,10 @@ void x265_print_params(x265_param *param
         fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq);
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
     TOOLOPT(param->bEnableCbfFastMode, "fast-cbf");
-    if (param->noiseReduction)
-        fprintf(stderr, "nr=%d ", param->noiseReduction);
+    if (param->noiseReductionIntra)
+        fprintf(stderr, "nr-intra=%d ", param->noiseReductionIntra);
+    if (param->noiseReductionInter)
+        fprintf(stderr, "nr-inter=%d ", param->noiseReductionInter);
     if (param->bEnableLoopFilter)
     {
         if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset)
diff -r b1b5f06fe9ce -r 23e637065aec source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 03 22:21:46 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 04 10:57:35 2014 +0530
@@ -1365,9 +1365,9 @@ void Setup_Assembly_Primitives(EncoderPr
         CHROMA_VERT_FILTERS_422(_sse2);
         CHROMA_VERT_FILTERS_444(_sse2);
         p.luma_p2s = x265_luma_p2s_sse2;
-        p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
-        p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
-        p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
+        p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
+        p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
+        p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
 
         p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
         p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
@@ -1879,6 +1879,8 @@ void Setup_Assembly_Primitives(EncoderPr
         p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
         p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
         p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
+
+        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r b1b5f06fe9ce -r 23e637065aec source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Dec 03 22:21:46 2014 -0600
+++ b/source/common/x86/ipfilter8.asm	Thu Dec 04 10:57:35 2014 +0530
@@ -32,6 +32,13 @@ tab_Tm:    db 0, 1, 2, 3, 1, 2, 3, 4, 2,
            db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
 
 ALIGN 32
+const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
+
+ALIGN 32
+const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
+                         dd 2, 3, 3, 4, 4, 5, 5, 6
+
+ALIGN 32
 tab_Lm:    db 0, 1, 2, 3, 4,  5,  6,  7,  1, 2, 3, 4,  5,  6,  7,  8
            db 2, 3, 4, 5, 6,  7,  8,  9,  3, 4, 5, 6,  7,  8,  9,  10
            db 4, 5, 6, 7, 8,  9,  10, 11, 5, 6, 7, 8,  9,  10, 11, 12
@@ -143,6 +150,31 @@ tab_LumaCoeffVer_32: times 16 db 0, 0
                      times 16 db 58, -10
                      times 16 db 4, -1
 
+ALIGN 32
+tab_ChromaCoeffVer_32: times 16 db 0, 64
+                       times 16 db 0, 0
+
+                       times 16 db -2, 58
+                       times 16 db 10, -2
+
+                       times 16 db -4, 54
+                       times 16 db 16, -2
+
+                       times 16 db -6, 46
+                       times 16 db 28, -4
+
+                       times 16 db -4, 36
+                       times 16 db 36, -4
+
+                       times 16 db -4, 28
+                       times 16 db 46, -6
+
+                       times 16 db -2, 16
+                       times 16 db 54, -4
+
+                       times 16 db -2, 10
+                       times 16 db 58, -2
+
 tab_c_64_n64:   times 8 db 64, -64
 
 const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
@@ -1901,6 +1933,51 @@ pextrd      [r2 + r3], m2, 3
 
 RET
 
+INIT_YMM avx2
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
+    mov             r4d, r4m
+    shl             r4d, 6
+    sub             r0, r1
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+
+    movd            xm1, [r0]
+    pinsrd          xm1, [r0 + r1], 1
+    pinsrd          xm1, [r0 + r1 * 2], 2
+    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm2, [r0]
+    pinsrd          xm2, [r0 + r1], 1
+    pinsrd          xm2, [r0 + r1 * 2], 2                   ; m2 = row[x 6 5 4]
+    vinserti128     m1, m1, xm2, 1                          ; m1 = row[x 6 5 4 3 2 1 0]
+    mova            m2, [interp4_vpp_shuf1]
+    vpermd          m0, m2, m1                              ; m0 = row[4 3 3 2 2 1 1 0]
+    mova            m2, [interp4_vpp_shuf1 + mmsize]
+    vpermd          m1, m2, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
+
+    mova            m2, [interp4_vpp_shuf]
+    pshufb          m0, m0, m2
+    pshufb          m1, m1, m2
+    pmaddubsw       m0, [r5]
+    pmaddubsw       m1, [r5 + mmsize]
+    paddw           m0, m1                                  ; m0 = WORD ROW[3 2 1 0]
+    pmulhrsw        m0, [pw_512]
+    vextracti128    xm1, m0, 1
+    packuswb        xm0, xm1
+    lea             r5, [r3 * 3]
+    movd            [r2], xm0
+    pextrd          [r2 + r3], xm0, 1
+    pextrd          [r2 + r3 * 2], xm0, 2
+    pextrd          [r2 + r5], xm0, 3
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
diff -r b1b5f06fe9ce -r 23e637065aec source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Wed Dec 03 22:21:46 2014 -0600
+++ b/source/common/x86/ipfilter8.h	Thu Dec 04 10:57:35 2014 +0530
@@ -573,6 +573,7 @@ CHROMA_HORIZ_FILTERS_444(_sse4);
     SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu);
 
 CHROMA_FILTERS(_sse4);
+CHROMA_FILTERS(_avx2);
 CHROMA_SP_FILTERS(_sse2);
 CHROMA_SP_FILTERS_SSE4(_sse4);
 CHROMA_SS_FILTERS(_sse2);
diff -r b1b5f06fe9ce -r 23e637065aec source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Wed Dec 03 22:21:46 2014 -0600
+++ b/source/encoder/analysis.cpp	Thu Dec 04 10:57:35 2014 +0530
@@ -79,6 +79,7 @@ bool Analysis::create(ThreadLocalData *t
 {
     m_tld = tld;
     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
+    m_bChromaSa8d = m_param->rdLevel >= 3;
 
     int csp = m_param->internalCsp;
     uint32_t cuSize = g_maxCUSize;
@@ -593,10 +594,13 @@ void Analysis::compressInterCU_dist(cons
             if (m_param->rdLevel > 2)
             {
                 /* RD selection between merge, inter, bidir and intra */
-                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                if (!m_bChromaSa8d)
                 {
-                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                    motionCompensation(bestInter->predYuv, false, true);
+                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    {
+                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->predYuv, false, true);
+                    }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                 checkBestMode(*bestInter, depth);
@@ -841,10 +845,13 @@ void Analysis::compressInterCU_rd0_4(con
             if (m_param->rdLevel >= 3)
             {
                 /* Calculate RD cost of best inter option */
-                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                if (!m_bChromaSa8d)
                 {
-                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                    motionCompensation(bestInter->predYuv, false, true);
+                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    {
+                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->predYuv, false, true);
+                    }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                 checkBestMode(*bestInter, depth);
@@ -1227,7 +1234,12 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
 
     bestPred->sa8dCost = MAX_INT64;
     int bestSadCand = -1;
-    int sizeIdx = cuGeom.log2CUSize - 2;
+    int cpart, sizeIdx = cuGeom.log2CUSize - 2;
+    if (m_bChromaSa8d)
+    {
+        int cuSize = 1 << cuGeom.log2CUSize;
+        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+    }
     for (uint32_t i = 0; i < maxNumMergeCand; ++i)
     {
         if (m_bFrameParallel &&
@@ -1242,12 +1254,16 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
         tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
         tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
 
-        // do MC only for Luma part
         prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, false);
+        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
 
         tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
         tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
+        if (m_bChromaSa8d)
+        {
+            tempPred->distortion += primitives.sa8d[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
+            tempPred->distortion += primitives.sa8d[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
+        }
         tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
 
         if (tempPred->sa8dCost < bestPred->sa8dCost)
@@ -1262,8 +1278,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
         return;
 
     /* calculate the motion compensation for chroma for the best mode selected */
-    prepMotionCompensation(bestPred->cu, cuGeom, 0);
-    motionCompensation(bestPred->predYuv, false, true);
+    if (!m_bChromaSa8d)
+    {
+        prepMotionCompensation(bestPred->cu, cuGeom, 0);
+        motionCompensation(bestPred->predYuv, false, true);
+    }
 
     if (m_param->rdLevel)
     {
@@ -1428,19 +1447,27 @@ void Analysis::checkInter_rd0_4(Mode& in
             }
         }
     }
-    if (predInterSearch(interMode, cuGeom, false, false))
+    if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
     {
         /* predInterSearch sets interMode.sa8dBits */