[x265] [PATCH] MV refinement for multipass encoding

Mon Jun 5 12:15:33 CEST 2017

# HG changeset patch
# User Ashok Kumar Mishra <ashok at multicorewareinc.com>
# Date 1496656244 -19800
#      Mon Jun 05 15:20:44 2017 +0530
# Node ID c04d02d71f206431b6b6e60460b81dcc85fc5db5
# Parent  de49a722b256d94c9ba30b5d88459026bea528b8
MV refinement for multipass encoding

diff -r de49a722b256 -r c04d02d71f20 doc/reST/cli.rst

--- a/doc/reST/cli.rst	Wed May 24 20:01:59 2017 +0530
+++ b/doc/reST/cli.rst	Mon Jun 05 15:20:44 2017 +0530
@@ -911,6 +911,12 @@
 	inter modes for blocks of size one smaller than the min-cu-size of the 
 	incoming analysis data from the previous encode. Default disabled.
 
+.. option:: --refine-mv
+	
+	Enables refinement of motion vectors for scaled video by searching the
+	surrounding eight integer and subpel pixel positions for the best vector.
+	Requires analysis load, refine-level 10 and scale-factor. Default disabled.
+
 Options which affect the transform unit quad-tree, sometimes referred to
 as the residual quad-tree (RQT).
 
diff -r de49a722b256 -r c04d02d71f20 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Wed May 24 20:01:59 2017 +0530
+++ b/source/CMakeLists.txt	Mon Jun 05 15:20:44 2017 +0530
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 120)
+set(X265_BUILD 121)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r de49a722b256 -r c04d02d71f20 source/common/param.cpp
--- a/source/common/param.cpp	Wed May 24 20:01:59 2017 +0530
+++ b/source/common/param.cpp	Mon Jun 05 15:20:44 2017 +0530
@@ -280,6 +280,7 @@
     param->scaleFactor = 0;
     param->intraRefine = 0;
     param->interRefine = 0;
+    param->mvRefine = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -963,6 +964,7 @@
         OPT("scale-factor") p->scaleFactor = atoi(value);
         OPT("refine-intra")p->intraRefine = atobool(value);
         OPT("refine-inter")p->interRefine = atobool(value);
+        OPT("refine-mv")p->mvRefine = atobool(value);
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1685,6 +1687,7 @@
     s += sprintf(s, " scale-factor=%d", p->scaleFactor);
     s += sprintf(s, " refine-intra=%d", p->intraRefine);
     s += sprintf(s, " refine-inter=%d", p->interRefine);
+    s += sprintf(s, " refine-mv=%d", p->mvRefine);
     BOOL(p->bLimitSAO, "limit-sao");
     s += sprintf(s, " ctu-info=%d", p->bCTUInfo);
 #undef BOOL
diff -r de49a722b256 -r c04d02d71f20 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Wed May 24 20:01:59 2017 +0530
+++ b/source/encoder/analysis.cpp	Mon Jun 05 15:20:44 2017 +0530
@@ -2267,14 +2267,16 @@
                     int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
                     mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
                     mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
-                    for (int dir = 0; dir < m_slice->isInterB() + 1; dir++)
+                    for (int list = 0; list < m_slice->isInterB() + 1; list++)
                     {
-                        mode.cu.setPUMv(dir, interDataCTU->mv[dir][cuIdx + part], pu.puAbsPartIdx, part);
-                        mode.cu.setPURefIdx(dir, interDataCTU->refIdx[dir][cuIdx + part], pu.puAbsPartIdx, part);
-                        mode.cu.m_mvpIdx[dir][pu.puAbsPartIdx] = interDataCTU->mvpIdx[dir][cuIdx + part];
+                        mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part], pu.puAbsPartIdx, part);
+                        mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
+                        mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
                     }
                     if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
                     {
+                        if (m_param->mvRefine)
+                            m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
                         //AMVP
                         MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
                         mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours);
@@ -2285,6 +2287,12 @@
                                 continue;
                             mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
                             MV mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
+                            if (m_param->mvRefine)
+                            {
+                                MV outmv;
+                                searchMV(mode, pu, list, ref, outmv);
+                                mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
+                            }
                             mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mvp;
                         }
                     }
@@ -2293,7 +2301,6 @@
                         MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
                         uint8_t candDir[MRG_MAX_NUM_CANDS];
                         mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
-                        mode.cu.m_mvpIdx[0][pu.puAbsPartIdx] = interDataCTU->mvpIdx[0][cuIdx + part];
                         uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
                         mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
                         mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
diff -r de49a722b256 -r c04d02d71f20 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Wed May 24 20:01:59 2017 +0530
+++ b/source/encoder/encoder.cpp	Mon Jun 05 15:20:44 2017 +0530
@@ -2310,6 +2310,15 @@
         x265_log(p, X265_LOG_WARNING, "Inter refinement does not support limitTU. Disabling limitTU.\n");
         p->limitTU = 0;
     }
+	
+    /* MV refinement is only kept when analysis load mode, a refine-level of at
+     * least 10 and a non-zero scale factor are all in effect. */
+    if (p->mvRefine &&
+        (p->analysisMode != X265_ANALYSIS_LOAD || p->analysisRefineLevel < 10 || !p->scaleFactor))
+    {
+        x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, refine-level 10, scale factor. Disabling MV refinement.\n");
+        p->mvRefine = 0;
+    }
 
     if ((p->analysisMultiPassRefine || p->analysisMultiPassDistortion) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation))
     {
diff -r de49a722b256 -r c04d02d71f20 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Wed May 24 20:01:59 2017 +0530
+++ b/source/encoder/motion.cpp	Mon Jun 05 15:20:44 2017 +0530
@@ -598,6 +598,139 @@
     }
 }
 
+/* Refine the quarter-pel motion vector qmvp against reference 'ref' for the PU
+ * held in fencPUYuv. The clipped start point is compared with the positions
+ * probed by the square refine below, then the half-pel and quarter-pel loops of
+ * workload[5] run. Only the y component is tested against mvmin/mvmax. The
+ * resulting quarter-pel MV is written to outQMv; no cost is returned. */
+void MotionEstimate::refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax,
+                              const MV& qmvp, MV& outQMv)
+{
+    ALIGN_VAR_16(int, costs[16]);
+    if (ctuAddr >= 0)
+        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
+    intptr_t stride = ref->lumaStride;
+    pixel* fenc = fencPUYuv.m_buf[0];
+    pixel* fref = ref->fpelPlane[0] + blockOffset;
+
+    setMVP(qmvp);
+    MV qmvmin = mvmin.toQPel();
+    MV qmvmax = mvmax.toQPel();
+
+    /* The term cost used here means satd/sad values for that particular search.
+     * The costs used in ME integer search only includes the SAD cost of motion
+     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
+     * cost of residual and sqrtLambda * MVD bits.
+    */
+
+    // measure SAD cost at clipped QPEL MVP
+    MV pmv = qmvp.clipped(qmvmin, qmvmax);
+    MV bestpre = pmv;
+    int bprecost = subpelCompare(ref, pmv, sad);
+
+    /* re-measure full pel rounded MVP with SAD as search start point */
+    MV bmv = pmv.roundToFPel();
+    int bcost = bprecost;
+    if (pmv.isSubpel())
+        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
+
+    /* square refine: costs[] comes from COST_MV_X4_DIR (defined outside this hunk);
+     * offsets that change y are only accepted inside [mvmin.y, mvmax.y] */
+    int dir = 0;
+    COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
+    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+        COPY2_IF_LT(bcost, costs[0], dir, 1);
+    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+        COPY2_IF_LT(bcost, costs[1], dir, 2);
+    COPY2_IF_LT(bcost, costs[2], dir, 3);
+    COPY2_IF_LT(bcost, costs[3], dir, 4);
+    COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
+    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+        COPY2_IF_LT(bcost, costs[0], dir, 5);
+    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+        COPY2_IF_LT(bcost, costs[1], dir, 6);
+    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+        COPY2_IF_LT(bcost, costs[2], dir, 7);
+    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+        COPY2_IF_LT(bcost, costs[3], dir, 8);
+    bmv += square1[dir];
+
+    // keep the start point unless a full-pel position costs no more than it
+    if (bprecost < bcost)
+    {
+        bmv = bestpre;
+        bcost = bprecost;
+    }
+    else
+        bmv = bmv.toQPel(); // promote search bmv to qpel
+
+    // TO DO: Change SubpelWorkload to fine tune MV
+    // Now it is set to 5 for experiment.
+    // const SubpelWorkload& wl = workload[this->subpelRefine];
+    const SubpelWorkload& wl = workload[5];
+    pixelcmp_t hpelcomp;
+
+    if (wl.hpel_satd)
+    {
+        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
+        hpelcomp = satd;
+    }
+    else
+        hpelcomp = sad;
+
+    for (int iter = 0; iter < wl.hpel_iters; iter++)
+    {
+        int bdir = 0;
+        for (int i = 1; i <= wl.hpel_dirs; i++)
+        {
+            MV qmv = bmv + square1[i] * 2;
+
+            // check mv range for slice bound
+            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
+                continue;
+
+            int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
+            COPY2_IF_LT(bcost, cost, bdir, i);
+        }
+
+        if (bdir)
+            bmv += square1[bdir] * 2;
+        else
+            break;
+    }
+
+    /* if HPEL search used SAD, remeasure with SATD before QPEL */
+    if (!wl.hpel_satd)
+        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
+
+    for (int iter = 0; iter < wl.qpel_iters; iter++)
+    {
+        int bdir = 0;
+        for (int i = 1; i <= wl.qpel_dirs; i++)
+        {
+            MV qmv = bmv + square1[i];
+
+            // check mv range for slice bound
+            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
+                continue;
+
+            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
+            COPY2_IF_LT(bcost, cost, bdir, i);
+        }
+
+        if (bdir)
+            bmv += square1[bdir];
+        else
+            break;
+    }
+
+    // check the refined mv (not the start point) against the slice bound
+    X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");
+
+    x265_emms();
+    outQMv = bmv;
+}
+
 int MotionEstimate::motionEstimate(ReferencePlanes *ref,
                                    const MV &       mvmin,
                                    const MV &       mvmax,
diff -r de49a722b256 -r c04d02d71f20 source/encoder/motion.h
--- a/source/encoder/motion.h	Wed May 24 20:01:59 2017 +0530
+++ b/source/encoder/motion.h	Mon Jun 05 15:20:44 2017 +0530
@@ -92,6 +92,7 @@
                chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize);
     }
 
+    void refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, const MV& qmvp, MV& outQMv);
     int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, pixel *srcReferencePlane = 0);
 
     int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
diff -r de49a722b256 -r c04d02d71f20 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed May 24 20:01:59 2017 +0530
+++ b/source/encoder/search.cpp	Mon Jun 05 15:20:44 2017 +0530
@@ -2108,6 +2108,17 @@
     }
 }
 
+/* Re-search the MV already stored in interMode.cu for this PU and list: the clipped
+ * MV is the start point passed to m_me.refineMV(); the result is returned in outmv. */
+void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv)
+{
+    MV startMv = interMode.cu.m_mv[list][pu.puAbsPartIdx];
+    interMode.cu.clipMv(startMv);
+    MV mvmin, mvmax;
+    setSearchRange(interMode.cu, startMv, m_param->searchRange, mvmin, mvmax);
+    m_me.refineMV(&m_slice->m_mref[list][ref], mvmin, mvmax, startMv, outmv);
+}
+
 /* find the best inter prediction for each PU of specified mode */
 void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
 {
diff -r de49a722b256 -r c04d02d71f20 source/encoder/search.h
--- a/source/encoder/search.h	Wed May 24 20:01:59 2017 +0530
+++ b/source/encoder/search.h	Mon Jun 05 15:20:44 2017 +0530
@@ -311,6 +311,7 @@
     // estimation inter prediction (non-skip)
     void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
 
+    void     searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv);
     // encode residual and compute rd-cost for inter mode
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
     void     encodeResAndCalcRdSkipCU(Mode& interMode);
diff -r de49a722b256 -r c04d02d71f20 source/x265.h
--- a/source/x265.h	Wed May 24 20:01:59 2017 +0530
+++ b/source/x265.h	Mon Jun 05 15:20:44 2017 +0530
@@ -1449,6 +1449,9 @@
     /* Enable inter refinement in load mode*/
     int       interRefine;
 
+    /* Enable motion vector refinement in load mode*/
+    int       mvRefine;
+
 } x265_param;
 
 /* x265_param_alloc:
diff -r de49a722b256 -r c04d02d71f20 source/x265cli.h
--- a/source/x265cli.h	Wed May 24 20:01:59 2017 +0530
+++ b/source/x265cli.h	Mon Jun 05 15:20:44 2017 +0530
@@ -277,6 +277,8 @@
     { "dhdr10-info",    required_argument, NULL, 0 },
     { "dhdr10-opt",           no_argument, NULL, 0},
     { "no-dhdr10-opt",        no_argument, NULL, 0},
+    { "refine-mv",            no_argument, NULL, 0 },
+    { "no-refine-mv",         no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -448,6 +450,7 @@
     H0("   --scale-factor <int>          Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
     H0("   --[no-]refine-intra           Enable intra refinement for load mode. Default %s\n", OPT(param->intraRefine));
     H0("   --[no-]refine-inter           Enable inter refinement for load mode. Default %s\n", OPT(param->interRefine));
+    H0("   --[no-]refine-mv              Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine));
     H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
     H0("   --[no-]aq-motion              Adaptive Quantization based on the relative motion of each CU w.r.t., frame. Default %s\n", OPT(param->bOptCUDeltaQP));