[x265] [PATCH 2 of 2] motion: Perform ME on each HME level

Wed Jul 10 06:17:03 CEST 2019

# HG changeset patch
# User Pooja Venkatesan <pooja at multicorewareinc.com>
# Date 1562562567 -19800
#      Mon Jul 08 10:39:27 2019 +0530
# Node ID 2dcff9aea06f0f1c396fd2a62104e4fd5029bf40
# Parent  14a235657a2011aa28d45544f33b7186c33b9218
motion: Perform ME on each HME level

This patch does the following:
1) Performs level-0 ME on the 1/16th-resolution (lower-res) planes
2) Uses the resulting MVs as predictors for the next-level ME
3) Restricts full search to the ME range when HME is enabled

diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.cpp

--- a/source/common/lowres.cpp	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/common/lowres.cpp	Mon Jul 08 10:39:27 2019 +0530
@@ -65,6 +65,7 @@
     maxBlocksInColFullRes = maxBlocksInCol * 2;
     int cuCount = maxBlocksInRow * maxBlocksInCol;
     int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
+    isHMELowres = param->bEnableHME ? 1 : 0;
 
     /* rounding the width to multiple of lowres CU size */
     width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -176,6 +177,16 @@
         CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount);
         CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount);
         CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount);
+        if (bEnableHME)
+        {
+            int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+            int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+            int cuCountLowerRes = maxBlocksInRowLowerRes * maxBlocksInColLowerRes;
+            CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes);
+            CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes);
+            CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t, cuCountLowerRes);
+            CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t, cuCountLowerRes);
+        }
     }
 
     return true;
@@ -207,6 +218,13 @@
         X265_FREE(lowresMvs[1][i]);
         X265_FREE(lowresMvCosts[0][i]);
         X265_FREE(lowresMvCosts[1][i]);
+        if (bEnableHME)
+        {
+            X265_FREE(lowerResMvs[0][i]);
+            X265_FREE(lowerResMvs[1][i]);
+            X265_FREE(lowerResMvCosts[0][i]);
+            X265_FREE(lowerResMvCosts[1][i]);
+        }
     }
     X265_FREE(qpAqOffset);
     X265_FREE(invQscaleFactor);
diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.h
--- a/source/common/lowres.h	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/common/lowres.h	Mon Jul 08 10:39:27 2019 +0530
@@ -46,6 +46,7 @@
 
     bool     isWeighted;
     bool     isLowres;
+    bool     isHMELowres;
 
     intptr_t lumaStride;
     intptr_t chromaStride;
@@ -63,46 +64,58 @@
 
     /* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels
      * in case QPEL is required.  Else it returns a pointer to the HPEL pixels */
-    inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride)
+    inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride, bool hme)
     {
+        intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
+        pixel *plane[4];
+        for (int i = 0; i < 4; i++)
+        {
+            plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
+        }
         if ((qmv.x | qmv.y) & 1)
         {
             int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+            pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
             int qmvx = qmv.x + (qmv.x & 1);
             int qmvy = qmv.y + (qmv.y & 1);
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
-            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
-            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
+            pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;
+            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32);
             return buf;
         }
         else
         {
-            outstride = lumaStride;
+            outstride = YStride;
             int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+            return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
         }
     }
 
-    inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp)
+    inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp, bool hme)
     {
+        intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
+        pixel *plane[4];
+        for (int i = 0; i < 4; i++)
+        {
+            plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
+        }
         if ((qmv.x | qmv.y) & 1)
         {
             ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
             int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+            pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
             int qmvx = qmv.x + (qmv.x & 1);
             int qmvy = qmv.y + (qmv.y & 1);
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
-            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
-            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
+            pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;
+            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, YStride, frefB, YStride, 32);
             return comp(fenc, FENC_STRIDE, subpelbuf, 8);
         }
         else
         {
             int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
-            return comp(fenc, FENC_STRIDE, fref, lumaStride);
+            pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
+            return comp(fenc, FENC_STRIDE, fref, YStride);
         }
     }
 };
@@ -188,6 +201,8 @@
 
     /* Hierarchical Motion Estimation */
     bool      bEnableHME;
+    int32_t*  lowerResMvCosts[2][X265_BFRAME_MAX + 2];
+    MV*       lowerResMvs[2][X265_BFRAME_MAX + 2];
 
     /* used for vbvLookahead */
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/encoder.cpp	Mon Jul 08 10:39:27 2019 +0530
@@ -3387,6 +3387,10 @@
             x265_log(p, X265_LOG_WARNING, "Source height < 540p is too low for HME. Disabling HME.\n");
             p->bEnableHME = 0;
         }
+        if (m_param->bEnableHME && m_param->searchMethod != m_param->hmeSearchMethod[2])
+        {
+            m_param->searchMethod = m_param->hmeSearchMethod[2];
+        }
     }
 }
 
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/motion.cpp	Mon Jul 08 10:39:27 2019 +0530
@@ -104,6 +104,8 @@
     ctuAddr = -1;
     absPartIdx = -1;
     searchMethod = X265_HEX_SEARCH;
+    searchMethodL0 = X265_HEX_SEARCH;
+    searchMethodL1 = X265_HEX_SEARCH;
     subpelRefine = 2;
     blockwidth = blockheight = 0;
     blockOffset = 0;
@@ -162,7 +164,7 @@
 }
 
 /* Called by lookahead, luma only, no use of PicYuv */
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
 {
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -179,6 +181,8 @@
 
     /* Search params */
     searchMethod = method;
+    searchMethodL0 = searchL0;
+    searchMethodL1 = searchL1;
     subpelRefine = refine;
 
     /* copy PU block into cache */
@@ -743,9 +747,10 @@
                                    pixel *          srcReferencePlane)
 {
     ALIGN_VAR_16(int, costs[16]);
+    bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
     if (ctuAddr >= 0)
         blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
-    intptr_t stride = ref->lumaStride;
+    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
     pixel* fenc = fencPUYuv.m_buf[0];
     pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
 
@@ -767,7 +772,7 @@
     int bprecost;
 
     if (ref->isLowres)
-        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
+        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
     else
         bprecost = subpelCompare(ref, pmv, sad);
 
@@ -808,7 +813,8 @@
     pmv = pmv.roundToFPel();
     MV omv = bmv;  // current search origin or starting point
 
-    switch (searchMethod)
+    int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
+    switch (search)
     {
     case X265_DIA_SEARCH:
     {
@@ -1391,11 +1397,20 @@
     {
         // dead slow exhaustive search, but at least it uses sad_x4()
         MV tmv;
-        for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
+        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
+        if (ref->isHMELowres)
         {
-            for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
+            merange = (merange < 0 ? -merange : merange);
+            mvmin_y = X265_MAX(mvmin.y, -merange);
+            mvmin_x = X265_MAX(mvmin.x, -merange);
+            mvmax_y = X265_MIN(mvmax.y, merange);
+            mvmax_x = X265_MIN(mvmax.x, merange);
+        }
+        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
+        {
+            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
             {
-                if (tmv.x + 3 <= mvmax.x)
+                if (tmv.x + 3 <= mvmax_x)
                 {
                     pixel *pix_base = fref + tmv.y * stride + tmv.x;
                     sad_x4(fenc,
@@ -1463,12 +1478,12 @@
             if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                 continue;
 
-            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
             COPY2_IF_LT(bcost, cost, bdir, i);
         }
 
         bmv += square1[bdir] * 2;
-        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
+        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
 
         bdir = 0;
         for (int i = 1; i <= wl.qpel_dirs; i++)
@@ -1479,7 +1494,7 @@
             if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                 continue;
 
-            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
             COPY2_IF_LT(bcost, cost, bdir, i);
         }
 
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.h
--- a/source/encoder/motion.h	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/motion.h	Mon Jul 08 10:39:27 2019 +0530
@@ -44,6 +44,8 @@
     int absPartIdx;  // part index of PU, including CU offset within CTU
 
     int searchMethod;
+    int searchMethodL0;
+    int searchMethodL1;
     int subpelRefine;
 
     int blockwidth;
@@ -76,7 +78,7 @@
 
     /* Methods called at slice setup */
 
-    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
 
     /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.cpp
--- a/source/encoder/search.cpp	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/search.cpp	Mon Jul 08 10:39:27 2019 +0530
@@ -2096,13 +2096,16 @@
 
     const MV* amvp = interMode.amvpCand[list][ref];
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
-    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
+    bool bLowresMVP = false;
+    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
 
     if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
     {
         MV lmv = getLowresMV(interMode.cu, pu, list, ref);
         if (lmv.notZero())
             mvc[numMvc++] = lmv;
+        if (m_param->bEnableHME)
+            mvp_lowres = lmv;
     }
 
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
@@ -2110,11 +2113,28 @@
     int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
+    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
+    {
+        MV outmv_lowres;
+        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+        if (lowresMvCost < satdCost)
+        {
+            outmv = outmv_lowres;
+            satdCost = lowresMvCost;
+            bLowresMVP = true;
+        }
+    }
     /* Get total cost of partition, but only include MV bit cost once */
     bits += m_me.bitcost(outmv);
     uint32_t mvCost = m_me.mvcost(outmv);
     uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
 
+    /* ME used the lowres MVP: rebase bits and cost onto the selected AMVP candidate */
+    if (bLowresMVP)
+        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
+
     /* Refine MVP selection, updates: mvpIdx, bits, cost */
     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
 
@@ -2346,13 +2366,16 @@
 
                     const MV* amvp = interMode.amvpCand[list][ref];
                     int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
-                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
+                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
+                    bool bLowresMVP = false;
 
                     if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
                     {
                         MV lmv = getLowresMV(cu, pu, list, ref);
                         if (lmv.notZero())
                             mvc[numMvc++] = lmv;
+                        if (m_param->bEnableHME)
+                            mvp_lowres = lmv;
                     }
                     if (m_param->searchMethod == X265_SEA)
                     {
@@ -2365,10 +2388,27 @@
                     int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
+                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
+                    {
+                        MV outmv_lowres;
+                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+                        if (lowresMvCost < satdCost)
+                        {
+                            outmv = outmv_lowres;
+                            satdCost = lowresMvCost;
+                            bLowresMVP = true;
+                        }
+                    }
+
                     /* Get total cost of partition, but only include MV bit cost once */
                     bits += m_me.bitcost(outmv);
                     uint32_t mvCost = m_me.mvcost(outmv);
                     uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+                    /* ME used the lowres MVP: rebase bits and cost onto the selected AMVP candidate */
+                    if (bLowresMVP)
+                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
 
                     /* Refine MVP selection, updates: mvpIdx, bits, cost */
                     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
@@ -2631,6 +2671,15 @@
     return amvpCand[mvpIdx];
 }
 
+/* Re-express outBits/outCost relative to the default MVP (amvp) when ME was run with an alternative MVP */
+void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
+{
+    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
+    uint32_t origOutBits = outBits;
+    outBits = origOutBits + diffBits;
+    outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
+}
+
 void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
 {
     MV dist((int32_t)merange << 2, (int32_t)merange << 2);
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.h
--- a/source/encoder/search.h	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/search.h	Mon Jul 08 10:39:27 2019 +0530
@@ -425,6 +425,7 @@
     void     setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
     uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
     static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
+    void      updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP);
 
     /* intra helper functions */
     enum { MAX_RD_INTRA_MODES = 16 };
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/slicetype.cpp	Mon Jul 08 10:39:27 2019 +0530
@@ -664,6 +664,7 @@
     weightedRef.lumaStride = fenc.lumaStride;
     weightedRef.isLowres = true;
     weightedRef.isWeighted = false;
+    weightedRef.isHMELowres = ref.bEnableHME;
 
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     float guessScale, fencMean, refMean;
@@ -759,6 +760,8 @@
     m_extendGopBoundary = false;
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_cuCount = m_8x8Width * m_8x8Height;
     m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
     m_isFadeIn = false;
@@ -2782,16 +2785,32 @@
 
             X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");
 
-            int firstY = m_lookahead.m_numRowsPerSlice * i;
-            int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
-
-            bool lastRow = true;
+            int firstY, lastY;
+            bool lastRow;
+            if (m_lookahead.m_param->bEnableHME)
+            {
+                int numRowsPerSlice = m_lookahead.m_4x4Height / m_lookahead.m_param->lookaheadSlices;
+                numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5), m_lookahead.m_4x4Height);
+                firstY = numRowsPerSlice * i;
+                lastY = (i == m_jobTotal - 1) ? m_lookahead.m_4x4Height - 1 : numRowsPerSlice * (i + 1) - 1;
+                lastRow = true;
+                for (int cuY = lastY; cuY >= firstY; cuY--)
+                {
+                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
+                        estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1);
+                    lastRow = false;
+                }
+            }
+
+            firstY = m_lookahead.m_numRowsPerSlice * i;
+            lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
+            lastRow = true;
             for (int cuY = lastY; cuY >= firstY; cuY--)
             {
                 m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
 
                 for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
-                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);
+                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 0);
 
                 lastRow = false;
             }
@@ -2864,13 +2883,25 @@
         }
         else
         {
-            bool lastRow = true;
+            /* Calculate MVs on the 1/16th-resolution planes first */
+            bool lastRow;
+            if (param->bEnableHME)
+            {
+                lastRow = true;
+                for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)
+                {
+                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
+                        estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);
+                    lastRow = false;
+                }
+            }
+            lastRow = true;
             for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
             {
                 fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
 
                 for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
-                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);
+                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);
 
                 lastRow = false;
             }
@@ -2891,23 +2922,27 @@
     return score;
 }
 
-void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
+void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
 {
     Lowres *fref0 = m_frames[p0];
     Lowres *fref1 = m_frames[p1];
     Lowres *fenc  = m_frames[b];
 
-    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ? &fenc->weightedRef[b - p0] : fref0;
-
-    const int widthInCU = m_lookahead.m_8x8Width;
-    const int heightInCU = m_lookahead.m_8x8Height;
+    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;
+
+    const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
+    const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
     const int bBidir = (b < p1);
     const int cuXY = cuX + cuY * widthInCU;
+    const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
     const int cuSize = X265_LOWRES_CU_SIZE;
-    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
-
-    if (bBidir || bDoSearch[0] || bDoSearch[1])
-        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
+    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);
+
+    if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
+        tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
+    else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
+
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;
@@ -2926,7 +2961,7 @@
 
     for (int i = 0; i < 1 + bBidir; i++)
     {
-        int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
+        int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
         int skipCost = INT_MAX;
 
         if (!bDoSearch[i])
@@ -2936,8 +2971,8 @@
         }
 
         int numc = 0;
-        MV mvc[4], mvp;
-        MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];
+        MV mvc[5], mvp;
+        MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
         ReferencePlanes* fref = i ? fref1 : wfref0;
 
         /* Reverse-order MV prediction */
@@ -2952,6 +2987,10 @@
             if (cuX < widthInCU - 1)
                 MVC(fencMV[widthInCU + 1]);
         }
+        if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
+        {
+            MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
+        }
 #undef MVC
 
         if (!numc)
@@ -2967,7 +3006,7 @@
             for (int idx = 0; idx < numc; idx++)
             {
                 intptr_t stride = X265_LOWRES_CU_SIZE;
-                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride);
+                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
                 int cost = tld.me.bufSATD(src, stride);
                 COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
                 /* Except for mv0 case, everyting else is likely to have enough residual to not trigger the skip. */
@@ -2978,7 +3017,10 @@
 
         /* ME will never return a cost larger than the cost @MVP, so we do not
          * have to check that ME cost is more than the estimated merge cost */
-        fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
+        if(!hme)
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
+        else
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
         if (skipCost < 64 && skipCost < fencCost && bBidir)
         {
             fencCost = skipCost;
@@ -2986,6 +3028,8 @@
         }
         COPY2_IF_LT(bcost, fencCost, listused, i + 1);
     }
+    if (hme)
+        return;
 
     if (bBidir) /* B, also consider bidir */
     {
@@ -2995,8 +3039,8 @@
         ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
-        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
-        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
+        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
+        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
         ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
         int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/slicetype.h	Mon Jul 08 10:39:27 2019 +0530
@@ -124,6 +124,10 @@
     int           m_inputCount;
     double        m_cuTreeStrength;
 
+    /* HME */
+    int           m_4x4Width;
+    int           m_4x4Height;
+
     bool          m_isActive;
     bool          m_sliceTypeBusy;
     bool          m_bAdaptiveQuant;
@@ -246,7 +250,7 @@
     void    processTasks(int workerThreadID);
 
     int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);
-    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
+    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme);
 
     CostEstimateGroup& operator=(const CostEstimateGroup&);
 };
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp	Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/weightPrediction.cpp	Mon Jul 08 10:39:27 2019 +0530
@@ -82,7 +82,7 @@
             /* clip MV to available pixels */
             MV mv = mvs[cu];
             mv = mv.clipped(mvmin, mvmax);
-            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
+            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
             primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
         }
     }
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-2.patch
Type: text/x-patch
Size: 30425 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190710/c03eedfe/attachment-0001.bin>