[x265] [PATCH 2 of 2] motion: Perform ME on each HME level
Aruna Matheswaran
aruna at multicorewareinc.com
Thu Jul 11 15:08:28 CEST 2019
Pushed to default.
On Wed, Jul 10, 2019 at 9:47 AM <pooja at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Pooja Venkatesan <pooja at multicorewareinc.com>
> # Date 1562562567 -19800
> # Mon Jul 08 10:39:27 2019 +0530
> # Node ID 2dcff9aea06f0f1c396fd2a62104e4fd5029bf40
> # Parent 14a235657a2011aa28d45544f33b7186c33b9218
> motion: Perform ME on each HME level
>
> This patch does the following:
> 1) Perform level-0 ME
> 2) Use the MVs as predictor for next level ME
> 3) Restrict full-search within a range when HME is enabled
>
> diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.cpp
> --- a/source/common/lowres.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/common/lowres.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -65,6 +65,7 @@
> maxBlocksInColFullRes = maxBlocksInCol * 2;
> int cuCount = maxBlocksInRow * maxBlocksInCol;
> int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
> + isHMELowres = param->bEnableHME ? 1 : 0;
>
> /* rounding the width to multiple of lowres CU size */
> width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
> @@ -176,6 +177,16 @@
> CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount);
> CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount);
> CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount);
> + if (bEnableHME)
> + {
> + int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE
> - 1) >> X265_LOWRES_CU_BITS;
> + int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE
> - 1) >> X265_LOWRES_CU_BITS;
> + int cuCountLowerRes = maxBlocksInRowLowerRes *
> maxBlocksInColLowerRes;
> + CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes);
> + CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes);
> + CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t,
> cuCountLowerRes);
> + CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t,
> cuCountLowerRes);
> + }
> }
>
> return true;
> @@ -207,6 +218,13 @@
> X265_FREE(lowresMvs[1][i]);
> X265_FREE(lowresMvCosts[0][i]);
> X265_FREE(lowresMvCosts[1][i]);
> + if (bEnableHME)
> + {
> + X265_FREE(lowerResMvs[0][i]);
> + X265_FREE(lowerResMvs[1][i]);
> + X265_FREE(lowerResMvCosts[0][i]);
> + X265_FREE(lowerResMvCosts[1][i]);
> + }
> }
> X265_FREE(qpAqOffset);
> X265_FREE(invQscaleFactor);
> diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.h
> --- a/source/common/lowres.h Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/common/lowres.h Mon Jul 08 10:39:27 2019 +0530
> @@ -46,6 +46,7 @@
>
> bool isWeighted;
> bool isLowres;
> + bool isHMELowres;
>
> intptr_t lumaStride;
> intptr_t chromaStride;
> @@ -63,46 +64,58 @@
>
> /* lowres motion compensation, you must provide a buffer and stride
> for QPEL averaged pixels
> * in case QPEL is required. Else it returns a pointer to the HPEL
> pixels */
> - inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel
> *buf, intptr_t& outstride)
> + inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel
> *buf, intptr_t& outstride, bool hme)
> {
> + intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
> + pixel *plane[4];
> + for (int i = 0; i < 4; i++)
> + {
> + plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
> + }
> if ((qmv.x | qmv.y) & 1)
> {
> int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> - pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >>
> 2) + (qmv.y >> 2) * lumaStride;
> + pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * YStride;
> int qmvx = qmv.x + (qmv.x & 1);
> int qmvy = qmv.y + (qmv.y & 1);
> int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
> - pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2)
> + (qmvy >> 2) * lumaStride;
> - primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) &&
> (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB,
> lumaStride, 32);
> + pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) +
> (qmvy >> 2) * YStride;
> + primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) &&
> (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32);
> return buf;
> }
> else
> {
> - outstride = lumaStride;
> + outstride = YStride;
> int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> - return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * lumaStride;
> + return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >>
> 2) * YStride;
> }
> }
>
> - inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const
> MV& qmv, pixelcmp_t comp)
> + inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const
> MV& qmv, pixelcmp_t comp, bool hme)
> {
> + intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
> + pixel *plane[4];
> + for (int i = 0; i < 4; i++)
> + {
> + plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
> + }
> if ((qmv.x | qmv.y) & 1)
> {
> ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
> int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> - pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >>
> 2) + (qmv.y >> 2) * lumaStride;
> + pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * YStride;
> int qmvx = qmv.x + (qmv.x & 1);
> int qmvy = qmv.y + (qmv.y & 1);
> int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
> - pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2)
> + (qmvy >> 2) * lumaStride;
> - primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8,
> frefA, lumaStride, frefB, lumaStride, 32);
> + pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) +
> (qmvy >> 2) * YStride;
> + primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8,
> frefA, YStride, frefB, YStride, 32);
> return comp(fenc, FENC_STRIDE, subpelbuf, 8);
> }
> else
> {
> int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> - pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2)
> + (qmv.y >> 2) * lumaStride;
> - return comp(fenc, FENC_STRIDE, fref, lumaStride);
> + pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * YStride;
> + return comp(fenc, FENC_STRIDE, fref, YStride);
> }
> }
> };
> @@ -188,6 +201,8 @@
>
> /* Hierarchical Motion Estimation */
> bool bEnableHME;
> + int32_t* lowerResMvCosts[2][X265_BFRAME_MAX + 2];
> + MV* lowerResMvs[2][X265_BFRAME_MAX + 2];
>
> /* used for vbvLookahead */
> int plannedType[X265_LOOKAHEAD_MAX + 1];
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/encoder.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -3387,6 +3387,10 @@
> x265_log(p, X265_LOG_WARNING, "Source height < 540p is too
> low for HME. Disabling HME.\n");
> p->bEnableHME = 0;
> }
> + if (m_param->bEnableHME && m_param->searchMethod !=
> m_param->hmeSearchMethod[2])
> + {
> + m_param->searchMethod = m_param->hmeSearchMethod[2];
> + }
> }
> }
>
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.cpp
> --- a/source/encoder/motion.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/motion.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -104,6 +104,8 @@
> ctuAddr = -1;
> absPartIdx = -1;
> searchMethod = X265_HEX_SEARCH;
> + searchMethodL0 = X265_HEX_SEARCH;
> + searchMethodL1 = X265_HEX_SEARCH;
> subpelRefine = 2;
> blockwidth = blockheight = 0;
> blockOffset = 0;
> @@ -162,7 +164,7 @@
> }
>
> /* Called by lookahead, luma only, no use of PicYuv */
> -void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t
> offset, int pwidth, int pheight, const int method, const int refine)
> +void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t
> offset, int pwidth, int pheight, const int method, const int searchL0,
> const int searchL1, const int refine)
> {
> partEnum = partitionFromSizes(pwidth, pheight);
> X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
> @@ -179,6 +181,8 @@
>
> /* Search params */
> searchMethod = method;
> + searchMethodL0 = searchL0;
> + searchMethodL1 = searchL1;
> subpelRefine = refine;
>
> /* copy PU block into cache */
> @@ -743,9 +747,10 @@
> pixel * srcReferencePlane)
> {
> ALIGN_VAR_16(int, costs[16]);
> + bool hme = srcReferencePlane && srcReferencePlane ==
> ref->fpelLowerResPlane[0];
> if (ctuAddr >= 0)
> blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) -
> ref->reconPic->getLumaAddr(0);
> - intptr_t stride = ref->lumaStride;
> + intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
> pixel* fenc = fencPUYuv.m_buf[0];
> pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] +
> blockOffset : srcReferencePlane + blockOffset;
>
> @@ -767,7 +772,7 @@
> int bprecost;
>
> if (ref->isLowres)
> - bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
> + bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
> else
> bprecost = subpelCompare(ref, pmv, sad);
>
> @@ -808,7 +813,8 @@
> pmv = pmv.roundToFPel();
> MV omv = bmv; // current search origin or starting point
>
> - switch (searchMethod)
> + int search = ref->isHMELowres ? (hme ? searchMethodL0 :
> searchMethodL1) : searchMethod;
> + switch (search)
> {
> case X265_DIA_SEARCH:
> {
> @@ -1391,11 +1397,20 @@
> {
> // dead slow exhaustive search, but at least it uses sad_x4()
> MV tmv;
> - for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
> + int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y,
> mvmax_x = mvmax.x;
> + if (ref->isHMELowres)
> {
> - for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
> + merange = (merange < 0 ? -merange : merange);
> + mvmin_y = X265_MAX(mvmin.y, -merange);
> + mvmin_x = X265_MAX(mvmin.x, -merange);
> + mvmax_y = X265_MIN(mvmax.y, merange);
> + mvmax_x = X265_MIN(mvmax.x, merange);
> + }
> + for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
> + {
> + for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
> {
> - if (tmv.x + 3 <= mvmax.x)
> + if (tmv.x + 3 <= mvmax_x)
> {
> pixel *pix_base = fref + tmv.y * stride + tmv.x;
> sad_x4(fenc,
> @@ -1463,12 +1478,12 @@
> if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
> continue;
>
> - int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) +
> mvcost(qmv);
> + int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad,
> hme) + mvcost(qmv);
> COPY2_IF_LT(bcost, cost, bdir, i);
> }
>
> bmv += square1[bdir] * 2;
> - bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) +
> mvcost(bmv);
> + bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) +
> mvcost(bmv);
>
> bdir = 0;
> for (int i = 1; i <= wl.qpel_dirs; i++)
> @@ -1479,7 +1494,7 @@
> if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
> continue;
>
> - int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd)
> + mvcost(qmv);
> + int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd,
> hme) + mvcost(qmv);
> COPY2_IF_LT(bcost, cost, bdir, i);
> }
>
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.h
> --- a/source/encoder/motion.h Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/motion.h Mon Jul 08 10:39:27 2019 +0530
> @@ -44,6 +44,8 @@
> int absPartIdx; // part index of PU, including CU offset within CTU
>
> int searchMethod;
> + int searchMethodL0;
> + int searchMethodL1;
> int subpelRefine;
>
> int blockwidth;
> @@ -76,7 +78,7 @@
>
> /* Methods called at slice setup */
>
> - void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int
> pwidth, int pheight, const int searchMethod, const int subpelRefine);
> + void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int
> pwidth, int pheight, const int searchMethod, const int searchL0, const int
> searchL1, const int subpelRefine);
> void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx,
> int puPartIdx, int pwidth, int pheight, const int searchMethod, const int
> subpelRefine, bool bChroma);
>
> /* buf*() and motionEstimate() methods all use cached fenc pixels and
> thus
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.cpp
> --- a/source/encoder/search.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/search.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -2096,13 +2096,16 @@
>
> const MV* amvp = interMode.amvpCand[list][ref];
> int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
> - MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
> + bool bLowresMVP = false;
> + MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
>
> if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents
> load/save outputs from diverging if lowresMV is not available */
> {
> MV lmv = getLowresMV(interMode.cu, pu, list, ref);
> if (lmv.notZero())
> mvc[numMvc++] = lmv;
> + if (m_param->bEnableHME)
> + mvp_lowres = lmv;
> }
>
> setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
> @@ -2110,11 +2113,28 @@
> int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref],
> mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
> m_param->maxSlices,
> m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
>
> + if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
> + {
> + MV outmv_lowres;
> + setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange,
> mvmin, mvmax);
> + int lowresMvCost =
> m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres,
> numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
> + m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
> + if (lowresMvCost < satdCost)
> + {
> + outmv = outmv_lowres;
> + satdCost = lowresMvCost;
> + bLowresMVP = true;
> + }
> + }
> /* Get total cost of partition, but only include MV bit cost once */
> bits += m_me.bitcost(outmv);
> uint32_t mvCost = m_me.mvcost(outmv);
> uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
>
> + /* Update LowresMVP to best AMVP cand*/
> + if (bLowresMVP)
> + updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
> +
> /* Refine MVP selection, updates: mvpIdx, bits, cost */
> mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
>
> @@ -2346,13 +2366,16 @@
>
> const MV* amvp = interMode.amvpCand[list][ref];
> int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
> - MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
> + MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx],
> mvp_lowres;
> + bool bLowresMVP = false;
>
> if (!m_param->analysisSave && !m_param->analysisLoad)
> /* Prevents load/save outputs from diverging when lowresMV is not available
> */
> {
> MV lmv = getLowresMV(cu, pu, list, ref);
> if (lmv.notZero())
> mvc[numMvc++] = lmv;
> + if (m_param->bEnableHME)
> + mvp_lowres = lmv;
> }
> if (m_param->searchMethod == X265_SEA)
> {
> @@ -2365,10 +2388,27 @@
> int satdCost =
> m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc,
> mvc, m_param->searchRange, outmv, m_param->maxSlices,
> m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
>
> + if (m_param->bEnableHME && mvp_lowres.notZero() &&
> mvp_lowres != mvp)
> + {
> + MV outmv_lowres;
> + setSearchRange(cu, mvp_lowres,
> m_param->searchRange, mvmin, mvmax);
> + int lowresMvCost =
> m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres,
> numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
> + m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
> + if (lowresMvCost < satdCost)
> + {
> + outmv = outmv_lowres;
> + satdCost = lowresMvCost;
> + bLowresMVP = true;
> + }
> + }
> +
> /* Get total cost of partition, but only include MV
> bit cost once */
> bits += m_me.bitcost(outmv);
> uint32_t mvCost = m_me.mvcost(outmv);
> uint32_t cost = (satdCost - mvCost) +
> m_rdCost.getCost(bits);
> + /* Update LowresMVP to best AMVP cand*/
> + if (bLowresMVP)
> + updateMVP(amvp[mvpIdx], outmv, bits, cost,
> mvp_lowres);
>
> /* Refine MVP selection, updates: mvpIdx, bits, cost
> */
> mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
> @@ -2631,6 +2671,15 @@
> return amvpCand[mvpIdx];
> }
>
> +/* Update to default MVP when using an alternative mvp */
> +void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits,
> uint32_t& outCost, const MV& alterMVP)
> +{
> + int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
> + uint32_t origOutBits = outBits;
> + outBits = origOutBits + diffBits;
> + outCost = (outCost - m_rdCost.getCost(origOutBits)) +
> m_rdCost.getCost(outBits);
> +}
> +
> void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange,
> MV& mvmin, MV& mvmax) const
> {
> MV dist((int32_t)merange << 2, (int32_t)merange << 2);
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.h
> --- a/source/encoder/search.h Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/search.h Mon Jul 08 10:39:27 2019 +0530
> @@ -425,6 +425,7 @@
> void setSearchRange(const CUData& cu, const MV& mvp, int merange,
> MV& mvmin, MV& mvmax) const;
> uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const
> PredictionUnit& pu, int puIdx, MergeData& m);
> static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx,
> uint32_t lastMode, uint32_t blockBit[3]);
> + void updateMVP(const MV amvp, const MV& mv, uint32_t& outBits,
> uint32_t& outCost, const MV& alterMVP);
>
> /* intra helper functions */
> enum { MAX_RD_INTRA_MODES = 16 };
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/slicetype.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -664,6 +664,7 @@
> weightedRef.lumaStride = fenc.lumaStride;
> weightedRef.isLowres = true;
> weightedRef.isWeighted = false;
> + weightedRef.isHMELowres = ref.bEnableHME;
>
> /* epsilon is chosen to require at least a numerator of 127 (with
> denominator = 128) */
> float guessScale, fencMean, refMean;
> @@ -759,6 +760,8 @@
> m_extendGopBoundary = false;
> m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> + m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> + m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> m_cuCount = m_8x8Width * m_8x8Height;
> m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2
> * (m_8x8Width + m_8x8Height)) : m_cuCount;
> m_isFadeIn = false;
> @@ -2782,16 +2785,32 @@
>
> X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop
> slices\n");
>
> - int firstY = m_lookahead.m_numRowsPerSlice * i;
> - int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height -
> 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
> -
> - bool lastRow = true;
> + int firstY, lastY;
> + bool lastRow;
> + if (m_lookahead.m_param->bEnableHME)
> + {
> + int numRowsPerSlice = m_lookahead.m_4x4Height /
> m_lookahead.m_param->lookaheadSlices;
> + numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5),
> m_lookahead.m_4x4Height);
> + firstY = numRowsPerSlice * i;
> + lastY = (i == m_jobTotal - 1) ? m_lookahead.m_4x4Height -
> 1 : numRowsPerSlice * (i + 1) - 1;
> + lastRow = true;
> + for (int cuY = lastY; cuY >= firstY; cuY--)
> + {
> + for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0;
> cuX--)
> + estimateCUCost(tld, cuX, cuY, m_coop.p0,
> m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1);
> + lastRow = false;
> + }
> + }
> +
> + firstY = m_lookahead.m_numRowsPerSlice * i;
> + lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 :
> m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
> + lastRow = true;
> for (int cuY = lastY; cuY >= firstY; cuY--)
> {
> m_frames[m_coop.b]->rowSatds[m_coop.b -
> m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
>
> for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0;
> cuX--)
> - estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1,
> m_coop.b, m_coop.bDoSearch, lastRow, i);
> + estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1,
> m_coop.b, m_coop.bDoSearch, lastRow, i, 0);
>
> lastRow = false;
> }
> @@ -2864,13 +2883,25 @@
> }
> else
> {
> - bool lastRow = true;
> + /* Calculate MVs for 1/16th resolution*/
> + bool lastRow;
> + if (param->bEnableHME)
> + {
> + lastRow = true;
> + for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0;
> cuY--)
> + {
> + for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0;
> cuX--)
> + estimateCUCost(tld, cuX, cuY, p0, p1, b,
> bDoSearch, lastRow, -1, 1);
> + lastRow = false;
> + }
> + }
> + lastRow = true;
> for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
> {
> fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
>
> for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0;
> cuX--)
> - estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch,
> lastRow, -1);
> + estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch,
> lastRow, -1, 0);
>
> lastRow = false;
> }
> @@ -2891,23 +2922,27 @@
> return score;
> }
>
> -void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int
> cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
> +void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int
> cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice,
> bool hme)
> {
> Lowres *fref0 = m_frames[p0];
> Lowres *fref1 = m_frames[p1];
> Lowres *fenc = m_frames[b];
>
> - ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ?
> &fenc->weightedRef[b - p0] : fref0;
> -
> - const int widthInCU = m_lookahead.m_8x8Width;
> - const int heightInCU = m_lookahead.m_8x8Height;
> + ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted &&
> !hme ? &fenc->weightedRef[b - p0] : fref0;
> +
> + const int widthInCU = hme ? m_lookahead.m_4x4Width :
> m_lookahead.m_8x8Width;
> + const int heightInCU = hme ? m_lookahead.m_4x4Height :
> m_lookahead.m_8x8Height;
> const int bBidir = (b < p1);
> const int cuXY = cuX + cuY * widthInCU;
> + const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
> const int cuSize = X265_LOWRES_CU_SIZE;
> - const intptr_t pelOffset = cuSize * cuX + cuSize * cuY *
> fenc->lumaStride;
> -
> - if (bBidir || bDoSearch[0] || bDoSearch[1])
> - tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride,
> pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
> + const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ?
> fenc->lumaStride/2 : fenc->lumaStride);
> +
> + if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
> + tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2,
> pelOffset, cuSize, cuSize, X265_HEX_SEARCH,
> m_lookahead.m_param->hmeSearchMethod[0],
> m_lookahead.m_param->hmeSearchMethod[1], 1);
> + else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
> + tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride,
> pelOffset, cuSize, cuSize, X265_HEX_SEARCH,
> m_lookahead.m_param->hmeSearchMethod[0],
> m_lookahead.m_param->hmeSearchMethod[1], 1);
> +
>
> /* A small, arbitrary bias to avoid VBV problems caused by
> zero-residual lookahead blocks. */
> int lowresPenalty = 4;
> @@ -2926,7 +2961,7 @@
>
> for (int i = 0; i < 1 + bBidir; i++)
> {
> - int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
> + int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY]
> : fenc->lowresMvCosts[i][listDist[i]][cuXY];
> int skipCost = INT_MAX;
>
> if (!bDoSearch[i])
> @@ -2936,8 +2971,8 @@
> }
>
> int numc = 0;
> - MV mvc[4], mvp;
> - MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];
> + MV mvc[5], mvp;
> + MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] :
> &fenc->lowresMvs[i][listDist[i]][cuXY];
> ReferencePlanes* fref = i ? fref1 : wfref0;
>
> /* Reverse-order MV prediction */
> @@ -2952,6 +2987,10 @@
> if (cuX < widthInCU - 1)
> MVC(fencMV[widthInCU + 1]);
> }
> + if (fenc->lowerResMvs[0][0] && !hme &&
> fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
> + {
> + MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
> + }
> #undef MVC
>
> if (!numc)
> @@ -2967,7 +3006,7 @@
> for (int idx = 0; idx < numc; idx++)
> {
> intptr_t stride = X265_LOWRES_CU_SIZE;
> - pixel *src = fref->lowresMC(pelOffset, mvc[idx],
> subpelbuf, stride);
> + pixel *src = fref->lowresMC(pelOffset, mvc[idx],
> subpelbuf, stride, hme);
> int cost = tld.me.bufSATD(src, stride);
> COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
> /* Except for mv0 case, everyting else is likely to have
> enough residual to not trigger the skip. */
> @@ -2978,7 +3017,10 @@
>
> /* ME will never return a cost larger than the cost @MVP, so we
> do not
> * have to check that ME cost is more than the estimated merge
> cost */
> - fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0,
> NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
> + if(!hme)
> + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0,
> NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
> + else
> + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0,
> NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices,
> fref->lowerResPlane[0]);
> if (skipCost < 64 && skipCost < fencCost && bBidir)
> {
> fencCost = skipCost;
> @@ -2986,6 +3028,8 @@
> }
> COPY2_IF_LT(bcost, fencCost, listused, i + 1);
> }
> + if (hme)
> + return;
>
> if (bBidir) /* B, also consider bidir */
> {
> @@ -2995,8 +3039,8 @@
> ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE *
> X265_LOWRES_CU_SIZE]);
> ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE *
> X265_LOWRES_CU_SIZE]);
> intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 =
> X265_LOWRES_CU_SIZE;
> - pixel *src0 = fref0->lowresMC(pelOffset,
> fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
> - pixel *src1 = fref1->lowresMC(pelOffset,
> fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
> + pixel *src0 = fref0->lowresMC(pelOffset,
> fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
> + pixel *src1 = fref1->lowresMC(pelOffset,
> fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
> ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE *
> X265_LOWRES_CU_SIZE]);
> primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref,
> X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
> int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.h
> --- a/source/encoder/slicetype.h Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/slicetype.h Mon Jul 08 10:39:27 2019 +0530
> @@ -124,6 +124,10 @@
> int m_inputCount;
> double m_cuTreeStrength;
>
> + /* HME */
> + int m_4x4Width;
> + int m_4x4Height;
> +
> bool m_isActive;
> bool m_sliceTypeBusy;
> bool m_bAdaptiveQuant;
> @@ -246,7 +250,7 @@
> void processTasks(int workerThreadID);
>
> int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b,
> bool intraPenalty);
> - void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0,
> int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
> + void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0,
> int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme);
>
> CostEstimateGroup& operator=(const CostEstimateGroup&);
> };
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp Fri Jul 05 11:17:26 2019
> +0530
> +++ b/source/encoder/weightPrediction.cpp Mon Jul 08 10:39:27 2019
> +0530
> @@ -82,7 +82,7 @@
> /* clip MV to available pixels */
> MV mv = mvs[cu];
> mv = mv.clipped(mvmin, mvmax);
> - pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
> + pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
> primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride,
> tmp, bstride);
> }
> }
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Regards,
Aruna
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190711/1c1f4ebc/attachment-0001.html>
More information about the x265-devel mailing list