[x265] [PATCH 2 of 2] motion: Perform ME on each HME level
pooja at multicorewareinc.com
pooja at multicorewareinc.com
Wed Jul 10 06:17:03 CEST 2019
# HG changeset patch
# User Pooja Venkatesan <pooja at multicorewareinc.com>
# Date 1562562567 -19800
# Mon Jul 08 10:39:27 2019 +0530
# Node ID 2dcff9aea06f0f1c396fd2a62104e4fd5029bf40
# Parent 14a235657a2011aa28d45544f33b7186c33b9218
motion: Perform ME on each HME level
This patch does the following:
1) Performs level-0 ME on the lower-resolution (1/16th) planes
2) Uses the resulting MVs, scaled by 2, as an additional predictor for the next-level ME
3) Restricts full search to within +/-merange when HME is enabled
diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.cpp
--- a/source/common/lowres.cpp Fri Jul 05 11:17:26 2019 +0530
+++ b/source/common/lowres.cpp Mon Jul 08 10:39:27 2019 +0530
@@ -65,6 +65,7 @@
maxBlocksInColFullRes = maxBlocksInCol * 2;
int cuCount = maxBlocksInRow * maxBlocksInCol;
int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
+ isHMELowres = param->bEnableHME ? 1 : 0;
/* rounding the width to multiple of lowres CU size */
width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -176,6 +177,16 @@
CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount);
CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount);
CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount);
+ if (bEnableHME)
+ {
+ int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int cuCountLowerRes = maxBlocksInRowLowerRes * maxBlocksInColLowerRes;
+ CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes);
+ CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes);
+ CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t, cuCountLowerRes);
+ CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t, cuCountLowerRes);
+ }
}
return true;
@@ -207,6 +218,13 @@
X265_FREE(lowresMvs[1][i]);
X265_FREE(lowresMvCosts[0][i]);
X265_FREE(lowresMvCosts[1][i]);
+ if (bEnableHME)
+ {
+ X265_FREE(lowerResMvs[0][i]);
+ X265_FREE(lowerResMvs[1][i]);
+ X265_FREE(lowerResMvCosts[0][i]);
+ X265_FREE(lowerResMvCosts[1][i]);
+ }
}
X265_FREE(qpAqOffset);
X265_FREE(invQscaleFactor);
diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.h
--- a/source/common/lowres.h Fri Jul 05 11:17:26 2019 +0530
+++ b/source/common/lowres.h Mon Jul 08 10:39:27 2019 +0530
@@ -46,6 +46,7 @@
bool isWeighted;
bool isLowres;
+ bool isHMELowres;
intptr_t lumaStride;
intptr_t chromaStride;
@@ -63,46 +64,58 @@
/* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels
* in case QPEL is required. Else it returns a pointer to the HPEL pixels */
- inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride)
+ inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride, bool hme)
{
+ intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
+ pixel *plane[4];
+ for (int i = 0; i < 4; i++)
+ {
+ plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
+ }
if ((qmv.x | qmv.y) & 1)
{
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
- pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+ pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
int qmvx = qmv.x + (qmv.x & 1);
int qmvy = qmv.y + (qmv.y & 1);
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
- primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
+ pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;
+ primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32);
return buf;
}
else
{
- outstride = lumaStride;
+ outstride = YStride;
int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
- return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+ return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
}
}
- inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp)
+ inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp, bool hme)
{
+ intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
+ pixel *plane[4];
+ for (int i = 0; i < 4; i++)
+ {
+ plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
+ }
if ((qmv.x | qmv.y) & 1)
{
ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
- pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+ pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
int qmvx = qmv.x + (qmv.x & 1);
int qmvy = qmv.y + (qmv.y & 1);
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
- primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
+ pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, YStride, frefB, YStride, 32);
return comp(fenc, FENC_STRIDE, subpelbuf, 8);
}
else
{
int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
- pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
- return comp(fenc, FENC_STRIDE, fref, lumaStride);
+ pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
+ return comp(fenc, FENC_STRIDE, fref, YStride);
}
}
};
@@ -188,6 +201,8 @@
/* Hierarchical Motion Estimation */
bool bEnableHME;
+ int32_t* lowerResMvCosts[2][X265_BFRAME_MAX + 2];
+ MV* lowerResMvs[2][X265_BFRAME_MAX + 2];
/* used for vbvLookahead */
int plannedType[X265_LOOKAHEAD_MAX + 1];
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/encoder.cpp Mon Jul 08 10:39:27 2019 +0530
@@ -3387,6 +3387,10 @@
x265_log(p, X265_LOG_WARNING, "Source height < 540p is too low for HME. Disabling HME.\n");
p->bEnableHME = 0;
}
+ if (m_param->bEnableHME && m_param->searchMethod != m_param->hmeSearchMethod[2])
+ {
+ m_param->searchMethod = m_param->hmeSearchMethod[2];
+ }
}
}
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/motion.cpp Mon Jul 08 10:39:27 2019 +0530
@@ -104,6 +104,8 @@
ctuAddr = -1;
absPartIdx = -1;
searchMethod = X265_HEX_SEARCH;
+ searchMethodL0 = X265_HEX_SEARCH;
+ searchMethodL1 = X265_HEX_SEARCH;
subpelRefine = 2;
blockwidth = blockheight = 0;
blockOffset = 0;
@@ -162,7 +164,7 @@
}
/* Called by lookahead, luma only, no use of PicYuv */
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
{
partEnum = partitionFromSizes(pwidth, pheight);
X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -179,6 +181,8 @@
/* Search params */
searchMethod = method;
+ searchMethodL0 = searchL0;
+ searchMethodL1 = searchL1;
subpelRefine = refine;
/* copy PU block into cache */
@@ -743,9 +747,10 @@
pixel * srcReferencePlane)
{
ALIGN_VAR_16(int, costs[16]);
+ bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
if (ctuAddr >= 0)
blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
- intptr_t stride = ref->lumaStride;
+ intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
pixel* fenc = fencPUYuv.m_buf[0];
pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
@@ -767,7 +772,7 @@
int bprecost;
if (ref->isLowres)
- bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
+ bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
else
bprecost = subpelCompare(ref, pmv, sad);
@@ -808,7 +813,8 @@
pmv = pmv.roundToFPel();
MV omv = bmv; // current search origin or starting point
- switch (searchMethod)
+ int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
+ switch (search)
{
case X265_DIA_SEARCH:
{
@@ -1391,11 +1397,20 @@
{
// dead slow exhaustive search, but at least it uses sad_x4()
MV tmv;
- for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
+ int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
+ if (ref->isHMELowres)
{
- for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
+ merange = (merange < 0 ? -merange : merange);
+ mvmin_y = X265_MAX(mvmin.y, -merange);
+ mvmin_x = X265_MAX(mvmin.x, -merange);
+ mvmax_y = X265_MIN(mvmax.y, merange);
+ mvmax_x = X265_MIN(mvmax.x, merange);
+ }
+ for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
+ {
+ for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
{
- if (tmv.x + 3 <= mvmax.x)
+ if (tmv.x + 3 <= mvmax_x)
{
pixel *pix_base = fref + tmv.y * stride + tmv.x;
sad_x4(fenc,
@@ -1463,12 +1478,12 @@
if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
continue;
- int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
bmv += square1[bdir] * 2;
- bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
+ bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
bdir = 0;
for (int i = 1; i <= wl.qpel_dirs; i++)
@@ -1479,7 +1494,7 @@
if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
continue;
- int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.h
--- a/source/encoder/motion.h Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/motion.h Mon Jul 08 10:39:27 2019 +0530
@@ -44,6 +44,8 @@
int absPartIdx; // part index of PU, including CU offset within CTU
int searchMethod;
+ int searchMethodL0;
+ int searchMethodL1;
int subpelRefine;
int blockwidth;
@@ -76,7 +78,7 @@
/* Methods called at slice setup */
- void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
+ void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
/* buf*() and motionEstimate() methods all use cached fenc pixels and thus
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.cpp
--- a/source/encoder/search.cpp Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/search.cpp Mon Jul 08 10:39:27 2019 +0530
@@ -2096,13 +2096,16 @@
const MV* amvp = interMode.amvpCand[list][ref];
int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
- MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
+ bool bLowresMVP = false;
+ MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
{
MV lmv = getLowresMV(interMode.cu, pu, list, ref);
if (lmv.notZero())
mvc[numMvc++] = lmv;
+ if (m_param->bEnableHME)
+ mvp_lowres = lmv;
}
setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
@@ -2110,11 +2113,28 @@
int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+ if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
+ {
+ MV outmv_lowres;
+ setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+ int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+ m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+ if (lowresMvCost < satdCost)
+ {
+ outmv = outmv_lowres;
+ satdCost = lowresMvCost;
+ bLowresMVP = true;
+ }
+ }
/* Get total cost of partition, but only include MV bit cost once */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+ /* ME with the lowres MVP won: re-cost bits and cost against the selected AMVP candidate */
+ if (bLowresMVP)
+ updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
+
/* Refine MVP selection, updates: mvpIdx, bits, cost */
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
@@ -2346,13 +2366,16 @@
const MV* amvp = interMode.amvpCand[list][ref];
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
- MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
+ MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
+ bool bLowresMVP = false;
if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
{
MV lmv = getLowresMV(cu, pu, list, ref);
if (lmv.notZero())
mvc[numMvc++] = lmv;
+ if (m_param->bEnableHME)
+ mvp_lowres = lmv;
}
if (m_param->searchMethod == X265_SEA)
{
@@ -2365,10 +2388,27 @@
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+ if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
+ {
+ MV outmv_lowres;
+ setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+ int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+ m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+ if (lowresMvCost < satdCost)
+ {
+ outmv = outmv_lowres;
+ satdCost = lowresMvCost;
+ bLowresMVP = true;
+ }
+ }
+
/* Get total cost of partition, but only include MV bit cost once */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+ /* ME with the lowres MVP won: re-cost bits and cost against the selected AMVP candidate */
+ if (bLowresMVP)
+ updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
/* Refine MVP selection, updates: mvpIdx, bits, cost */
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
@@ -2631,6 +2671,15 @@
return amvpCand[mvpIdx];
}
+/* Re-cost outBits/outCost for mv against the default AMVP candidate when ME was run with an alternative MVP */
+void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
+{
+ int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
+ uint32_t origOutBits = outBits;
+ outBits = origOutBits + diffBits;
+ outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
+}
+
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
{
MV dist((int32_t)merange << 2, (int32_t)merange << 2);
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.h
--- a/source/encoder/search.h Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/search.h Mon Jul 08 10:39:27 2019 +0530
@@ -425,6 +425,7 @@
void setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
+ void updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP);
/* intra helper functions */
enum { MAX_RD_INTRA_MODES = 16 };
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/slicetype.cpp Mon Jul 08 10:39:27 2019 +0530
@@ -664,6 +664,7 @@
weightedRef.lumaStride = fenc.lumaStride;
weightedRef.isLowres = true;
weightedRef.isWeighted = false;
+ weightedRef.isHMELowres = ref.bEnableHME;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
float guessScale, fencMean, refMean;
@@ -759,6 +760,8 @@
m_extendGopBoundary = false;
m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_cuCount = m_8x8Width * m_8x8Height;
m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
m_isFadeIn = false;
@@ -2782,16 +2785,32 @@
X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");
- int firstY = m_lookahead.m_numRowsPerSlice * i;
- int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
-
- bool lastRow = true;
+ int firstY, lastY;
+ bool lastRow;
+ if (m_lookahead.m_param->bEnableHME)
+ {
+ int numRowsPerSlice = m_lookahead.m_4x4Height / m_lookahead.m_param->lookaheadSlices;
+ numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5), m_lookahead.m_4x4Height);
+ firstY = numRowsPerSlice * i;
+ lastY = (i == m_jobTotal - 1) ? m_lookahead.m_4x4Height - 1 : numRowsPerSlice * (i + 1) - 1;
+ lastRow = true;
+ for (int cuY = lastY; cuY >= firstY; cuY--)
+ {
+ for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
+ estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1);
+ lastRow = false;
+ }
+ }
+
+ firstY = m_lookahead.m_numRowsPerSlice * i;
+ lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
+ lastRow = true;
for (int cuY = lastY; cuY >= firstY; cuY--)
{
m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
- estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);
+ estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 0);
lastRow = false;
}
@@ -2864,13 +2883,25 @@
}
else
{
- bool lastRow = true;
+ /* Calculate MVs on the 1/16th-resolution (lowerRes) planes first */
+ bool lastRow;
+ if (param->bEnableHME)
+ {
+ lastRow = true;
+ for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)
+ {
+ for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
+ estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);
+ lastRow = false;
+ }
+ }
+ lastRow = true;
for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
{
fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
- estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);
+ estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);
lastRow = false;
}
@@ -2891,23 +2922,27 @@
return score;
}
-void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
+void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
{
Lowres *fref0 = m_frames[p0];
Lowres *fref1 = m_frames[p1];
Lowres *fenc = m_frames[b];
- ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ? &fenc->weightedRef[b - p0] : fref0;
-
- const int widthInCU = m_lookahead.m_8x8Width;
- const int heightInCU = m_lookahead.m_8x8Height;
+ ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;
+
+ const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
+ const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
const int bBidir = (b < p1);
const int cuXY = cuX + cuY * widthInCU;
+ const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
const int cuSize = X265_LOWRES_CU_SIZE;
- const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
-
- if (bBidir || bDoSearch[0] || bDoSearch[1])
- tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
+ const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);
+
+ if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
+ tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
+ else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
+ tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
+
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
int lowresPenalty = 4;
@@ -2926,7 +2961,7 @@
for (int i = 0; i < 1 + bBidir; i++)
{
- int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
+ int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
int skipCost = INT_MAX;
if (!bDoSearch[i])
@@ -2936,8 +2971,8 @@
}
int numc = 0;
- MV mvc[4], mvp;
- MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];
+ MV mvc[5], mvp;
+ MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
ReferencePlanes* fref = i ? fref1 : wfref0;
/* Reverse-order MV prediction */
@@ -2952,6 +2987,10 @@
if (cuX < widthInCU - 1)
MVC(fencMV[widthInCU + 1]);
}
+ if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
+ {
+ MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
+ }
#undef MVC
if (!numc)
@@ -2967,7 +3006,7 @@
for (int idx = 0; idx < numc; idx++)
{
intptr_t stride = X265_LOWRES_CU_SIZE;
- pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride);
+ pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
int cost = tld.me.bufSATD(src, stride);
COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
/* Except for mv0 case, everyting else is likely to have enough residual to not trigger the skip. */
@@ -2978,7 +3017,10 @@
/* ME will never return a cost larger than the cost @MVP, so we do not
* have to check that ME cost is more than the estimated merge cost */
- fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
+ if(!hme)
+ fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
+ else
+ fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
if (skipCost < 64 && skipCost < fencCost && bBidir)
{
fencCost = skipCost;
@@ -2986,6 +3028,8 @@
}
COPY2_IF_LT(bcost, fencCost, listused, i + 1);
}
+ if (hme)
+ return;
if (bBidir) /* B, also consider bidir */
{
@@ -2995,8 +3039,8 @@
ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
- pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
- pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
+ pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
+ pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/slicetype.h Mon Jul 08 10:39:27 2019 +0530
@@ -124,6 +124,10 @@
int m_inputCount;
double m_cuTreeStrength;
+ /* HME */
+ int m_4x4Width;
+ int m_4x4Height;
+
bool m_isActive;
bool m_sliceTypeBusy;
bool m_bAdaptiveQuant;
@@ -246,7 +250,7 @@
void processTasks(int workerThreadID);
int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);
- void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
+ void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme);
CostEstimateGroup& operator=(const CostEstimateGroup&);
};
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp Fri Jul 05 11:17:26 2019 +0530
+++ b/source/encoder/weightPrediction.cpp Mon Jul 08 10:39:27 2019 +0530
@@ -82,7 +82,7 @@
/* clip MV to available pixels */
MV mv = mvs[cu];
mv = mv.clipped(mvmin, mvmax);
- pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
+ pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
}
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-2.patch
Type: text/x-patch
Size: 30425 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190710/c03eedfe/attachment-0001.bin>
More information about the x265-devel
mailing list