<div dir="ltr">Pushed to default. <br></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Wed, Jul 10, 2019 at 9:47 AM &lt;<a href="mailto:pooja@multicorewareinc.com">pooja@multicorewareinc.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"># HG changeset patch<br>
# User Pooja Venkatesan &lt;<a href="mailto:pooja@multicorewareinc.com" target="_blank">pooja@multicorewareinc.com</a>&gt;<br>
# Date 1562562567 -19800<br>
# Mon Jul 08 10:39:27 2019 +0530<br>
# Node ID 2dcff9aea06f0f1c396fd2a62104e4fd5029bf40<br>
# Parent 14a235657a2011aa28d45544f33b7186c33b9218<br>
motion: Perform ME on each HME level<br>
<br>
This patch does the following:<br>
1) Perform level-0 ME<br>
2) Use the MVs as predictor for next level ME<br>
3) Restrict full-search within a range when HME is enabled<br>
<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.cpp<br>
--- a/source/common/lowres.cpp Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/common/lowres.cpp Mon Jul 08 10:39:27 2019 +0530<br>
@@ -65,6 +65,7 @@<br>
maxBlocksInColFullRes = maxBlocksInCol * 2;<br>
int cuCount = maxBlocksInRow * maxBlocksInCol;<br>
int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;<br>
+ isHMELowres = param->bEnableHME ? 1 : 0;<br>
<br>
/* rounding the width to multiple of lowres CU size */<br>
width = maxBlocksInRow * X265_LOWRES_CU_SIZE;<br>
@@ -176,6 +177,16 @@<br>
CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount);<br>
CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount);<br>
CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount);<br>
+ if (bEnableHME)<br>
+ {<br>
+ int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>
+ int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>
+ int cuCountLowerRes = maxBlocksInRowLowerRes * maxBlocksInColLowerRes;<br>
+ CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes);<br>
+ CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes);<br>
+ CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t, cuCountLowerRes);<br>
+ CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t, cuCountLowerRes);<br>
+ }<br>
}<br>
<br>
return true;<br>
@@ -207,6 +218,13 @@<br>
X265_FREE(lowresMvs[1][i]);<br>
X265_FREE(lowresMvCosts[0][i]);<br>
X265_FREE(lowresMvCosts[1][i]);<br>
+ if (bEnableHME)<br>
+ {<br>
+ X265_FREE(lowerResMvs[0][i]);<br>
+ X265_FREE(lowerResMvs[1][i]);<br>
+ X265_FREE(lowerResMvCosts[0][i]);<br>
+ X265_FREE(lowerResMvCosts[1][i]);<br>
+ }<br>
}<br>
X265_FREE(qpAqOffset);<br>
X265_FREE(invQscaleFactor);<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.h<br>
--- a/source/common/lowres.h Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/common/lowres.h Mon Jul 08 10:39:27 2019 +0530<br>
@@ -46,6 +46,7 @@<br>
<br>
bool isWeighted;<br>
bool isLowres;<br>
+ bool isHMELowres;<br>
<br>
intptr_t lumaStride;<br>
intptr_t chromaStride;<br>
@@ -63,46 +64,58 @@<br>
<br>
/* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels<br>
* in case QPEL is required. Else it returns a pointer to the HPEL pixels */<br>
- inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride)<br>
+ inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride, bool hme)<br>
{<br>
+ intptr_t YStride = hme ? lumaStride / 2 : lumaStride;<br>
+ pixel *plane[4];<br>
+ for (int i = 0; i < 4; i++)<br>
+ {<br>
+ plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];<br>
+ }<br>
if ((qmv.x | qmv.y) & 1)<br>
{<br>
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);<br>
- pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;<br>
+ pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;<br>
int qmvx = qmv.x + (qmv.x & 1);<br>
int qmvy = qmv.y + (qmv.y & 1);<br>
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);<br>
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;<br>
- primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);<br>
+ pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;<br>
+ primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32);<br>
return buf;<br>
}<br>
else<br>
{<br>
- outstride = lumaStride;<br>
+ outstride = YStride;<br>
int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);<br>
- return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;<br>
+ return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;<br>
}<br>
}<br>
<br>
- inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp)<br>
+ inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp, bool hme)<br>
{<br>
+ intptr_t YStride = hme ? lumaStride / 2 : lumaStride;<br>
+ pixel *plane[4];<br>
+ for (int i = 0; i < 4; i++)<br>
+ {<br>
+ plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];<br>
+ }<br>
if ((qmv.x | qmv.y) & 1)<br>
{<br>
ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);<br>
int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);<br>
- pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;<br>
+ pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;<br>
int qmvx = qmv.x + (qmv.x & 1);<br>
int qmvy = qmv.y + (qmv.y & 1);<br>
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);<br>
- pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;<br>
- primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);<br>
+ pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;<br>
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, YStride, frefB, YStride, 32);<br>
return comp(fenc, FENC_STRIDE, subpelbuf, 8);<br>
}<br>
else<br>
{<br>
int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);<br>
- pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;<br>
- return comp(fenc, FENC_STRIDE, fref, lumaStride);<br>
+ pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;<br>
+ return comp(fenc, FENC_STRIDE, fref, YStride);<br>
}<br>
}<br>
};<br>
@@ -188,6 +201,8 @@<br>
<br>
/* Hierarchical Motion Estimation */<br>
bool bEnableHME;<br>
+ int32_t* lowerResMvCosts[2][X265_BFRAME_MAX + 2];<br>
+ MV* lowerResMvs[2][X265_BFRAME_MAX + 2];<br>
<br>
/* used for vbvLookahead */<br>
int plannedType[X265_LOOKAHEAD_MAX + 1];<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/encoder.cpp<br>
--- a/source/encoder/encoder.cpp Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/encoder.cpp Mon Jul 08 10:39:27 2019 +0530<br>
@@ -3387,6 +3387,10 @@<br>
x265_log(p, X265_LOG_WARNING, "Source height < 540p is too low for HME. Disabling HME.\n");<br>
p->bEnableHME = 0;<br>
}<br>
+ if (m_param->bEnableHME && m_param->searchMethod != m_param->hmeSearchMethod[2])<br>
+ {<br>
+ m_param->searchMethod = m_param->hmeSearchMethod[2];<br>
+ }<br>
}<br>
}<br>
<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.cpp<br>
--- a/source/encoder/motion.cpp Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/motion.cpp Mon Jul 08 10:39:27 2019 +0530<br>
@@ -104,6 +104,8 @@<br>
ctuAddr = -1;<br>
absPartIdx = -1;<br>
searchMethod = X265_HEX_SEARCH;<br>
+ searchMethodL0 = X265_HEX_SEARCH;<br>
+ searchMethodL1 = X265_HEX_SEARCH;<br>
subpelRefine = 2;<br>
blockwidth = blockheight = 0;<br>
blockOffset = 0;<br>
@@ -162,7 +164,7 @@<br>
}<br>
<br>
/* Called by lookahead, luma only, no use of PicYuv */<br>
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)<br>
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)<br>
{<br>
partEnum = partitionFromSizes(pwidth, pheight);<br>
X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");<br>
@@ -179,6 +181,8 @@<br>
<br>
/* Search params */<br>
searchMethod = method;<br>
+ searchMethodL0 = searchL0;<br>
+ searchMethodL1 = searchL1;<br>
subpelRefine = refine;<br>
<br>
/* copy PU block into cache */<br>
@@ -743,9 +747,10 @@<br>
pixel * srcReferencePlane)<br>
{<br>
ALIGN_VAR_16(int, costs[16]);<br>
+ bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];<br>
if (ctuAddr >= 0)<br>
blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);<br>
- intptr_t stride = ref->lumaStride;<br>
+ intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;<br>
pixel* fenc = fencPUYuv.m_buf[0];<br>
pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;<br>
<br>
@@ -767,7 +772,7 @@<br>
int bprecost;<br>
<br>
if (ref->isLowres)<br>
- bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);<br>
+ bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);<br>
else<br>
bprecost = subpelCompare(ref, pmv, sad);<br>
<br>
@@ -808,7 +813,8 @@<br>
pmv = pmv.roundToFPel();<br>
MV omv = bmv; // current search origin or starting point<br>
<br>
- switch (searchMethod)<br>
+ int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;<br>
+ switch (search)<br>
{<br>
case X265_DIA_SEARCH:<br>
{<br>
@@ -1391,11 +1397,20 @@<br>
{<br>
// dead slow exhaustive search, but at least it uses sad_x4()<br>
MV tmv;<br>
- for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)<br>
+ int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;<br>
+ if (ref->isHMELowres)<br>
{<br>
- for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)<br>
+ merange = (merange < 0 ? -merange : merange);<br>
+ mvmin_y = X265_MAX(mvmin.y, -merange);<br>
+ mvmin_x = X265_MAX(mvmin.x, -merange);<br>
+ mvmax_y = X265_MIN(mvmax.y, merange);<br>
+ mvmax_x = X265_MIN(mvmax.x, merange);<br>
+ }<br>
+ for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)<br>
+ {<br>
+ for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)<br>
{<br>
- if (tmv.x + 3 <= mvmax.x)<br>
+ if (tmv.x + 3 <= mvmax_x)<br>
{<br>
pixel *pix_base = fref + tmv.y * stride + tmv.x;<br>
sad_x4(fenc,<br>
@@ -1463,12 +1478,12 @@<br>
if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))<br>
continue;<br>
<br>
- int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);<br>
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);<br>
COPY2_IF_LT(bcost, cost, bdir, i);<br>
}<br>
<br>
bmv += square1[bdir] * 2;<br>
- bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);<br>
+ bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);<br>
<br>
bdir = 0;<br>
for (int i = 1; i <= wl.qpel_dirs; i++)<br>
@@ -1479,7 +1494,7 @@<br>
if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))<br>
continue;<br>
<br>
- int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);<br>
+ int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);<br>
COPY2_IF_LT(bcost, cost, bdir, i);<br>
}<br>
<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.h<br>
--- a/source/encoder/motion.h Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/motion.h Mon Jul 08 10:39:27 2019 +0530<br>
@@ -44,6 +44,8 @@<br>
int absPartIdx; // part index of PU, including CU offset within CTU<br>
<br>
int searchMethod;<br>
+ int searchMethodL0;<br>
+ int searchMethodL1;<br>
int subpelRefine;<br>
<br>
int blockwidth;<br>
@@ -76,7 +78,7 @@<br>
<br>
/* Methods called at slice setup */<br>
<br>
- void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);<br>
+ void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);<br>
void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);<br>
<br>
/* buf*() and motionEstimate() methods all use cached fenc pixels and thus<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.cpp<br>
--- a/source/encoder/search.cpp Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/search.cpp Mon Jul 08 10:39:27 2019 +0530<br>
@@ -2096,13 +2096,16 @@<br>
<br>
const MV* amvp = interMode.amvpCand[list][ref];<br>
int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);<br>
- MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];<br>
+ bool bLowresMVP = false;<br>
+ MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;<br>
<br>
if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */<br>
{<br>
MV lmv = getLowresMV(interMode.cu, pu, list, ref);<br>
if (lmv.notZero())<br>
mvc[numMvc++] = lmv;<br>
+ if (m_param->bEnableHME)<br>
+ mvp_lowres = lmv;<br>
}<br>
<br>
setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);<br>
@@ -2110,11 +2113,28 @@<br>
int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, <br>
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);<br>
<br>
+ if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)<br>
+ {<br>
+ MV outmv_lowres;<br>
+ setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);<br>
+ int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,<br>
+ m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);<br>
+ if (lowresMvCost < satdCost)<br>
+ {<br>
+ outmv = outmv_lowres;<br>
+ satdCost = lowresMvCost;<br>
+ bLowresMVP = true;<br>
+ }<br>
+ }<br>
/* Get total cost of partition, but only include MV bit cost once */<br>
bits += m_me.bitcost(outmv);<br>
uint32_t mvCost = m_me.mvcost(outmv);<br>
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);<br>
<br>
+ /* Update LowresMVP to best AMVP cand*/<br>
+ if (bLowresMVP)<br>
+ updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);<br>
+<br>
/* Refine MVP selection, updates: mvpIdx, bits, cost */<br>
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);<br>
<br>
@@ -2346,13 +2366,16 @@<br>
<br>
const MV* amvp = interMode.amvpCand[list][ref];<br>
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);<br>
- MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];<br>
+ MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;<br>
+ bool bLowresMVP = false;<br>
<br>
if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */<br>
{<br>
MV lmv = getLowresMV(cu, pu, list, ref);<br>
if (lmv.notZero())<br>
mvc[numMvc++] = lmv;<br>
+ if (m_param->bEnableHME)<br>
+ mvp_lowres = lmv;<br>
}<br>
if (m_param->searchMethod == X265_SEA)<br>
{<br>
@@ -2365,10 +2388,27 @@<br>
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, <br>
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);<br>
<br>
+ if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)<br>
+ {<br>
+ MV outmv_lowres;<br>
+ setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);<br>
+ int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,<br>
+ m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);<br>
+ if (lowresMvCost < satdCost)<br>
+ {<br>
+ outmv = outmv_lowres;<br>
+ satdCost = lowresMvCost;<br>
+ bLowresMVP = true;<br>
+ }<br>
+ }<br>
+<br>
/* Get total cost of partition, but only include MV bit cost once */<br>
bits += m_me.bitcost(outmv);<br>
uint32_t mvCost = m_me.mvcost(outmv);<br>
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);<br>
+ /* Update LowresMVP to best AMVP cand*/<br>
+ if (bLowresMVP)<br>
+ updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);<br>
<br>
/* Refine MVP selection, updates: mvpIdx, bits, cost */<br>
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);<br>
@@ -2631,6 +2671,15 @@<br>
return amvpCand[mvpIdx];<br>
}<br>
<br>
+/* Update to default MVP when using an alternative mvp */<br>
+void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)<br>
+{<br>
+ int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);<br>
+ uint32_t origOutBits = outBits;<br>
+ outBits = origOutBits + diffBits;<br>
+ outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);<br>
+}<br>
+<br>
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const<br>
{<br>
MV dist((int32_t)merange << 2, (int32_t)merange << 2);<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.h<br>
--- a/source/encoder/search.h Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/search.h Mon Jul 08 10:39:27 2019 +0530<br>
@@ -425,6 +425,7 @@<br>
void setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;<br>
uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);<br>
static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);<br>
+ void updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP);<br>
<br>
/* intra helper functions */<br>
enum { MAX_RD_INTRA_MODES = 16 };<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.cpp<br>
--- a/source/encoder/slicetype.cpp Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/slicetype.cpp Mon Jul 08 10:39:27 2019 +0530<br>
@@ -664,6 +664,7 @@<br>
weightedRef.lumaStride = fenc.lumaStride;<br>
weightedRef.isLowres = true;<br>
weightedRef.isWeighted = false;<br>
+ weightedRef.isHMELowres = ref.bEnableHME;<br>
<br>
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */<br>
float guessScale, fencMean, refMean;<br>
@@ -759,6 +760,8 @@<br>
m_extendGopBoundary = false;<br>
m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>
m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>
+ m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>
+ m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>
m_cuCount = m_8x8Width * m_8x8Height;<br>
m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;<br>
m_isFadeIn = false;<br>
@@ -2782,16 +2785,32 @@<br>
<br>
X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");<br>
<br>
- int firstY = m_lookahead.m_numRowsPerSlice * i;<br>
- int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;<br>
-<br>
- bool lastRow = true;<br>
+ int firstY, lastY;<br>
+ bool lastRow;<br>
+ if (m_lookahead.m_param->bEnableHME)<br>
+ {<br>
+ int numRowsPerSlice = m_lookahead.m_4x4Height / m_lookahead.m_param->lookaheadSlices;<br>
+ numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5), m_lookahead.m_4x4Height);<br>
+ firstY = numRowsPerSlice * i;<br>
+ lastY = (i == m_jobTotal - 1) ? m_lookahead.m_4x4Height - 1 : numRowsPerSlice * (i + 1) - 1;<br>
+ lastRow = true;<br>
+ for (int cuY = lastY; cuY >= firstY; cuY--)<br>
+ {<br>
+ for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)<br>
+ estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1);<br>
+ lastRow = false;<br>
+ }<br>
+ }<br>
+<br>
+ firstY = m_lookahead.m_numRowsPerSlice * i;<br>
+ lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;<br>
+ lastRow = true;<br>
for (int cuY = lastY; cuY >= firstY; cuY--)<br>
{<br>
m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;<br>
<br>
for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)<br>
- estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);<br>
+ estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 0);<br>
<br>
lastRow = false;<br>
}<br>
@@ -2864,13 +2883,25 @@<br>
}<br>
else<br>
{<br>
- bool lastRow = true;<br>
+ /* Calculate MVs for 1/16th resolution*/<br>
+ bool lastRow;<br>
+ if (param->bEnableHME)<br>
+ {<br>
+ lastRow = true;<br>
+ for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)<br>
+ {<br>
+ for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)<br>
+ estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);<br>
+ lastRow = false;<br>
+ }<br>
+ }<br>
+ lastRow = true;<br>
for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)<br>
{<br>
fenc->rowSatds[b - p0][p1 - b][cuY] = 0;<br>
<br>
for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)<br>
- estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);<br>
+ estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);<br>
<br>
lastRow = false;<br>
}<br>
@@ -2891,23 +2922,27 @@<br>
return score;<br>
}<br>
<br>
-void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)<br>
+void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)<br>
{<br>
Lowres *fref0 = m_frames[p0];<br>
Lowres *fref1 = m_frames[p1];<br>
Lowres *fenc = m_frames[b];<br>
<br>
- ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ? &fenc->weightedRef[b - p0] : fref0;<br>
-<br>
- const int widthInCU = m_lookahead.m_8x8Width;<br>
- const int heightInCU = m_lookahead.m_8x8Height;<br>
+ ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;<br>
+<br>
+ const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;<br>
+ const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;<br>
const int bBidir = (b < p1);<br>
const int cuXY = cuX + cuY * widthInCU;<br>
+ const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;<br>
const int cuSize = X265_LOWRES_CU_SIZE;<br>
- const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;<br>
-<br>
- if (bBidir || bDoSearch[0] || bDoSearch[1])<br>
- tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);<br>
+ const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);<br>
+<br>
+ if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)<br>
+ tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);<br>
+ else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)<br>
+ tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);<br>
+<br>
<br>
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */<br>
int lowresPenalty = 4;<br>
@@ -2926,7 +2961,7 @@<br>
<br>
for (int i = 0; i < 1 + bBidir; i++)<br>
{<br>
- int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];<br>
+ int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];<br>
int skipCost = INT_MAX;<br>
<br>
if (!bDoSearch[i])<br>
@@ -2936,8 +2971,8 @@<br>
}<br>
<br>
int numc = 0;<br>
- MV mvc[4], mvp;<br>
- MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];<br>
+ MV mvc[5], mvp;<br>
+ MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];<br>
ReferencePlanes* fref = i ? fref1 : wfref0;<br>
<br>
/* Reverse-order MV prediction */<br>
@@ -2952,6 +2987,10 @@<br>
if (cuX < widthInCU - 1)<br>
MVC(fencMV[widthInCU + 1]);<br>
}<br>
+ if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)<br>
+ {<br>
+ MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);<br>
+ }<br>
#undef MVC<br>
<br>
if (!numc)<br>
@@ -2967,7 +3006,7 @@<br>
for (int idx = 0; idx < numc; idx++)<br>
{<br>
intptr_t stride = X265_LOWRES_CU_SIZE;<br>
- pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride);<br>
+ pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);<br>
int cost = tld.me.bufSATD(src, stride);<br>
COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);<br>
/* Except for mv0 case, everyting else is likely to have enough residual to not trigger the skip. */<br>
@@ -2978,7 +3017,10 @@<br>
<br>
/* ME will never return a cost larger than the cost @MVP, so we do not<br>
* have to check that ME cost is more than the estimated merge cost */<br>
- fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);<br>
+ if(!hme)<br>
+ fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);<br>
+ else<br>
+ fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);<br>
if (skipCost < 64 && skipCost < fencCost && bBidir)<br>
{<br>
fencCost = skipCost;<br>
@@ -2986,6 +3028,8 @@<br>
}<br>
COPY2_IF_LT(bcost, fencCost, listused, i + 1);<br>
}<br>
+ if (hme)<br>
+ return;<br>
<br>
if (bBidir) /* B, also consider bidir */<br>
{<br>
@@ -2995,8 +3039,8 @@<br>
ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);<br>
ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);<br>
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;<br>
- pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);<br>
- pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);<br>
+ pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);<br>
+ pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);<br>
ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);<br>
primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);<br>
int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.h<br>
--- a/source/encoder/slicetype.h Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/slicetype.h Mon Jul 08 10:39:27 2019 +0530<br>
@@ -124,6 +124,10 @@<br>
int m_inputCount;<br>
double m_cuTreeStrength;<br>
<br>
+ /* HME */<br>
+ int m_4x4Width;<br>
+ int m_4x4Height;<br>
+<br>
bool m_isActive;<br>
bool m_sliceTypeBusy;<br>
bool m_bAdaptiveQuant;<br>
@@ -246,7 +250,7 @@<br>
void processTasks(int workerThreadID);<br>
<br>
int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);<br>
- void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);<br>
+ void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme);<br>
<br>
CostEstimateGroup& operator=(const CostEstimateGroup&);<br>
};<br>
diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/weightPrediction.cpp<br>
--- a/source/encoder/weightPrediction.cpp Fri Jul 05 11:17:26 2019 +0530<br>
+++ b/source/encoder/weightPrediction.cpp Mon Jul 08 10:39:27 2019 +0530<br>
@@ -82,7 +82,7 @@<br>
/* clip MV to available pixels */<br>
MV mv = mvs[cu];<br>
mv = mv.clipped(mvmin, mvmax);<br>
- pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);<br>
+ pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);<br>
primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);<br>
}<br>
}<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br clear="all"><div><br></div>-- <br><div dir="ltr" class="gmail_signature"><div dir="ltr"><font face="georgia, serif">Regards,</font><div><font face="georgia, serif">Aruna</font></div></div></div>