[x265] [PATCH] analysis: avoid motion references not used by split blocks in pme mode
Steve Borho
steve at borho.org
Thu Aug 20 05:13:20 CEST 2015
On 08/19, ashok at multicorewareinc.com wrote:
> # HG changeset patch
> # User Ashok Kumar Mishra<ashok at multicorewareinc.com>
> # Date 1439808217 -19800
> # Mon Aug 17 16:13:37 2015 +0530
> # Node ID 868aa6ce2f9033812cdc1c9c4fae31a8a40a2921
> # Parent 2980141a744a569ad6f60dbebdece76a4eababfd
> analysis: avoid motion references not used by split blocks in pme mode
This looks correct, but I worry that it adds some avoidable overhead to the
typical non-PME workflow.
> diff -r 2980141a744a -r 868aa6ce2f90 source/encoder/search.cpp
> --- a/source/encoder/search.cpp Tue Aug 18 12:45:52 2015 +0530
> +++ b/source/encoder/search.cpp Mon Aug 17 16:13:37 2015 +0530
> @@ -1962,10 +1962,16 @@
> /* Perform ME, repeat until no more work is available */
> do
> {
> - if (meId < m_slice->m_numRefIdx[0])
> - slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, meId);
> + if (meId < m_refIdxCnt[0])
> + {
> + int refIdx = m_bestRefIdx[0][meId]; //L0
> + slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
> + }
> else
> - slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
> + {
> + int refIdx = m_bestRefIdx[1][meId - m_refIdxCnt[0]]; //L1
> + slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
> + }
>
> meId = -1;
> pme.m_lock.acquire();
> @@ -2037,8 +2043,6 @@
> const int* numRefIdx = slice->m_numRefIdx;
> uint32_t lastMode = 0;
> int totalmebits = 0;
> - int numME = numRefIdx[0] + numRefIdx[1];
> - bool bTryDistributed = m_param->bDistributeMotionEstimation && numME > 2;
> MV mvzero(0, 0);
> Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
>
> @@ -2047,6 +2051,32 @@
>
> for (int puIdx = 0; puIdx < numPart; puIdx++)
> {
> + uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
> +
> + m_refIdxCnt[0] = 0; m_refIdxCnt[1] = 0;
> + for (int list = 0; list < numPredDir; list++)
> + {
> + int idx = 0;
> + for (int ref = 0; ref < numRefIdx[list]; ref++)
> + {
> + ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
> +
> + if (!(refMask & (1 << ref)))
> + {
> + ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
> + continue;
> + }
> + m_bestRefIdx[list][idx++] = ref;
> + }
> + m_refIdxCnt[list] = idx;
> +
> + /* the second list ref bits start at bit 16 */
> + refMask >>= 16;
> + }
> + int numME = m_refIdxCnt[0] + m_refIdxCnt[1];
> +
> + bool bTryDistributed = m_param->bDistributeMotionEstimation && numME > 2;
My thought on how to avoid that overhead was to add an explicit ref list to the
PME class:
struct {
int list, ref;
} m_jobs[2 * MAX_NUM_REF];
Then, within the if (m_param->bDistributeMotionEstimation) block, the code
would walk the references and the refmasks, adding jobs to pme.m_jobs and
incrementing pme.m_jobTotal. At the end, if m_jobTotal is not large enough,
skip bonding any threads and let it fall through to the serial unidir path
(just as if bonding had failed).
Let me know if you need any further clarifications.
> MotionData* bestME = interMode.bestME[puIdx];
> PredictionUnit pu(cu, cuGeom, puIdx);
>
> @@ -2113,7 +2143,8 @@
> {
> processPME(pme, *this);
>
> - singleMotionEstimation(*this, interMode, pu, puIdx, 0, 0); /* L0-0 */
> + int ref = (m_refIdxCnt[0] == 0) ? m_bestRefIdx[1][0] : m_bestRefIdx[0][0];
> + singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
>
> bDoUnidir = false;
>
> @@ -2126,19 +2157,11 @@
> }
> if (bDoUnidir)
> {
> - uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
> -
> for (int list = 0; list < numPredDir; list++)
> {
> - for (int ref = 0; ref < numRefIdx[list]; ref++)
> + for (int idx = 0; idx < m_refIdxCnt[list]; idx++)
> {
> - ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
> -
> - if (!(refMask & (1 << ref)))
> - {
> - ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
> - continue;
> - }
> + int ref = m_bestRefIdx[list][idx];
>
> uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
> bits += getTUBits(ref, numRefIdx[list]);
> @@ -2173,8 +2196,6 @@
> bestME[list].bits = bits;
> }
> }
> - /* the second list ref bits start at bit 16 */
> - refMask >>= 16;
> }
> }
>
> diff -r 2980141a744a -r 868aa6ce2f90 source/encoder/search.h
> --- a/source/encoder/search.h Tue Aug 18 12:45:52 2015 +0530
> +++ b/source/encoder/search.h Mon Aug 17 16:13:37 2015 +0530
> @@ -316,6 +316,9 @@
> uint32_t m_numLayers;
> uint32_t m_refLagPixels;
>
> + int m_refIdxCnt[2];
> + int m_bestRefIdx[2][16];
> +
> #if DETAILED_CU_STATS
> /* Accumulate CU statistics separately for each frame encoder */
> CUStats m_stats[X265_MAX_FRAME_THREADS];
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list