[x265] [PATCH] threading 2Nx2N, Nx2N and 2NxN predInterSearch in xCompressInterCU
Steve Borho
steve at borho.org
Sat Mar 8 20:12:32 CET 2014
On Fri, Mar 7, 2014 at 12:26 AM, Wenju He <wenju at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Wenju He <wenju at multicorewareinc.com>
> # Date 1394173273 -28800
> # Fri Mar 07 14:21:13 2014 +0800
> # Node ID 7b757a1a99538a40ce2d74b6674ac974993bb2aa
> # Parent 33b67a53b6deb19bd5b5142398f7c8c47ba3d2fa
> threading 2Nx2N, Nx2N and 2NxN predInterSearch in xCompressInterCU
Thanks Wenju,
This patch will need to be shelved for a bit until we're ready for the
next round of performance improvements.
When this is finally merged, I believe the encode analysis structures
like TEncSearch, TComPrediction, MotionEstimation, etc., will need to be
allocated per worker thread instead of per CU row. This will save a
good amount of memory and hopefully improve cache coherence a bit.
This will remove the need to allocate 8 ME objects per row, since each
worker thread cannot do more than one search at a time.
> TEncSearch now inherits from JobProvider;
>
> two bitmaps are added: m_queuedBitmap and m_completeBitmap;
>
> 2Nx2N, Nx2N and 2NxN predInterSearch are enqueued. For each partSize,
> we set (numL0+numL1) bits in m_queuedBitmap to 1, i.e. the motion search against
> each reference frame is a separate job. When a motion search job finishes,
> it sets the corresponding bit in m_completeBitmap to 1;
>
> For each partSize, the last thread to finish its motion search will do bidir
> and merge. At the end, if partIdx < numPart, it will enqueue the jobs of the
> next partIdx iteration, i.e. set (numL0+numL1) bits to 1 in m_queuedBitmap and
> clear m_completeBitmap. When all partIdx iterations have finished, it will set
> another bit to 1 in m_completeBitmap, indicating that this partSize is done;
>
> In TEncCu::xCompressInterCU, we enqueue m_search, run a while loop to call
> findJob() and finally call dequeue();
>
> The number of instances of some member variables is increased, e.g. the
> number of m_me instances is increased to 8.
>
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -64,7 +64,18 @@
> m_refAboveFlt = NULL;
> m_refLeft = NULL;
> m_refLeftFlt = NULL;
> - m_immedVals = NULL;
> +
> + for (int i = 0; i <= SIZE_nRx2N; i++)
> + {
> + for (int j = 0; j < 2; j++)
> + {
> + for (int k = 0; k < MAX_NUM_REF * 2; k++)
> + {
> + m_immedVals[i][j][k] = NULL;
> + }
> + }
> + }
> + m_threading = false;
> }
>
> TComPrediction::~TComPrediction()
> @@ -74,16 +85,26 @@
> X265_FREE(m_refAboveFlt);
> X265_FREE(m_refLeft);
> X265_FREE(m_refLeftFlt);
> - X265_FREE(m_immedVals);
>
> - m_predYuv[0].destroy();
> - m_predYuv[1].destroy();
> - m_predShortYuv[0].destroy();
> - m_predShortYuv[1].destroy();
> - m_predTempYuv.destroy();
> + for (int i = 0; i <= SIZE_nRx2N; i++)
> + {
> + m_predShortYuv[i][0].destroy();
> + m_predShortYuv[i][1].destroy();
> + for (int j = 0; j < 2; j++)
> + {
> + for (int k = 0; k < MAX_NUM_REF; k++)
> + {
> + m_predYuv[i][j][k].destroy();
> + }
> + for (int k = 0; k < MAX_NUM_REF * 2; k++)
> + {
> + X265_FREE(m_immedVals[i][j][k]);
> + }
> + }
> + }
> }
>
> -void TComPrediction::initTempBuff(int csp)
> +void TComPrediction::initTempBuff(int csp, int numPart, int maxNumRef)
> {
> m_hChromaShift = CHROMA_H_SHIFT(csp);
> m_vChromaShift = CHROMA_V_SHIFT(csp);
> @@ -99,13 +120,35 @@
> m_refLeft = X265_MALLOC(pixel, 3 * MAX_CU_SIZE);
> m_refLeftFlt = X265_MALLOC(pixel, 3 * MAX_CU_SIZE);
>
> - m_predYuv[0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> - m_predYuv[1].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> - m_predShortYuv[0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> - m_predShortYuv[1].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> - m_predTempYuv.create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + for (int i = 0; i < numPart; i++)
> + {
> + m_predShortYuv[i][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + m_predShortYuv[i][1].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + }
>
> - m_immedVals = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
> + if (m_threading)
> + {
> + for (int i = 0; i < numPart; i++)
> + {
> + for (int j = 0; j < maxNumRef; j++)
> + {
> + m_predYuv[i][0][j].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + m_immedVals[i][0][j] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
> + m_immedVals[i][0][j + MAX_NUM_REF] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
> + }
> + /* currently there is only one L1 reference */
> + m_predYuv[i][1][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + m_immedVals[i][1][0] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
> + m_immedVals[i][1][0 + MAX_NUM_REF] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
> + }
> + }
> + else
> + {
> + m_predYuv[0][0][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + m_predYuv[0][1][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
> + m_immedVals[0][0][0] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
> + }
> +
> }
> }
>
> @@ -279,7 +322,8 @@
> {
> if (cu->getSlice()->getPPS()->getUseWP())
> {
> - ShortYuv* shortYuv = &m_predShortYuv[0];
> + int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
> + ShortYuv* shortYuv = &m_predShortYuv[partSize][0];
> int refId = cu->getCUMvField(list)->getRefIdx(partAddr);
> assert(refId >= 0);
>
> @@ -321,7 +365,10 @@
> cu->clipMv(mv);
>
> if (bLuma)
> - xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mv, width, height, outPredYuv);
> + {
> + int immedIdx = m_threading ? ((list << 5) + refIdx) : 0;
> + xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mv, width, height, outPredYuv, immedIdx);
> + }
>
> if (bChroma)
> xPredInterChromaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mv, width, height, outPredYuv);
> @@ -345,6 +392,7 @@
> void TComPrediction::xPredInterBi(TComDataCU* cu, uint32_t partAddr, int width, int height, TComYuv* outPredYuv, bool bLuma, bool bChroma)
> {
> assert(cu->getSlice()->isInterB());
> + int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
>
> int refIdx[2];
> refIdx[0] = cu->getCUMvField(REF_PIC_LIST_0)->getRefIdx(partAddr);
> @@ -356,16 +404,16 @@
> {
> assert(refIdx[list] < cu->getSlice()->getNumRefIdx(list));
>
> - xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[list], bLuma, bChroma);
> + xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[partSize][list], bLuma, bChroma);
> }
>
> if (cu->getSlice()->getPPS()->getWPBiPred())
> {
> - xWeightedPredictionBi(cu, &m_predShortYuv[0], &m_predShortYuv[1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
> + xWeightedPredictionBi(cu, &m_predShortYuv[partSize][0], &m_predShortYuv[partSize][1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
> }
> else
> {
> - outPredYuv->addAvg(&m_predShortYuv[0], &m_predShortYuv[1], partAddr, width, height, bLuma, bChroma);
> + outPredYuv->addAvg(&m_predShortYuv[partSize][0], &m_predShortYuv[partSize][1], partAddr, width, height, bLuma, bChroma);
> }
> }
> else if (cu->getSlice()->getPPS()->getWPBiPred())
> @@ -376,10 +424,10 @@
>
> assert(refIdx[list] < cu->getSlice()->getNumRefIdx(list));
>
> - xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[list], bLuma, bChroma);
> + xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[partSize][list], bLuma, bChroma);
> }
>
> - xWeightedPredictionBi(cu, &m_predShortYuv[0], &m_predShortYuv[1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
> + xWeightedPredictionBi(cu, &m_predShortYuv[partSize][0], &m_predShortYuv[partSize][1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
> }
> else if (refIdx[0] >= 0)
> {
> @@ -412,10 +460,10 @@
> * \param height Height of block
> * \param dstPic Pointer to destination picture
> */
> -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic)
> +void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic, int immedIdx)
> {
> int dstStride = dstPic->getStride();
> - pixel *dst = dstPic->getLumaAddr(partAddr);
> + pixel *dst = dstPic->getLumaAddr(partAddr);
>
> int srcStride = refPic->getStride();
> int srcOffset = (mv->x >> 2) + (mv->y >> 2) * srcStride;
> @@ -442,8 +490,10 @@
> int tmpStride = width;
> int filterSize = NTAPS_LUMA;
> int halfFilterSize = (filterSize >> 1);
> - primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1);
> - primitives.luma_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
> + int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
> + int16_t* immedVals = m_immedVals[partSize][immedIdx >> 5][immedIdx & 31];
> + primitives.luma_hps[partEnum](src, srcStride, immedVals, tmpStride, xFrac, 1);
> + primitives.luma_vsp[partEnum](immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
> }
> }
>
> @@ -482,8 +532,10 @@
> int tmpStride = width;
> int filterSize = NTAPS_LUMA;
> int halfFilterSize = (filterSize >> 1);
> - primitives.luma_hps[partEnum](ref, refStride, m_immedVals, tmpStride, xFrac, 1);
> - primitives.luma_vss[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
> + int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
> + int16_t* immedVals = m_immedVals[partSize][0][0];
> + primitives.luma_hps[partEnum](ref, refStride, immedVals, tmpStride, xFrac, 1);
> + primitives.luma_vss[partEnum](immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
> }
> }
>
> @@ -540,12 +592,14 @@
> int extStride = width >> m_hChromaShift;
> int filterSize = NTAPS_CHROMA;
> int halfFilterSize = (filterSize >> 1);
> + int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
> + int16_t* immedVals = m_immedVals[partSize][0][0];
>
> - primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> - primitives.chroma[csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> + primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> + primitives.chroma[csp].filter_vsp[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
>
> - primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> - primitives.chroma[csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> + primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> + primitives.chroma[csp].filter_vsp[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> }
> }
>
> @@ -597,10 +651,13 @@
> int extStride = cxWidth;
> int filterSize = NTAPS_CHROMA;
> int halfFilterSize = (filterSize >> 1);
> - primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> - primitives.chroma[csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> - primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> - primitives.chroma[csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> + int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
> + int16_t* immedVals = m_immedVals[partSize][0][0];
> +
> + primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> + primitives.chroma[csp].filter_vss[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> + primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
> + primitives.chroma[csp].filter_vss[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
> }
> }
>
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComPrediction.h
> --- a/source/Lib/TLibCommon/TComPrediction.h Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibCommon/TComPrediction.h Fri Mar 07 14:21:13 2014 +0800
> @@ -63,19 +63,20 @@
> {
> protected:
>
> - // references sample for IntraPrediction
> - TComYuv m_predYuv[2];
> - ShortYuv m_predShortYuv[2];
> - TComYuv m_predTempYuv;
> + // references sample for InterPrediction
> + TComYuv m_predYuv[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + ShortYuv m_predShortYuv[SIZE_nRx2N + 1][2];
>
> - int16_t* m_immedVals;
> + int16_t* m_immedVals[SIZE_nRx2N + 1][2][MAX_NUM_REF * 2];
> + bool m_threading;
> +
> int m_hChromaShift;
> int m_vChromaShift;
>
> // motion compensation functions
> void xPredInterUni(TComDataCU* cu, uint32_t partAddr, int width, int height, int picList, TComYuv* outPredYuv, bool bLuma, bool bChroma);
> void xPredInterUni(TComDataCU* cu, uint32_t partAddr, int width, int height, int picList, ShortYuv* outPredYuv, bool bLuma, bool bChroma);
> - void xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic);
> + void xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic, int immedIdx = 0);
> void xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic);
> void xPredInterChromaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic);
> void xPredInterChromaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic);
> @@ -99,7 +100,7 @@
> TComPrediction();
> virtual ~TComPrediction();
>
> - void initTempBuff(int csp);
> + void initTempBuff(int csp, int numPart, int maxNumRef);
>
> // inter
> void motionCompensation(TComDataCU* cu, TComYuv* predYuv, int picList = REF_PIC_LIST_X, int partIdx = -1, bool bLuma = true, bool bChroma = true);
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComSlice.cpp
> --- a/source/Lib/TLibCommon/TComSlice.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibCommon/TComSlice.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -374,9 +374,9 @@
> * \param *&wpScalingParam
> * \returns void
> */
> -void TComSlice::getWpScaling(int l, int refIdx, wpScalingParam *&wp)
> +void TComSlice::getWpScaling(int l, int refIdx, wpScalingParam *&wp, int partSize)
> {
> - wp = m_weightPredTable[l][refIdx];
> + wp = m_weightPredTable[partSize][l][refIdx];
> }
>
> /** reset Default WP tables settings : no weight.
> @@ -391,11 +391,14 @@
> {
> for (int yuv = 0; yuv < 3; yuv++)
> {
> - wpScalingParam *pwp = &(m_weightPredTable[e][i][yuv]);
> - pwp->bPresentFlag = false;
> - pwp->log2WeightDenom = 0;
> - pwp->inputWeight = 1;
> - pwp->inputOffset = 0;
> + for (int partSize = 0; partSize <= SIZE_nRx2N; partSize++)
> + {
> + wpScalingParam *pwp = &(m_weightPredTable[partSize][e][i][yuv]);
> + pwp->bPresentFlag = false;
> + pwp->log2WeightDenom = 0;
> + pwp->inputWeight = 1;
> + pwp->inputOffset = 0;
> + }
> }
> }
> }
> @@ -412,18 +415,21 @@
> {
> for (int yuv = 0; yuv < 3; yuv++)
> {
> - wpScalingParam *pwp = &(m_weightPredTable[e][i][yuv]);
> - if (!pwp->bPresentFlag)
> + for (int partSize = 0; partSize <= SIZE_nRx2N; partSize++)
> {
> - // Inferring values not present :
> - pwp->inputWeight = (1 << pwp->log2WeightDenom);
> - pwp->inputOffset = 0;
> + wpScalingParam *pwp = &(m_weightPredTable[partSize][e][i][yuv]);
> + if (!pwp->bPresentFlag)
> + {
> + // Inferring values not present :
> + pwp->inputWeight = (1 << pwp->log2WeightDenom);
> + pwp->inputOffset = 0;
> + }
> +
> + pwp->w = pwp->inputWeight;
> + pwp->o = pwp->inputOffset << (X265_DEPTH - 8);
> + pwp->shift = pwp->log2WeightDenom;
> + pwp->round = (pwp->log2WeightDenom >= 1) ? (1 << (pwp->log2WeightDenom - 1)) : (0);
> }
> -
> - pwp->w = pwp->inputWeight;
> - pwp->o = pwp->inputOffset << (X265_DEPTH - 8);
> - pwp->shift = pwp->log2WeightDenom;
> - pwp->round = (pwp->log2WeightDenom >= 1) ? (1 << (pwp->log2WeightDenom - 1)) : (0);
> }
> }
> }
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComSlice.h
> --- a/source/Lib/TLibCommon/TComSlice.h Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibCommon/TComSlice.h Fri Mar 07 14:21:13 2014 +0800
> @@ -1346,7 +1346,7 @@
>
> public:
>
> - wpScalingParam m_weightPredTable[2][MAX_NUM_REF][3]; // [REF_PIC_LIST_0 or REF_PIC_LIST_1][refIdx][0:Y, 1:U, 2:V]
> + wpScalingParam m_weightPredTable[SIZE_nRx2N + 1][2][MAX_NUM_REF][3]; // [partSize][REF_PIC_LIST_0 or REF_PIC_LIST_1][refIdx][0:Y, 1:U, 2:V]
> int m_numWPRefs; // number of references for which unidirectional weighted prediction is used
>
> TComSlice();
> @@ -1528,9 +1528,15 @@
>
> bool getFinalized() { return m_bFinalized; }
>
> - void setWpScaling(wpScalingParam wp[2][MAX_NUM_REF][3]) { memcpy(m_weightPredTable, wp, sizeof(wpScalingParam) * 2 * MAX_NUM_REF * 3); }
> + void setWpScaling(wpScalingParam wp[2][MAX_NUM_REF][3])
> + {
> + for (int partSize = 0; partSize <= SIZE_nRx2N; partSize++)
> + {
> + memcpy(m_weightPredTable[partSize], wp, sizeof(wpScalingParam) * 2 * MAX_NUM_REF * 3);
> + }
> + }
>
> - void getWpScaling(int e, int refIdx, wpScalingParam *&wp);
> + void getWpScaling(int l, int refIdx, wpScalingParam *&wp, int partSize = 0);
>
> void resetWpScaling();
> void initWpScaling();
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComWeightPrediction.cpp
> --- a/source/Lib/TLibCommon/TComWeightPrediction.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibCommon/TComWeightPrediction.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -510,11 +510,11 @@
> { // explicit --------------------
> if (refIdx0 >= 0)
> {
> - slice->getWpScaling(REF_PIC_LIST_0, refIdx0, wp0);
> + slice->getWpScaling(REF_PIC_LIST_0, refIdx0, wp0, cu->getPartitionSize(0));
> }
> if (refIdx1 >= 0)
> {
> - slice->getWpScaling(REF_PIC_LIST_1, refIdx1, wp1);
> + slice->getWpScaling(REF_PIC_LIST_1, refIdx1, wp1, cu->getPartitionSize(0));
> }
> }
> else
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibEncoder/TEncCu.h
> --- a/source/Lib/TLibEncoder/TEncCu.h Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncCu.h Fri Mar 07 14:21:13 2014 +0800
> @@ -169,6 +169,8 @@
> void xComputeCostIntraInInter(TComDataCU* cu, PartSize partSize);
> void xCheckRDCostInter(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, bool bUseMRG = false);
> void xComputeCostInter(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
> + void xComputeCostInterEnqueue(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
> + void xComputeDistortionCostInter(TComDataCU* outTempCU, TComYuv* outPredYUV);
> void xComputeCostMerge2Nx2N(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& tmpPredYuv);
> void xEncodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, TComYuv* outReconYuv);
> void encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, UChar depth);
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -52,7 +52,7 @@
> //! \ingroup TLibEncoder
> //! \{
>
> -TEncSearch::TEncSearch()
> +TEncSearch::TEncSearch() : JobProvider(0), TComPrediction()
> {
> m_qtTempCoeffY = NULL;
> m_qtTempCoeffCb = NULL;
> @@ -75,6 +75,12 @@
> m_entropyCoder = NULL;
> m_rdSbacCoders = NULL;
> m_rdGoOnSbacCoder = NULL;
> +
> + m_queuedBitmap = NULL;
> + m_completeBitmap = NULL;
> + m_queuedCU = NULL;
> + m_queuedPredYuv = NULL;
> + m_queuedbUseMRG = NULL;
> }
>
> TEncSearch::~TEncSearch()
> @@ -106,18 +112,35 @@
> delete[] m_qtTempCoeffCr;
> delete[] m_qtTempShortYuv;
> m_qtTempTransformSkipYuv.destroy();
> - m_tmpYuvPred.destroy();
> + for (int i = 0; i <= m_numWords; i++)
> + {
> + m_tmpYuvPred[i].destroy();
> + }
> +
> + X265_FREE((void*)m_queuedBitmap);
> + X265_FREE((void*)m_completeBitmap);
> + X265_FREE(m_queuedCU);
> + X265_FREE(m_queuedPredYuv);
> + X265_FREE(m_queuedbUseMRG);
> }
>
> bool TEncSearch::init(Encoder* cfg, TComRdCost* rdCost, TComTrQuant* trQuant)
> {
> + bool ok = true;
> m_cfg = cfg;
> m_trQuant = trQuant;
> m_rdCost = rdCost;
>
> - initTempBuff(cfg->param->internalCsp);
> - m_me.setSearchMethod(cfg->param->searchMethod);
> - m_me.setSubpelRefine(cfg->param->subpelRefine);
> + m_threading = m_pool != NULL && cfg->param->rdLevel < 5 && ThreadPool::getThreadPool()->getThreadCount() >= 16;
> + m_numWords = (m_threading && cfg->param->bEnableRectInter) ? 3 : 1;
> +
> + initTempBuff(cfg->param->internalCsp, m_numWords, cfg->param->maxNumReferences);
> +
> + for (int i = 0; i <= m_numWords; i++)
> + {
> + m_me[i].setSearchMethod(cfg->param->searchMethod);
> + m_me[i].setSubpelRefine(cfg->param->subpelRefine);
> + }
>
> /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
> * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
> @@ -142,7 +165,7 @@
> m_qtTempCoeffY[i] = X265_MALLOC(TCoeff, g_maxCUSize * g_maxCUSize);
> m_qtTempCoeffCb[i] = X265_MALLOC(TCoeff, (g_maxCUSize >> m_hChromaShift) * (g_maxCUSize >> m_vChromaShift));
> m_qtTempCoeffCr[i] = X265_MALLOC(TCoeff, (g_maxCUSize >> m_hChromaShift) * (g_maxCUSize >> m_vChromaShift));
> - m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
> + ok = ok && m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
> }
>
> const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
> @@ -161,17 +184,39 @@
> CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
>
> - return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp) &&
> - m_tmpYuvPred.create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
> + ok = ok && m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
> +
> + for (int i = 0; i < m_numWords; i++)
> + {
> + ok = ok && m_tmpYuvPred[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
> + }
> +
> + if (m_threading)
> + {
> + CHECKED_MALLOC(m_queuedBitmap, uint64_t, m_numWords);
> + memset((void*)m_queuedBitmap, 0, sizeof(uint64_t) * m_numWords);
> +
> + CHECKED_MALLOC(m_completeBitmap, uint64_t, m_numWords);
> + memset((void*)m_completeBitmap, 0, sizeof(uint64_t) * m_numWords);
> +
> + CHECKED_MALLOC(m_queuedCU, TComDataCU*, m_numWords);
> + CHECKED_MALLOC(m_queuedPredYuv, TComYuv*, m_numWords);
> + CHECKED_MALLOC(m_queuedbUseMRG, bool, m_numWords);
> + }
> +
> + return ok;
>
> fail:
> - return false;
> + return false;
> }
>
> void TEncSearch::setQPLambda(int QP, double lambdaLuma, double lambdaChroma)
> {
> m_trQuant->setLambda(lambdaLuma, lambdaChroma);
> - m_me.setQP(QP);
> + for (int i = 0; i <= m_numWords; i++)
> + {
> + m_me[i].setQP(QP);
> + }
> }
>
> void TEncSearch::xEncSubdivCbfQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, bool bLuma, bool bChroma)
> @@ -2448,9 +2493,10 @@
> uint32_t absPartIdx;
> int width, height;
>
> - motionCompensation(cu, &m_tmpYuvPred, REF_PIC_LIST_X, partIdx, true, false);
> cu->getPartIndexAndSize(partIdx, absPartIdx, width, height);
> - uint32_t cost = m_me.bufSA8D(m_tmpYuvPred.getLumaAddr(absPartIdx), m_tmpYuvPred.getStride());
> + int partSizeId = m_threading ? (int)cu->getPartitionSize(absPartIdx) : 0;
> + motionCompensation(cu, &m_tmpYuvPred[partSizeId], REF_PIC_LIST_X, partIdx, true, false);
> + uint32_t cost = m_me[partSizeId].bufSA8D(m_tmpYuvPred[partSizeId].getLumaAddr(absPartIdx), m_tmpYuvPred[partSizeId].getStride());
> x265_emms();
> return cost;
> }
> @@ -2599,11 +2645,11 @@
> cu->getPartIndexAndSize(partIdx, partAddr, roiWidth, roiHeight);
>
> Pel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
> - m_me.setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
> -
> - cu->getMvPredLeft(m_mvPredictors[0]);
> - cu->getMvPredAbove(m_mvPredictors[1]);
> - cu->getMvPredAboveRight(m_mvPredictors[2]);
> + m_me[0].setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
> +
> + cu->getMvPredLeft(m_mvPredictors[0][0]);
> + cu->getMvPredAbove(m_mvPredictors[0][1]);
> + cu->getMvPredAboveRight(m_mvPredictors[0][2]);
>
> bool bTestNormalMC = true;
>
> @@ -2635,12 +2681,12 @@
>
> MV mvmin, mvmax;
> xSetSearchRange(cu, mvp, merange, mvmin, mvmax);
> - int satdCost = m_me.motionEstimate(m_mref[list][idx],
> - mvmin, mvmax, mvp, 3, m_mvPredictors, merange, outmv);
> + int satdCost = m_me[0].motionEstimate(m_mref[list][idx],
> + mvmin, mvmax, mvp, 3, m_mvPredictors[0], merange, outmv);
>
> /* Get total cost of partition, but only include MV bit cost once */
> - bitsTemp += m_me.bitcost(outmv);
> - costTemp = (satdCost - m_me.mvcost(outmv)) + m_rdCost->getCost(bitsTemp);
> + bitsTemp += m_me[0].bitcost(outmv);
> + costTemp = (satdCost - m_me[0].mvcost(outmv)) + m_rdCost->getCost(bitsTemp);
>
> xCheckBestMVP(&amvpInfo[list][idx], mvTemp[list][idx], mvPred[list][idx], mvpIdx[list][idx], bitsTemp, costTemp);
>
> @@ -2678,16 +2724,16 @@
> ::memcpy(mvpIdxBi, mvpIdx, sizeof(mvpIdx));
>
> // Generate reference subpels
> - xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_0, refIdx[0])->getPicYuvRec(), partAddr, &mv[0], roiWidth, roiHeight, &m_predYuv[0]);
> - xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_1, refIdx[1])->getPicYuvRec(), partAddr, &mv[1], roiWidth, roiHeight, &m_predYuv[1]);
> -
> - pixel *ref0 = m_predYuv[0].getLumaAddr(partAddr);
> - pixel *ref1 = m_predYuv[1].getLumaAddr(partAddr);
> + xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_0, refIdx[0])->getPicYuvRec(), partAddr, &mv[0], roiWidth, roiHeight, &m_predYuv[0][0][0]);
> + xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_1, refIdx[1])->getPicYuvRec(), partAddr, &mv[1], roiWidth, roiHeight, &m_predYuv[0][1][0]);
> +
> + pixel *ref0 = m_predYuv[0][0][0].getLumaAddr(partAddr);
> + pixel *ref1 = m_predYuv[0][1][0].getLumaAddr(partAddr);
>
> ALIGN_VAR_32(pixel, avg[MAX_CU_SIZE * MAX_CU_SIZE]);
>
> int partEnum = partitionFromSizes(roiWidth, roiHeight);
> - primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[0].getStride(), ref1, m_predYuv[1].getStride(), 32);
> + primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[0][0][0].getStride(), ref1, m_predYuv[0][1][0].getStride(), 32);
> int satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
> x265_emms();
> bits[2] = bits[0] + bits[1] - mbBits[0] - mbBits[1] + mbBits[2];
> @@ -2704,11 +2750,11 @@
> x265_emms();
>
> unsigned int bitsZero0, bitsZero1;
> - m_me.setMVP(mvPredBi[0][refIdxBidir[0]]);
> - bitsZero0 = bits[0] - m_me.bitcost(mv[0]) + m_me.bitcost(mvzero);
> -
> - m_me.setMVP(mvPredBi[1][refIdxBidir[1]]);
> - bitsZero1 = bits[1] - m_me.bitcost(mv[1]) + m_me.bitcost(mvzero);
> + m_me[0].setMVP(mvPredBi[0][refIdxBidir[0]]);
> + bitsZero0 = bits[0] - m_me[0].bitcost(mv[0]) + m_me[0].bitcost(mvzero);
> +
> + m_me[0].setMVP(mvPredBi[1][refIdxBidir[1]]);
> + bitsZero1 = bits[1] - m_me[0].bitcost(mv[1]) + m_me[0].bitcost(mvzero);
>
> uint32_t costZero = satdCost + m_rdCost->getCost(bitsZero0) + m_rdCost->getCost(bitsZero1);
>
> @@ -2868,6 +2914,338 @@
> cu->m_totalBits = totalmebits;
> }
>
> +/** search of the best candidate for inter prediction, multi-thread version
> + * \param cu
> + * \param predYuv
> + * \param bUseMRG
> + * \param bLuma
> + * \param bChroma
> + * \returns void
> + */
> +void TEncSearch::predInterSearch(TComDataCU* cu, TComYuv* predYuv, int id, bool bUseMRG, bool bLuma, bool bChroma)
> +{
> + MV mvzero(0, 0);
> + MV mv[2];
> + MV mvBidir[2];
> + MV mvPredBi[2][MAX_NUM_REF];
> + int mvpIdxBi[2][MAX_NUM_REF];
> +
> + uint32_t mbBits[3] = { 1, 1, 0 };
> + int refIdx[2] = { 0, 0 }; /* If un-initialized, may cause SEGV in bi-directional prediction iterative stage. */
> + int refIdxBidir[2] = { 0, 0 };
> +
> + PartSize partSize = cu->getPartitionSize(0);
> + int numPart = cu->getNumPartInter();
> + int numPredDir = cu->getSlice()->isInterP() ? 1 : 2;
> +
> + uint32_t listCost[2] = { MAX_UINT, MAX_UINT };
> + uint32_t bits[3];
> + uint32_t costbi = MAX_UINT;
> + MV mvValidList1(0, 0);
> + int refIdxValidList1 = 0;
> + uint32_t bitsValidList1 = MAX_UINT;
> + uint32_t costValidList1 = MAX_UINT;
> +
> + int& partIdx = m_partIdx[partSize];
> +
> + uint32_t partAddr;
> + int roiWidth, roiHeight;
> + xGetBlkBits(partSize, cu->getSlice()->isInterP(), partIdx, m_lastMode[partSize], mbBits);
> + cu->getPartIndexAndSize(partIdx, partAddr, roiWidth, roiHeight);
> + TComPicYuv *fenc = cu->getSlice()->getPic()->getPicYuvOrg();
> + Pel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
> +
> + int numRefL0 = cu->getSlice()->getNumRefIdx(REF_PIC_LIST_0);
> + int numRefL1 = cu->getSlice()->getNumRefIdx(REF_PIC_LIST_1);
> +
> + bool bTestNormalMC = true;
> +
> + if (bUseMRG && cu->getCUSize(0) > 8 && numPart == 2)
> + {
> + bTestNormalMC = false;
> + }
> +
> + if (id == (numRefL0 + numRefL1))
> + {
> + TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS << 1]; // double length for mv of both lists
> + UChar interDirNeighbours[MRG_MAX_NUM_CANDS];
> + int numValidMergeCand = 0;
> +
> + m_mrgInterDir[partSize] = 0;
> + m_mrgMvField[partSize][0].setMvField(MV(0, 0), -1);
> + m_mrgMvField[partSize][1].setMvField(MV(0, 0), -1);
> + m_mrgIndex[partSize] = 0;
> + m_mrgBits[partSize] = 0;
> +
> + /* find Merge result */
> + xMergeEstimation(cu, partIdx, m_mrgInterDir[partSize], m_mrgMvField[partSize], m_mrgIndex[partSize],
> + m_mrgCost[partSize], m_mrgBits[partSize], mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
> + }
> +
> + if (id < (numRefL0 + numRefL1))
> + {
> + /* Uni-directional prediction */
> + int list = id < numRefL0 ? 0 : 1;
> + int idx = id - list * numRefL0;
> +
> + m_bitsTemp[partSize][list][idx] = mbBits[list];
> + if (cu->getSlice()->getNumRefIdx(list) > 1)
> + {
> + m_bitsTemp[partSize][list][idx] += idx + 1;
> + if (idx == cu->getSlice()->getNumRefIdx(list) - 1) m_bitsTemp[partSize][list][idx]--;
> + }
> + uint32_t biPDistTemp = MAX_INT;
> + xEstimateMvPredAMVP(cu, partIdx, list, idx, m_mvPred[partSize][list][idx], &m_amvpInfo[partSize][list][idx], &biPDistTemp);
> +
> + m_bitsTemp[partSize][list][idx] += MVP_IDX_BITS;
> + int merange = m_adaptiveRange[list][idx];
> + MV& mvp = m_mvPred[partSize][list][idx];
> + MV& outmv = m_mvTemp[partSize][list][idx];
> +
> + MV mvmin, mvmax;
> + xSetSearchRange(cu, mvp, merange, mvmin, mvmax);
> + int satdCost = m_me[partSize].motionEstimate(m_mref[list][idx], mvmin, mvmax, mvp, 3, m_mvPredictors[partSize], merange, outmv);
> +
> + /* Get total cost of partition, but only include MV bit cost once */
> + m_bitsTemp[partSize][list][idx] += m_me[partSize].bitcost(mvp, outmv);
> + m_costTemp[partSize][list][idx] = (satdCost - m_me[partSize].mvcost(mvp, outmv)) + m_rdCost->getCost(m_bitsTemp[partSize][list][idx]);
> +
> + xCheckBestMVP(&m_amvpInfo[partSize][list][idx], m_mvTemp[partSize][list][idx], m_mvPred[partSize][list][idx],
> + m_mvpIdx[partSize][list][idx], m_bitsTemp[partSize][list][idx], m_costTemp[partSize][list][idx], partSize);
> + }
> +
> + if (bTestNormalMC)
> + {
> + uint64_t oldval = ATOMIC_OR(&m_completeBitmap[partSize], 1LL << id);
> + oldval |= 1LL << id;
> +
> + /* let the last finished thread do bidir */
> + uint64_t finish = 0;
> + int maxId = (cu->getPartitionSize(partAddr) == SIZE_2Nx2N) ? (numRefL0 + numRefL1 - 1) : numRefL0 + numRefL1;
> + for (int i = 0; i <= maxId; i++)
> + {
> + finish |= 1LL << i;
> + }
> + if ((oldval & finish) != finish)
> + {
> + return;
> + }
> + }
> +
> + if (bTestNormalMC)
> + {
> + for (int list = 0; list < numPredDir; list++)
> + {
> + for (int idx = 0; idx < cu->getSlice()->getNumRefIdx(list); idx++)
> + {
> + if (m_costTemp[partSize][list][idx] < listCost[list])
> + {
> + listCost[list] = m_costTemp[partSize][list][idx];
> + bits[list] = m_bitsTemp[partSize][list][idx]; /* storing for bi-prediction */
> +
> + /* set motion */
> + mv[list] = m_mvTemp[partSize][list][idx];
> + refIdx[list] = idx;
> + }
> +
> + if (list == 1 && m_costTemp[partSize][list][idx] < costValidList1)
> + {
> + costValidList1 = m_costTemp[partSize][list][idx];
> + bitsValidList1 = m_bitsTemp[partSize][list][idx];
> +
> + /* set motion */
> + mvValidList1 = m_mvTemp[partSize][list][idx];
> + refIdxValidList1 = idx;
> + }
> + }
> + }
> +
> + /* Bi-directional prediction */
> + if ((cu->getSlice()->isInterB()) && (cu->isBipredRestriction() == false))
> + {
> + mvBidir[0] = mv[0];
> + mvBidir[1] = mv[1];
> + refIdxBidir[0] = refIdx[0];
> + refIdxBidir[1] = refIdx[1];
> +
> + ::memcpy(mvPredBi, m_mvPred[partSize], sizeof(mvPredBi));
> + ::memcpy(mvpIdxBi, m_mvpIdx[partSize], sizeof(mvpIdxBi));
> +
> + /* Generate reference subpels */
> + xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_0, refIdx[0])->getPicYuvRec(), partAddr, &mv[0], roiWidth, roiHeight, &m_predYuv[partSize][0][0]);
> + xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_1, refIdx[1])->getPicYuvRec(), partAddr, &mv[1], roiWidth, roiHeight, &m_predYuv[partSize][1][0]);
> +
> + pixel *ref0 = m_predYuv[partSize][0][0].getLumaAddr(partAddr);
> + pixel *ref1 = m_predYuv[partSize][1][0].getLumaAddr(partAddr);
> +
> + ALIGN_VAR_32(pixel, avg[MAX_CU_SIZE * MAX_CU_SIZE]);
> +
> + int partEnum = partitionFromSizes(roiWidth, roiHeight);
> + primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[partSize][0][0].getStride(), ref1, m_predYuv[partSize][1][0].getStride(), 32);
> + int satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
> + x265_emms();
> + bits[2] = bits[0] + bits[1] - mbBits[0] - mbBits[1] + mbBits[2];
> + costbi = satdCost + m_rdCost->getCost(bits[2]);
> +
> + if (mv[0].notZero() || mv[1].notZero())
> + {
> + ref0 = m_mref[0][refIdx[0]]->fpelPlane + (pu - fenc->getLumaAddr()); //MV(0,0) of ref0
> + ref1 = m_mref[1][refIdx[1]]->fpelPlane + (pu - fenc->getLumaAddr()); //MV(0,0) of ref1
> + intptr_t refStride = m_mref[0][refIdx[0]]->lumaStride;
> +
> + primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, refStride, ref1, refStride, 32);
> + satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
> + x265_emms();
> +
> + unsigned int bitsZero0, bitsZero1;
> + m_me[partSize].setMVP(mvPredBi[0][refIdxBidir[0]]);
> + bitsZero0 = bits[0] - m_me[partSize].bitcost(mv[0]) + m_me[partSize].bitcost(mvzero);
> +
> + m_me[partSize].setMVP(mvPredBi[1][refIdxBidir[1]]);
> + bitsZero1 = bits[1] - m_me[partSize].bitcost(mv[1]) + m_me[partSize].bitcost(mvzero);
> +
> + uint32_t costZero = satdCost + m_rdCost->getCost(bitsZero0) + m_rdCost->getCost(bitsZero1);
> +
> + MV mvpZero[2];
> + int mvpidxZero[2];
> + mvpZero[0] = mvPredBi[0][refIdxBidir[0]];
> + mvpidxZero[0] = mvpIdxBi[0][refIdxBidir[0]];
> + xCheckBestMVP(&m_amvpInfo[partSize][0][refIdxBidir[0]], mvzero, mvpZero[0], mvpidxZero[0], bitsZero0, costZero, partSize);
> + mvpZero[1] = mvPredBi[1][refIdxBidir[1]];
> + mvpidxZero[1] = mvpIdxBi[1][refIdxBidir[1]];
> + xCheckBestMVP(&m_amvpInfo[partSize][1][refIdxBidir[1]], mvzero, mvpZero[1], mvpidxZero[1], bitsZero1, costZero, partSize);
> +
> + if (costZero < costbi)
> + {
> + costbi = costZero;
> + mvBidir[0].x = mvBidir[0].y = 0;
> + mvBidir[1].x = mvBidir[1].y = 0;
> + mvPredBi[0][refIdxBidir[0]] = mvpZero[0];
> + mvPredBi[1][refIdxBidir[1]] = mvpZero[1];
> + mvpIdxBi[0][refIdxBidir[0]] = mvpidxZero[0];
> + mvpIdxBi[1][refIdxBidir[1]] = mvpidxZero[1];
> + bits[2] = bitsZero0 + bitsZero1 - mbBits[0] - mbBits[1] + mbBits[2];
> + }
> + }
> + } /* if (B_SLICE) */
> + } /* end if bTestNormalMC */
> +
> + /* Clear Motion Field */
> + cu->getCUMvField(REF_PIC_LIST_0)->setAllMvField(TComMvField(), partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(TComMvField(), partSize, partAddr, 0, partIdx);
> +
> + uint32_t mebits = 0;
> + /* Set Motion Field */
> + mv[1] = mvValidList1;
> + refIdx[1] = refIdxValidList1;
> + bits[1] = bitsValidList1;
> + listCost[1] = costValidList1;
> +
> + if (bTestNormalMC)
> + {
> + if (costbi <= listCost[0] && costbi <= listCost[1])
> + {
> + m_lastMode[partSize] = 2;
> + cu->getCUMvField(REF_PIC_LIST_0)->setAllMv(mvBidir[0], partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_0)->setAllRefIdx(refIdxBidir[0], partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_1)->setAllMv(mvBidir[1], partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_1)->setAllRefIdx(refIdxBidir[1], partSize, partAddr, 0, partIdx);
> +
> + MV mvtmp = mvBidir[0] - mvPredBi[0][refIdxBidir[0]];
> + cu->getCUMvField(REF_PIC_LIST_0)->setMvd(partAddr, mvtmp);
> + mvtmp = mvBidir[1] - mvPredBi[1][refIdxBidir[1]];
> + cu->getCUMvField(REF_PIC_LIST_1)->setMvd(partAddr, mvtmp);
> +
> + cu->setInterDirSubParts(3, partAddr, partIdx, cu->getDepth(0));
> +
> + cu->setMVPIdx(REF_PIC_LIST_0, partAddr, mvpIdxBi[0][refIdxBidir[0]]);
> + cu->setMVPIdx(REF_PIC_LIST_1, partAddr, mvpIdxBi[1][refIdxBidir[1]]);
> +
> + mebits = bits[2];
> + }
> + else if (listCost[0] <= listCost[1])
> + {
> + m_lastMode[partSize] = 0;
> + cu->getCUMvField(REF_PIC_LIST_0)->setAllMv(mv[0], partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_0)->setAllRefIdx(refIdx[0], partSize, partAddr, 0, partIdx);
> +
> + MV mvtmp = mv[0] - m_mvPred[partSize][0][refIdx[0]];
> + cu->getCUMvField(REF_PIC_LIST_0)->setMvd(partAddr, mvtmp);
> +
> + cu->setInterDirSubParts(1, partAddr, partIdx, cu->getDepth(0));
> +
> + cu->setMVPIdx(REF_PIC_LIST_0, partAddr, m_mvpIdx[partSize][0][refIdx[0]]);
> +
> + mebits = bits[0];
> + }
> + else
> + {
> + m_lastMode[partSize] = 1;
> + cu->getCUMvField(REF_PIC_LIST_1)->setAllMv(mv[1], partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_1)->setAllRefIdx(refIdx[1], partSize, partAddr, 0, partIdx);
> +
> + MV mvtmp = mv[1] - m_mvPred[partSize][1][refIdx[1]];
> + cu->getCUMvField(REF_PIC_LIST_1)->setMvd(partAddr, mvtmp);
> +
> + cu->setInterDirSubParts(2, partAddr, partIdx, cu->getDepth(0));
> +
> + cu->setMVPIdx(REF_PIC_LIST_1, partAddr, m_mvpIdx[partSize][1][refIdx[1]]);
> +
> + mebits = bits[1];
> + }
> + } /* end if bTestNormalMC */
> +
> + uint32_t totalbits = mebits;
> + cu->setMergeFlag(partAddr, false);
> +
> + if (cu->getPartitionSize(partAddr) != SIZE_2Nx2N)
> + {
> + /* calculate ME cost */
> + uint32_t meError = MAX_UINT;
> + uint32_t meCost = MAX_UINT;
> +
> + if (bTestNormalMC)
> + {
> + meError = xGetInterPredictionError(cu, partIdx);
> + meCost = meError + m_rdCost->getCost(mebits);
> + }
> +
> + /* compare with Merge result */
> + if (m_mrgCost[partSize] < meCost)
> + {
> + // set Merge result
> + cu->setMergeFlag(partAddr, true);
> + cu->setMergeIndex(partAddr, m_mrgIndex[partSize]);
> + cu->setInterDirSubParts(m_mrgInterDir[partSize], partAddr, partIdx, cu->getDepth(partAddr));
> + {
> + cu->getCUMvField(REF_PIC_LIST_0)->setAllMvField(m_mrgMvField[partSize][0], partSize, partAddr, 0, partIdx);
> + cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(m_mrgMvField[partSize][1], partSize, partAddr, 0, partIdx);
> + }
> + totalbits = m_mrgBits[partSize];
> + }
> + }
> +
> + if (partIdx == 0)
> + {
> + cu->m_totalBits = totalbits;
> + }
> + else
> + {
> + cu->m_totalBits += totalbits;
> + }
> +
> + motionCompensation(cu, predYuv, REF_PIC_LIST_X, partIdx, bLuma, bChroma);
> +
> + if (++partIdx < numPart)
> + {
> + enqueueInterSearch(cu, predYuv, partSize, bUseMRG, false);
> + }
> + else
> + {
> + m_completeBitmap[partSize] |= 1LL << (MAX_NUM_REF * 2 + 1);
> + }
> +}
> +
> // AMVP
> void TEncSearch::xEstimateMvPredAMVP(TComDataCU* cu, uint32_t partIdx, int list, int refIdx, MV& mvPred, AMVPInfo* amvpInfo, uint32_t* distBiP)
> {
> @@ -2885,10 +3263,13 @@
>
> bestMv = amvpInfo->m_mvCand[0];
>
> + PartSize partSize = cu->getPartitionSize(partAddr);
> + TComYuv* templateCand = m_threading ? &m_predYuv[partSize][list][refIdx] : &m_predYuv[0][0][0];
> +
> //-- Check Minimum Cost.
> for (i = 0; i < AMVP_MAX_NUM_CANDS; i++)
> {
> - uint32_t cost = xGetTemplateCost(cu, partAddr, &m_predTempYuv, amvpInfo->m_mvCand[i], list, refIdx, roiWidth, roiHeight);
> + uint32_t cost = xGetTemplateCost(cu, partAddr, templateCand, amvpInfo->m_mvCand[i], list, refIdx, roiWidth, roiHeight);
> if (bestCost > cost)
> {
> bestCost = cost;
> @@ -2901,6 +3282,7 @@
> // Setting Best MVP
> mvPred = bestMv;
> cu->setMVPIdx(list, partAddr, bestIdx);
> + m_mvpIdx[partSize][list][refIdx] = bestIdx;
> }
>
> void TEncSearch::xGetBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
> @@ -2953,13 +3335,12 @@
> }
>
> /* Check if using an alternative MVP would result in a smaller MVD + signal bits */
> -void TEncSearch::xCheckBestMVP(AMVPInfo* amvpInfo, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost)
> +void TEncSearch::xCheckBestMVP(AMVPInfo* amvpInfo, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost, int partSizeId)
> {
> assert(amvpInfo->m_mvCand[outMvpIdx] == mvPred);
>
> - m_me.setMVP(mvPred);
> int bestMvpIdx = outMvpIdx;
> - int mvBitsOrig = m_me.bitcost(mv) + MVP_IDX_BITS;
> + int mvBitsOrig = m_me[partSizeId].bitcost(mvPred, mv) + MVP_IDX_BITS;
> int bestMvBits = mvBitsOrig;
>
> for (int mvpIdx = 0; mvpIdx < AMVP_MAX_NUM_CANDS; mvpIdx++)
> @@ -2967,8 +3348,7 @@
> if (mvpIdx == outMvpIdx)
> continue;
>
> - m_me.setMVP(amvpInfo->m_mvCand[mvpIdx]);
> - int mvbits = m_me.bitcost(mv) + MVP_IDX_BITS;
> + int mvbits = m_me[partSizeId].bitcost(amvpInfo->m_mvCand[mvpIdx], mv) + MVP_IDX_BITS;
>
> if (mvbits < bestMvBits)
> {
> @@ -2995,10 +3375,12 @@
> cu->clipMv(mvCand);
>
> // prediction pattern
> - xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mvCand, sizex, sizey, templateCand);
> + int immedIdx = m_threading ? ((list << 5) + refIdx + MAX_NUM_REF) : 0;
> + xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mvCand, sizex, sizey, templateCand, immedIdx);
>
> // calc distortion
> - uint32_t cost = m_me.bufSAD(templateCand->getLumaAddr(partAddr), templateCand->getStride());
> + int partSizeId = m_threading ? (int)cu->getPartitionSize(0) : 0;
> + uint32_t cost = m_me[partSizeId].bufSAD(templateCand->getLumaAddr(partAddr), templateCand->getStride());
> x265_emms();
> return m_rdCost->calcRdSADCost(cost, MVP_IDX_BITS);
> }
> @@ -4300,4 +4682,102 @@
> }
> }
>
> +void TEncSearch::setThreadPool(ThreadPool *p)
> +{
> + m_pool = p;
> +}
> +
> +bool TEncSearch::threadingInterSearch()
> +{
> + return m_threading;
> +}
> +
> +bool TEncSearch::findJob()
> +{
> + unsigned long id;
> +
> + /* thread safe: a job is claimed by atomically clearing its queued bit with compare-and-swap */
> + for (int w = 0; w < m_numWords; w++)
> + {
> + uint64_t oldval = m_queuedBitmap[w];
> + while (oldval)
> + {
> + CTZ64(id, oldval);
> +
> + uint64_t newval = oldval & ~(1LL << id);
> + if (ATOMIC_CAS(&m_queuedBitmap[w], oldval, newval) == oldval)
> + {
> + /* we cleared the bit, do predInterSearch */
> + predInterSearch(m_queuedCU[w], m_queuedPredYuv[w], (int)id, m_queuedbUseMRG[w], true, false);
> + return true;
> + }
> + /* some other thread cleared the bit, try another bit */
> + oldval = m_queuedBitmap[w];
> + }
> + }
> +
> + /* made it through the bitmap without finding any enqueued motion search jobs */
> + return false;
> +}
> +
> +void TEncSearch::enqueueInterSearch(TComDataCU* cu, TComYuv* predYuv, PartSize partSize, bool bUseMRG, bool firstPart)
> +{
> + if (firstPart)
> + {
> + m_partIdx[partSize] = 0;
> + m_lastMode[partSize] = 0;
> +
> + m_queuedCU[partSize] = cu;
> + m_queuedPredYuv[partSize] = predYuv;
> + m_queuedbUseMRG[partSize] = bUseMRG;
> + }
> +
> + uint32_t partAddr;
> + int roiWidth, roiHeight;
> + cu->getPartIndexAndSize(m_partIdx[partSize], partAddr, roiWidth, roiHeight);
> +
> + TComPicYuv *fenc = cu->getSlice()->getPic()->getPicYuvOrg();
> + Pel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
> + m_me[partSize].setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
> +
> + cu->getMvPredLeft(m_mvPredictors[partSize][0]);
> + cu->getMvPredAbove(m_mvPredictors[partSize][1]);
> + cu->getMvPredAboveRight(m_mvPredictors[partSize][2]);
> +
> + /* reset complete bitmap */
> + m_completeBitmap[partSize] = 0;
> +
> + int numRef = cu->getSlice()->getNumRefIdx(REF_PIC_LIST_0) + cu->getSlice()->getNumRefIdx(REF_PIC_LIST_1);
> + bool bTestNormalMC = true;
> + if (bUseMRG && cu->getCUSize(0) > 8 && cu->getNumPartInter() == 2)
> + {
> + bTestNormalMC = false;
> + }
> + uint64_t val = 0;
> + for (int bit = 0; bTestNormalMC && bit < numRef; bit++)
> + {
> + /* enqueue motion estimate */
> + val |= 1LL << bit;
> + }
> + if (partSize != SIZE_2Nx2N)
> + {
> + /* enqueue merge estimation */
> + val |= 1LL << numRef;
> + }
> + ATOMIC_OR(&m_queuedBitmap[partSize], val);
> +
> + m_pool->pokeIdleThread();
> +}
> +
> +bool TEncSearch::jobCompleted()
> +{
> + uint64_t complete = 1LL << (MAX_NUM_REF * 2 + 1);
> + complete &= m_completeBitmap[SIZE_2Nx2N];
> + if (m_cfg->param->bEnableRectInter)
> + {
> + complete &= m_completeBitmap[SIZE_2NxN] & m_completeBitmap[SIZE_Nx2N];
> + }
> + return complete != (uint64_t)0;
> +}
> +
> //! \}
> diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibEncoder/TEncSearch.h
> --- a/source/Lib/TLibEncoder/TEncSearch.h Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.h Fri Mar 07 14:21:13 2014 +0800
> @@ -50,6 +50,7 @@
> #include "primitives.h"
> #include "bitcost.h"
> #include "motion.h"
> +#include "threadpool.h"
>
> #define MVP_IDX_BITS 1
>
> @@ -73,16 +74,16 @@
> // ====================================================================================================================
>
> /// encoder search class
> -class TEncSearch : public TComPrediction
> +class TEncSearch : public TComPrediction, public JobProvider
> {
> public:
>
> - MotionEstimate m_me;
> + MotionEstimate m_me[SIZE_nRx2N + 1];
> MotionReference* m_mref[2][MAX_NUM_REF + 1];
>
> protected:
>
> - ShortYuv* m_qtTempShortYuv;
> + ShortYuv* m_qtTempShortYuv;
> pixel* m_sharedPredTransformSkip[3];
>
> TCoeff** m_qtTempCoeffY;
> @@ -108,9 +109,9 @@
> // ME parameters
> int m_refLagPixels;
> int m_adaptiveRange[2][MAX_NUM_REF];
> - MV m_mvPredictors[3];
> + MV m_mvPredictors[SIZE_nRx2N + 1][3];
>
> - TComYuv m_tmpYuvPred; // to avoid constant memory allocation/deallocation in xGetInterPredictionError()
> + TComYuv m_tmpYuvPred[SIZE_nRx2N + 1]; // to avoid constant memory allocation/deallocation in xGetInterPredictionError()
>
> // Color space parameters
> uint32_t m_section;
> @@ -119,6 +120,34 @@
> uint32_t m_absPartIdxStep;
> uint32_t m_partOffset;
>
> +private:
> +
> + // bitmap of motion search functions queued for processing, uses atomic intrinsics
> + uint64_t volatile *m_queuedBitmap;
> + uint64_t volatile *m_completeBitmap;
> +
> + // number of words in the bitmap
> + int m_numWords;
> +
> + TComDataCU** m_queuedCU;
> + TComYuv** m_queuedPredYuv;
> + bool* m_queuedbUseMRG;
> +
> + MV m_mvTemp[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + MV m_mvPred[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + int m_mvpIdx[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + AMVPInfo m_amvpInfo[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + uint32_t m_costTemp[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + uint32_t m_bitsTemp[SIZE_nRx2N + 1][2][MAX_NUM_REF];
> + uint32_t m_lastMode[SIZE_nRx2N + 1];
> + int m_partIdx[SIZE_nRx2N + 1];
> +
> + uint32_t m_mrgInterDir[SIZE_nRx2N + 1];
> + TComMvField m_mrgMvField[SIZE_nRx2N + 1][2];
> + uint32_t m_mrgIndex[SIZE_nRx2N + 1];
> + uint32_t m_mrgCost[SIZE_nRx2N + 1];
> + uint32_t m_mrgBits[SIZE_nRx2N + 1];
> +
> public:
>
> TEncSbac*** m_rdSbacCoders;
> @@ -137,6 +166,20 @@
>
> bool init(Encoder* cfg, TComRdCost* rdCost, TComTrQuant *trQuant);
>
> + void setThreadPool(ThreadPool *p);
> +
> + bool threadingInterSearch();
> +
> + int getnumWords() { return m_numWords; }
> + // TEncSearch's implementation of JobProvider::findJob.
> + bool findJob();
> +
> + bool jobCompleted();
> +
> + void enqueueInterSearch(TComDataCU* cu, TComYuv* predYuv, PartSize partSize, bool bUseMRG, bool firstPart = true);
> +
> + void predInterSearch(TComDataCU* cu, TComYuv* predYuv, int id, bool bUseMRG = false, bool bLuma = true, bool bChroma = true);
> +
> protected:
>
> uint32_t xGetInterPredictionError(TComDataCU* cu, int partIdx);
> @@ -231,7 +274,7 @@
> MV& mvPred, AMVPInfo* amvpInfo, uint32_t* distBiP = NULL);
>
> void xCheckBestMVP(AMVPInfo* amvpInfo, MV cMv, MV& mvPred, int& mvpIdx,
> - uint32_t& outBits, uint32_t& outCost);
> + uint32_t& outBits, uint32_t& outCost, int partSizeId = 0);
>
> uint32_t xGetTemplateCost(TComDataCU* cu, uint32_t partAddr, TComYuv* templateCand, MV mvCand,
> int picList, int refIdx, int sizex, int sizey);
> diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/bitcost.h
> --- a/source/encoder/bitcost.h Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/encoder/bitcost.h Fri Mar 07 14:21:13 2014 +0800
> @@ -44,6 +44,9 @@
> // return bit cost of motion vector difference, multiplied by lambda
> inline uint16_t mvcost(const MV& mv) const { return m_cost_mvx[mv.x] + m_cost_mvy[mv.y]; }
>
> + // return bit cost of motion vector difference relative to an explicit predictor mvp, multiplied by lambda
> + inline uint16_t mvcost(const MV& mvp, const MV& mv) const { return m_cost[mv.x - mvp.x] + m_cost[mv.y - mvp.y]; }
> +
> // return bit cost of motion vector difference, without lambda
> inline uint16_t bitcost(const MV& mv) const
> {
> @@ -51,6 +54,13 @@
> s_bitsizes[(abs(mv.y - m_mvp.y) << 1) + !!(mv.y < m_mvp.y)] + 0.5f);
> }
>
> + // return bit cost of motion vector difference relative to an explicit predictor mvp, without lambda
> + inline uint16_t bitcost(const MV& mvp, const MV& mv) const
> + {
> + return (uint16_t)(s_bitsizes[(abs(mv.x - mvp.x) << 1) + !!(mv.x < mvp.x)] +
> + s_bitsizes[(abs(mv.y - mvp.y) << 1) + !!(mv.y < mvp.y)] + 0.5f);
> + }
> +
> static void destroy();
>
> protected:
> diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/compress.cpp
> --- a/source/encoder/compress.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/encoder/compress.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -211,6 +211,32 @@
> outTempCU->m_totalCost = m_rdCost->calcRdSADCost(distortion, outTempCU->m_totalBits);
> }
>
> +/** set up a CU as inter-predicted with the given partition size and enqueue its inter search jobs */
> +void TEncCu::xComputeCostInterEnqueue(TComDataCU* outTempCU, TComYuv* outPredYuv, PartSize partSize, bool bUseMRG)
> +{
> + UChar depth = outTempCU->getDepth(0);
> +
> + outTempCU->setPartSizeSubParts(partSize, 0, depth);
> + outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
> + outTempCU->setCUTransquantBypassSubParts(m_cfg->m_CUTransquantBypassFlagValue, 0, depth);
> +
> + //do motion compensation only for Luma since luma cost alone is calculated
> + outTempCU->m_totalBits = 0;
> +
> + m_search->enqueueInterSearch(outTempCU, outPredYuv, partSize, bUseMRG);
> +}
> +
> +void TEncCu::xComputeDistortionCostInter(TComDataCU* outTempCU, TComYuv* outPredYuv)
> +{
> + UChar depth = outTempCU->getDepth(0);
> +
> + int part = g_convertToBit[outTempCU->getCUSize(0)];
> + uint32_t distortion = primitives.sa8d[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
> + outPredYuv->getLumaAddr(), outPredYuv->getStride());
> + outTempCU->m_totalDistortion = distortion;
> + outTempCU->m_totalCost = m_rdCost->calcRdSADCost(distortion, outTempCU->m_totalBits);
> +}
> +
> void TEncCu::xComputeCostMerge2Nx2N(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
> {
> assert(outTempCU->getSlice()->getSliceType() != I_SLICE);
> @@ -432,22 +458,53 @@
>
> if (!earlyskip)
> {
> - /*Compute 2Nx2N mode costs*/
> + if (m_search->threadingInterSearch())
> {
> + m_search->enqueue();
> +
> + /*Compute 2Nx2N mode costs*/
> + xComputeCostInterEnqueue(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N);
> +
> + /*Compute Rect costs*/
> + if (m_cfg->param->bEnableRectInter)
> + {
> + xComputeCostInterEnqueue(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
> + xComputeCostInterEnqueue(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
> + }
> +
> + while (!m_search->jobCompleted())
> + {
> + m_search->findJob();
> + }
> +
> + m_search->dequeue();
> +
> + xComputeDistortionCostInter(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth]);
> +
> + if (m_cfg->param->bEnableRectInter)
> + {
> + xComputeDistortionCostInter(m_interCU_Nx2N[depth], m_modePredYuv[1][depth]);
> + xComputeDistortionCostInter(m_interCU_2NxN[depth], m_modePredYuv[2][depth]);
> + }
> + }
> + else
> + {
> + /*Compute 2Nx2N mode costs*/
> xComputeCostInter(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N);
> - /*Choose best mode; initialise outBestCU to 2Nx2N*/
> - outBestCU = m_interCU_2Nx2N[depth];
> - tempYuv = m_modePredYuv[0][depth];
> - m_modePredYuv[0][depth] = m_bestPredYuv[depth];
> - m_bestPredYuv[depth] = tempYuv;
> +
> + /*Compute Rect costs*/
> + if (m_cfg->param->bEnableRectInter)
> + {
> + xComputeCostInter(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
> + xComputeCostInter(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
> + }
> }
>
> - /*Compute Rect costs*/
> - if (m_cfg->param->bEnableRectInter)
> - {
> - xComputeCostInter(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
> - xComputeCostInter(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
> - }
> + /*Choose best mode; initialise outBestCU to 2Nx2N*/
> + outBestCU = m_interCU_2Nx2N[depth];
> + tempYuv = m_modePredYuv[0][depth];
> + m_modePredYuv[0][depth] = m_bestPredYuv[depth];
> + m_bestPredYuv[depth] = tempYuv;
>
> if (m_interCU_Nx2N[depth]->m_totalCost < outBestCU->m_totalCost)
> {
> diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/encoder/frameencoder.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -108,6 +108,8 @@
> m_rows = new CTURow[m_numRows];
> for (int i = 0; i < m_numRows; ++i)
> {
> + m_rows[i].m_search.setThreadPool(m_pool);
> +
> ok &= m_rows[i].create(top);
>
> for (int list = 0; list <= 1; list++)
> @@ -344,7 +346,10 @@
> double chromaLambda = lambda / crWeight;
>
> m_rows[row].m_search.setQPLambda(qp, lambda, chromaLambda);
> - m_rows[row].m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
> + for (int partSize = 0; partSize <= m_rows[row].m_search.getnumWords(); partSize++)
> + {
> + m_rows[row].m_search.m_me[partSize].setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
> + }
> m_rows[row].m_rdCost.setLambda(lambda);
> m_rows[row].m_rdCost.setCbDistortionWeight(cbWeight);
> m_rows[row].m_rdCost.setCrDistortionWeight(crWeight);
> @@ -391,7 +396,10 @@
> for (int i = 0; i < m_numRows; i++)
> {
> m_rows[i].m_search.setQPLambda(qp, lambda, chromaLambda);
> - m_rows[i].m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
> + for (int partSize = 0; partSize <= m_rows[i].m_search.getnumWords(); partSize++)
> + {
> + m_rows[i].m_search.m_me[partSize].setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
> + }
> m_rows[i].m_rdCost.setLambda(lambda);
> m_rows[i].m_rdCost.setCbDistortionWeight(cbWeight);
> m_rows[i].m_rdCost.setCrDistortionWeight(crWeight);
> @@ -466,9 +474,9 @@
> for (int ref = 0; ref < slice->getNumRefIdx(l); ref++)
> {
> wpScalingParam *w = NULL;
> - if ((slice->isInterP() && slice->getPPS()->getUseWP() && slice->m_weightPredTable[l][ref][0].bPresentFlag))
> + if ((slice->isInterP() && slice->getPPS()->getUseWP() && slice->m_weightPredTable[0][l][ref][0].bPresentFlag))
> {
> - w = slice->m_weightPredTable[l][ref];
> + w = slice->m_weightPredTable[0][l][ref];
> slice->m_numWPRefs++;
> }
> m_mref[l][ref].init(slice->getRefPic(l, ref)->getPicYuvRec(), w);
> diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/motion.cpp
> --- a/source/encoder/motion.cpp Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/encoder/motion.cpp Fri Mar 07 14:21:13 2014 +0800
> @@ -168,7 +168,7 @@
> { \
> MV tmv(mx, my); \
> int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
> - cost += mvcost(tmv << 2); \
> + cost += mvcost(qmvp, tmv << 2); \
> if (cost < bcost) { \
> bcost = cost; \
> bmv = tmv; \
> @@ -181,7 +181,7 @@
> do \
> { \
> int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
> - cost += mvcost(MV(mx, my) << 2); \
> + cost += mvcost(qmvp, MV(mx, my) << 2); \
> COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
> } while (0)
>
> @@ -193,9 +193,9 @@
> pix_base + (m1x) + (m1y) * stride, \
> pix_base + (m2x) + (m2y) * stride, \
> stride, costs); \
> - (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
> - (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
> - (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
> + (costs)[0] += mvcost(qmvp, (bmv + MV(m0x, m0y)) << 2); \
> + (costs)[1] += mvcost(qmvp, (bmv + MV(m1x, m1y)) << 2); \
> + (costs)[2] += mvcost(qmvp, (bmv + MV(m2x, m2y)) << 2); \
> }
>
> #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
> @@ -206,10 +206,10 @@
> fref + (m2x) + (m2y) * stride, \
> fref + (m3x) + (m3y) * stride, \
> stride, costs); \
> - costs[0] += mvcost(MV(m0x, m0y) << 2); \
> - costs[1] += mvcost(MV(m1x, m1y) << 2); \
> - costs[2] += mvcost(MV(m2x, m2y) << 2); \
> - costs[3] += mvcost(MV(m3x, m3y) << 2); \
> + costs[0] += mvcost(qmvp, MV(m0x, m0y) << 2); \
> + costs[1] += mvcost(qmvp, MV(m1x, m1y) << 2); \
> + costs[2] += mvcost(qmvp, MV(m2x, m2y) << 2); \
> + costs[3] += mvcost(qmvp, MV(m3x, m3y) << 2); \
> COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
> COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
> COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
> @@ -224,10 +224,10 @@
> pix_base + (m2x) + (m2y) * stride, \
> pix_base + (m3x) + (m3y) * stride, \
> stride, costs); \
> - costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
> - costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
> - costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
> - costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
> + costs[0] += mvcost(qmvp, (omv + MV(m0x, m0y)) << 2); \
> + costs[1] += mvcost(qmvp, (omv + MV(m1x, m1y)) << 2); \
> + costs[2] += mvcost(qmvp, (omv + MV(m2x, m2y)) << 2); \
> + costs[3] += mvcost(qmvp, (omv + MV(m3x, m3y)) << 2); \
> COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
> COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
> COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
> @@ -243,10 +243,10 @@
> pix_base + (m2x) + (m2y) * stride, \
> pix_base + (m3x) + (m3y) * stride, \
> stride, costs); \
> - (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
> - (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
> - (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
> - (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
> + (costs)[0] += mvcost(qmvp, (bmv + MV(m0x, m0y)) << 2); \
> + (costs)[1] += mvcost(qmvp, (bmv + MV(m1x, m1y)) << 2); \
> + (costs)[2] += mvcost(qmvp, (bmv + MV(m2x, m2y)) << 2); \
> + (costs)[3] += mvcost(qmvp, (bmv + MV(m3x, m3y)) << 2); \
> }
>
> #define DIA1_ITER(mx, my) \
> @@ -284,6 +284,7 @@
> void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
> const MV & mvmin,
> const MV & mvmax,
> + const MV & qmvp,
> MV & bmv,
> int & bcost,
> int & bPointNr,
> @@ -563,13 +564,13 @@
> int bcost = bprecost;
> if (pmv.isSubpel())
> {
> - bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
> + bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(qmvp, bmv << 2);
> }
>
> // measure SAD cost at MV(0) if MVP is not zero
> if (pmv.notZero())
> {
> - int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
> + int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(qmvp, MV(0, 0));
> if (cost < bcost)
> {
> bcost = cost;
> @@ -585,9 +586,9 @@
> {
> int cost;
> if (ref->isLowres)
> - cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
> + cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(qmvp, m);
> else
> - cost = subpelCompare(ref, m, sad) + mvcost(m);
> + cost = subpelCompare(ref, m, sad) + mvcost(qmvp, m);
>
> if (cost < bprecost)
> {
> @@ -891,7 +892,7 @@
> int bDistance = 0;
>
> const int EarlyExitIters = 3;
> - StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
> + StarPatternSearch(ref, mvmin, mvmax, qmvp, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
> if (bDistance == 1)
> {
> // if best distance was only 1, check two missing points. If no new point is found, stop
> @@ -940,16 +941,16 @@
> pix_base + RasterDistance * 2,
> pix_base + RasterDistance * 3,
> stride, costs);
> - costs[0] += mvcost(tmv << 2);
> + costs[0] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[0], bmv, tmv);
> tmv.x += RasterDistance;
> - costs[1] += mvcost(tmv << 2);
> + costs[1] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[1], bmv, tmv);
> tmv.x += RasterDistance;
> - costs[2] += mvcost(tmv << 2);
> + costs[2] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[2], bmv, tmv);
> tmv.x += RasterDistance;
> - costs[3] += mvcost(tmv << 3);
> + costs[3] += mvcost(qmvp, tmv << 3);
> COPY2_IF_LT(bcost, costs[3], bmv, tmv);
> }
> else
> @@ -964,7 +965,7 @@
> bDistance = 0;
> bPointNr = 0;
> const int MaxIters = 32;
> - StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
> + StarPatternSearch(ref, mvmin, mvmax, qmvp, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
>
> if (bDistance == 1)
> {
> @@ -1012,16 +1013,16 @@
> pix_base + 2,
> pix_base + 3,
> stride, costs);
> - costs[0] += mvcost(tmv << 2);
> + costs[0] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[0], bmv, tmv);
> tmv.x++;
> - costs[1] += mvcost(tmv << 2);
> + costs[1] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[1], bmv, tmv);
> tmv.x++;
> - costs[2] += mvcost(tmv << 2);
> + costs[2] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[2], bmv, tmv);
> tmv.x++;
> - costs[3] += mvcost(tmv << 2);
> + costs[3] += mvcost(qmvp, tmv << 2);
> COPY2_IF_LT(bcost, costs[3], bmv, tmv);
> }
> else
> @@ -1057,18 +1058,18 @@
> for (int i = 1; i <= wl.hpel_dirs; i++)
> {
> MV qmv = bmv + square1[i] * 2;
> - cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
> + cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmvp, qmv);
> COPY2_IF_LT(bcost, cost, bdir, i);
> }
>
> bmv += square1[bdir] * 2;
> - bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
> + bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(qmvp, bmv);
>
> bdir = 0;
> for (int i = 1; i <= wl.qpel_dirs; i++)
> {
> MV qmv = bmv + square1[i];
> - cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
> + cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmvp, qmv);
> COPY2_IF_LT(bcost, cost, bdir, i);
> }
>
> @@ -1080,7 +1081,7 @@
>
> if (wl.hpel_satd)
> {
> - bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
> + bcost = subpelCompare(ref, bmv, satd) + mvcost(qmvp, bmv);
> hpelcomp = satd;
> }
> else
> @@ -1092,7 +1093,7 @@
> for (int i = 1; i <= wl.hpel_dirs; i++)
> {
> MV qmv = bmv + square1[i] * 2;
> - cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
> + cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmvp, qmv);
> COPY2_IF_LT(bcost, cost, bdir, i);
> }
>
> @@ -1101,7 +1102,7 @@
>
> /* if HPEL search used SAD, remeasure with SATD before QPEL */
> if (!wl.hpel_satd)
> - bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
> + bcost = subpelCompare(ref, bmv, satd) + mvcost(qmvp, bmv);
>
> for (int iter = 0; iter < wl.qpel_iters; iter++)
> {
> @@ -1109,7 +1110,7 @@
> for (int i = 1; i <= wl.qpel_dirs; i++)
> {
> MV qmv = bmv + square1[i];
> - cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
> + cost = subpelCompare(ref, qmv, satd) + mvcost(qmvp, qmv);
> COPY2_IF_LT(bcost, cost, bdir, i);
> }
>
> diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/motion.h
> --- a/source/encoder/motion.h Thu Mar 06 21:27:55 2014 -0600
> +++ b/source/encoder/motion.h Fri Mar 07 14:21:13 2014 +0800
> @@ -99,6 +99,7 @@
> inline void StarPatternSearch(ReferencePlanes *ref,
> const MV & mvmin,
> const MV & mvmax,
> + const MV & qmvp,
> MV & bmv,
> int & bcost,
> int & bPointNr,
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list