[x265] [PATCH] threading 2Nx2N, Nx2N and 2NxN predInterSearch in xCompressInterCU
Wenju He
wenju at multicorewareinc.com
Fri Mar 7 07:26:36 CET 2014
# HG changeset patch
# User Wenju He <wenju at multicorewareinc.com>
# Date 1394173273 -28800
# Fri Mar 07 14:21:13 2014 +0800
# Node ID 7b757a1a99538a40ce2d74b6674ac974993bb2aa
# Parent 33b67a53b6deb19bd5b5142398f7c8c47ba3d2fa
threading 2Nx2N, Nx2N and 2NxN predInterSearch in xCompressInterCU
TEncSearch now inherits from JobProvider.
Two bitmaps are added: m_queuedBitmap and m_completeBitmap.
The 2Nx2N, Nx2N and 2NxN predInterSearch calls are enqueued. For each partSize,
we set (numL0+numL1) bits in m_queuedBitmap to 1, i.e. motion search against
a specific reference frame is a separate job. When a motion search job finishes,
it sets a bit in m_completeBitmap to 1.
For each partSize, the thread that finishes motion search last does bidir
and merge. At the end, if partIdx < numPart, it enqueues the jobs of the
next partIdx iteration, i.e. sets (numL0+numL1) bits to 1 in m_queuedBitmap and
clears m_completeBitmap. When all partIdx iterations have finished, it sets another
bit to 1 in m_completeBitmap, indicating that this partSize is done.
In TEncCu::xCompressInterCU, we enqueue m_search, run a while loop that calls
findJob(), and finally call dequeue().
Some members now have multiple instances, e.g. the number of m_me instances is
increased to 8.
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -64,7 +64,18 @@
m_refAboveFlt = NULL;
m_refLeft = NULL;
m_refLeftFlt = NULL;
- m_immedVals = NULL;
+
+ for (int i = 0; i <= SIZE_nRx2N; i++)
+ {
+ for (int j = 0; j < 2; j++)
+ {
+ for (int k = 0; k < MAX_NUM_REF * 2; k++)
+ {
+ m_immedVals[i][j][k] = NULL;
+ }
+ }
+ }
+ m_threading = false;
}
TComPrediction::~TComPrediction()
@@ -74,16 +85,26 @@
X265_FREE(m_refAboveFlt);
X265_FREE(m_refLeft);
X265_FREE(m_refLeftFlt);
- X265_FREE(m_immedVals);
- m_predYuv[0].destroy();
- m_predYuv[1].destroy();
- m_predShortYuv[0].destroy();
- m_predShortYuv[1].destroy();
- m_predTempYuv.destroy();
+ for (int i = 0; i <= SIZE_nRx2N; i++)
+ {
+ m_predShortYuv[i][0].destroy();
+ m_predShortYuv[i][1].destroy();
+ for (int j = 0; j < 2; j++)
+ {
+ for (int k = 0; k < MAX_NUM_REF; k++)
+ {
+ m_predYuv[i][j][k].destroy();
+ }
+ for (int k = 0; k < MAX_NUM_REF * 2; k++)
+ {
+ X265_FREE(m_immedVals[i][j][k]);
+ }
+ }
+ }
}
-void TComPrediction::initTempBuff(int csp)
+void TComPrediction::initTempBuff(int csp, int numPart, int maxNumRef)
{
m_hChromaShift = CHROMA_H_SHIFT(csp);
m_vChromaShift = CHROMA_V_SHIFT(csp);
@@ -99,13 +120,35 @@
m_refLeft = X265_MALLOC(pixel, 3 * MAX_CU_SIZE);
m_refLeftFlt = X265_MALLOC(pixel, 3 * MAX_CU_SIZE);
- m_predYuv[0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
- m_predYuv[1].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
- m_predShortYuv[0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
- m_predShortYuv[1].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
- m_predTempYuv.create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ for (int i = 0; i < numPart; i++)
+ {
+ m_predShortYuv[i][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ m_predShortYuv[i][1].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ }
- m_immedVals = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
+ if (m_threading)
+ {
+ for (int i = 0; i < numPart; i++)
+ {
+ for (int j = 0; j < maxNumRef; j++)
+ {
+ m_predYuv[i][0][j].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ m_immedVals[i][0][j] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
+ m_immedVals[i][0][j + MAX_NUM_REF] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
+ }
+ /* currently there is only one L1 reference */
+ m_predYuv[i][1][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ m_immedVals[i][1][0] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
+ m_immedVals[i][1][0 + MAX_NUM_REF] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
+ }
+ }
+ else
+ {
+ m_predYuv[0][0][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ m_predYuv[0][1][0].create(MAX_CU_SIZE, MAX_CU_SIZE, csp);
+ m_immedVals[0][0][0] = X265_MALLOC(int16_t, 64 * (64 + NTAPS_LUMA - 1));
+ }
+
}
}
@@ -279,7 +322,8 @@
{
if (cu->getSlice()->getPPS()->getUseWP())
{
- ShortYuv* shortYuv = &m_predShortYuv[0];
+ int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
+ ShortYuv* shortYuv = &m_predShortYuv[partSize][0];
int refId = cu->getCUMvField(list)->getRefIdx(partAddr);
assert(refId >= 0);
@@ -321,7 +365,10 @@
cu->clipMv(mv);
if (bLuma)
- xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mv, width, height, outPredYuv);
+ {
+ int immedIdx = m_threading ? ((list << 5) + refIdx) : 0;
+ xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mv, width, height, outPredYuv, immedIdx);
+ }
if (bChroma)
xPredInterChromaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mv, width, height, outPredYuv);
@@ -345,6 +392,7 @@
void TComPrediction::xPredInterBi(TComDataCU* cu, uint32_t partAddr, int width, int height, TComYuv* outPredYuv, bool bLuma, bool bChroma)
{
assert(cu->getSlice()->isInterB());
+ int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
int refIdx[2];
refIdx[0] = cu->getCUMvField(REF_PIC_LIST_0)->getRefIdx(partAddr);
@@ -356,16 +404,16 @@
{
assert(refIdx[list] < cu->getSlice()->getNumRefIdx(list));
- xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[list], bLuma, bChroma);
+ xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[partSize][list], bLuma, bChroma);
}
if (cu->getSlice()->getPPS()->getWPBiPred())
{
- xWeightedPredictionBi(cu, &m_predShortYuv[0], &m_predShortYuv[1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
+ xWeightedPredictionBi(cu, &m_predShortYuv[partSize][0], &m_predShortYuv[partSize][1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
}
else
{
- outPredYuv->addAvg(&m_predShortYuv[0], &m_predShortYuv[1], partAddr, width, height, bLuma, bChroma);
+ outPredYuv->addAvg(&m_predShortYuv[partSize][0], &m_predShortYuv[partSize][1], partAddr, width, height, bLuma, bChroma);
}
}
else if (cu->getSlice()->getPPS()->getWPBiPred())
@@ -376,10 +424,10 @@
assert(refIdx[list] < cu->getSlice()->getNumRefIdx(list));
- xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[list], bLuma, bChroma);
+ xPredInterUni(cu, partAddr, width, height, list, &m_predShortYuv[partSize][list], bLuma, bChroma);
}
- xWeightedPredictionBi(cu, &m_predShortYuv[0], &m_predShortYuv[1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
+ xWeightedPredictionBi(cu, &m_predShortYuv[partSize][0], &m_predShortYuv[partSize][1], refIdx[0], refIdx[1], partAddr, width, height, outPredYuv, bLuma, bChroma);
}
else if (refIdx[0] >= 0)
{
@@ -412,10 +460,10 @@
* \param height Height of block
* \param dstPic Pointer to destination picture
*/
-void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic)
+void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic, int immedIdx)
{
int dstStride = dstPic->getStride();
- pixel *dst = dstPic->getLumaAddr(partAddr);
+ pixel *dst = dstPic->getLumaAddr(partAddr);
int srcStride = refPic->getStride();
int srcOffset = (mv->x >> 2) + (mv->y >> 2) * srcStride;
@@ -442,8 +490,10 @@
int tmpStride = width;
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
- primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1);
- primitives.luma_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
+ int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
+ int16_t* immedVals = m_immedVals[partSize][immedIdx >> 5][immedIdx & 31];
+ primitives.luma_hps[partEnum](src, srcStride, immedVals, tmpStride, xFrac, 1);
+ primitives.luma_vsp[partEnum](immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
}
}
@@ -482,8 +532,10 @@
int tmpStride = width;
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
- primitives.luma_hps[partEnum](ref, refStride, m_immedVals, tmpStride, xFrac, 1);
- primitives.luma_vss[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
+ int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
+ int16_t* immedVals = m_immedVals[partSize][0][0];
+ primitives.luma_hps[partEnum](ref, refStride, immedVals, tmpStride, xFrac, 1);
+ primitives.luma_vss[partEnum](immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
}
}
@@ -540,12 +592,14 @@
int extStride = width >> m_hChromaShift;
int filterSize = NTAPS_CHROMA;
int halfFilterSize = (filterSize >> 1);
+ int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
+ int16_t* immedVals = m_immedVals[partSize][0][0];
- primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
- primitives.chroma[csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
+ primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
+ primitives.chroma[csp].filter_vsp[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
- primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
- primitives.chroma[csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
+ primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
+ primitives.chroma[csp].filter_vsp[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
}
}
@@ -597,10 +651,13 @@
int extStride = cxWidth;
int filterSize = NTAPS_CHROMA;
int halfFilterSize = (filterSize >> 1);
- primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
- primitives.chroma[csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
- primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
- primitives.chroma[csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
+ int partSize = m_threading ? cu->getPartitionSize(partAddr) : 0;
+ int16_t* immedVals = m_immedVals[partSize][0][0];
+
+ primitives.chroma[csp].filter_hps[partEnum](refCb, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
+ primitives.chroma[csp].filter_vss[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - cu->getVertChromaShift()));
+ primitives.chroma[csp].filter_hps[partEnum](refCr, refStride, immedVals, extStride, xFrac << (1 - cu->getHorzChromaShift()), 1);
+ primitives.chroma[csp].filter_vss[partEnum](immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - cu->getVertChromaShift()));
}
}
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComPrediction.h
--- a/source/Lib/TLibCommon/TComPrediction.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibCommon/TComPrediction.h Fri Mar 07 14:21:13 2014 +0800
@@ -63,19 +63,20 @@
{
protected:
- // references sample for IntraPrediction
- TComYuv m_predYuv[2];
- ShortYuv m_predShortYuv[2];
- TComYuv m_predTempYuv;
+ // references sample for InterPrediction
+ TComYuv m_predYuv[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ ShortYuv m_predShortYuv[SIZE_nRx2N + 1][2];
- int16_t* m_immedVals;
+ int16_t* m_immedVals[SIZE_nRx2N + 1][2][MAX_NUM_REF * 2];
+ bool m_threading;
+
int m_hChromaShift;
int m_vChromaShift;
// motion compensation functions
void xPredInterUni(TComDataCU* cu, uint32_t partAddr, int width, int height, int picList, TComYuv* outPredYuv, bool bLuma, bool bChroma);
void xPredInterUni(TComDataCU* cu, uint32_t partAddr, int width, int height, int picList, ShortYuv* outPredYuv, bool bLuma, bool bChroma);
- void xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic);
+ void xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic, int immedIdx = 0);
void xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic);
void xPredInterChromaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic);
void xPredInterChromaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic);
@@ -99,7 +100,7 @@
TComPrediction();
virtual ~TComPrediction();
- void initTempBuff(int csp);
+ void initTempBuff(int csp, int numPart, int maxNumRef);
// inter
void motionCompensation(TComDataCU* cu, TComYuv* predYuv, int picList = REF_PIC_LIST_X, int partIdx = -1, bool bLuma = true, bool bChroma = true);
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComSlice.cpp
--- a/source/Lib/TLibCommon/TComSlice.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibCommon/TComSlice.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -374,9 +374,9 @@
* \param *&wpScalingParam
* \returns void
*/
-void TComSlice::getWpScaling(int l, int refIdx, wpScalingParam *&wp)
+void TComSlice::getWpScaling(int l, int refIdx, wpScalingParam *&wp, int partSize)
{
- wp = m_weightPredTable[l][refIdx];
+ wp = m_weightPredTable[partSize][l][refIdx];
}
/** reset Default WP tables settings : no weight.
@@ -391,11 +391,14 @@
{
for (int yuv = 0; yuv < 3; yuv++)
{
- wpScalingParam *pwp = &(m_weightPredTable[e][i][yuv]);
- pwp->bPresentFlag = false;
- pwp->log2WeightDenom = 0;
- pwp->inputWeight = 1;
- pwp->inputOffset = 0;
+ for (int partSize = 0; partSize <= SIZE_nRx2N; partSize++)
+ {
+ wpScalingParam *pwp = &(m_weightPredTable[partSize][e][i][yuv]);
+ pwp->bPresentFlag = false;
+ pwp->log2WeightDenom = 0;
+ pwp->inputWeight = 1;
+ pwp->inputOffset = 0;
+ }
}
}
}
@@ -412,18 +415,21 @@
{
for (int yuv = 0; yuv < 3; yuv++)
{
- wpScalingParam *pwp = &(m_weightPredTable[e][i][yuv]);
- if (!pwp->bPresentFlag)
+ for (int partSize = 0; partSize <= SIZE_nRx2N; partSize++)
{
- // Inferring values not present :
- pwp->inputWeight = (1 << pwp->log2WeightDenom);
- pwp->inputOffset = 0;
+ wpScalingParam *pwp = &(m_weightPredTable[partSize][e][i][yuv]);
+ if (!pwp->bPresentFlag)
+ {
+ // Inferring values not present :
+ pwp->inputWeight = (1 << pwp->log2WeightDenom);
+ pwp->inputOffset = 0;
+ }
+
+ pwp->w = pwp->inputWeight;
+ pwp->o = pwp->inputOffset << (X265_DEPTH - 8);
+ pwp->shift = pwp->log2WeightDenom;
+ pwp->round = (pwp->log2WeightDenom >= 1) ? (1 << (pwp->log2WeightDenom - 1)) : (0);
}
-
- pwp->w = pwp->inputWeight;
- pwp->o = pwp->inputOffset << (X265_DEPTH - 8);
- pwp->shift = pwp->log2WeightDenom;
- pwp->round = (pwp->log2WeightDenom >= 1) ? (1 << (pwp->log2WeightDenom - 1)) : (0);
}
}
}
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComSlice.h
--- a/source/Lib/TLibCommon/TComSlice.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibCommon/TComSlice.h Fri Mar 07 14:21:13 2014 +0800
@@ -1346,7 +1346,7 @@
public:
- wpScalingParam m_weightPredTable[2][MAX_NUM_REF][3]; // [REF_PIC_LIST_0 or REF_PIC_LIST_1][refIdx][0:Y, 1:U, 2:V]
+ wpScalingParam m_weightPredTable[SIZE_nRx2N + 1][2][MAX_NUM_REF][3]; // [partSize][REF_PIC_LIST_0 or REF_PIC_LIST_1][refIdx][0:Y, 1:U, 2:V]
int m_numWPRefs; // number of references for which unidirectional weighted prediction is used
TComSlice();
@@ -1528,9 +1528,15 @@
bool getFinalized() { return m_bFinalized; }
- void setWpScaling(wpScalingParam wp[2][MAX_NUM_REF][3]) { memcpy(m_weightPredTable, wp, sizeof(wpScalingParam) * 2 * MAX_NUM_REF * 3); }
+ void setWpScaling(wpScalingParam wp[2][MAX_NUM_REF][3])
+ {
+ for (int partSize = 0; partSize <= SIZE_nRx2N; partSize++)
+ {
+ memcpy(m_weightPredTable[partSize], wp, sizeof(wpScalingParam) * 2 * MAX_NUM_REF * 3);
+ }
+ }
- void getWpScaling(int e, int refIdx, wpScalingParam *&wp);
+ void getWpScaling(int l, int refIdx, wpScalingParam *&wp, int partSize = 0);
void resetWpScaling();
void initWpScaling();
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibCommon/TComWeightPrediction.cpp
--- a/source/Lib/TLibCommon/TComWeightPrediction.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibCommon/TComWeightPrediction.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -510,11 +510,11 @@
{ // explicit --------------------
if (refIdx0 >= 0)
{
- slice->getWpScaling(REF_PIC_LIST_0, refIdx0, wp0);
+ slice->getWpScaling(REF_PIC_LIST_0, refIdx0, wp0, cu->getPartitionSize(0));
}
if (refIdx1 >= 0)
{
- slice->getWpScaling(REF_PIC_LIST_1, refIdx1, wp1);
+ slice->getWpScaling(REF_PIC_LIST_1, refIdx1, wp1, cu->getPartitionSize(0));
}
}
else
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibEncoder/TEncCu.h
--- a/source/Lib/TLibEncoder/TEncCu.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncCu.h Fri Mar 07 14:21:13 2014 +0800
@@ -169,6 +169,8 @@
void xComputeCostIntraInInter(TComDataCU* cu, PartSize partSize);
void xCheckRDCostInter(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, bool bUseMRG = false);
void xComputeCostInter(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
+ void xComputeCostInterEnqueue(TComDataCU* outTempCU, TComYuv* outPredYUV, PartSize partSize, bool bUseMRG = false);
+ void xComputeDistortionCostInter(TComDataCU* outTempCU, TComYuv* outPredYUV);
void xComputeCostMerge2Nx2N(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& tmpPredYuv);
void xEncodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, TComYuv* outReconYuv);
void encodeResidue(TComDataCU* lcu, TComDataCU* cu, uint32_t absPartIdx, UChar depth);
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -52,7 +52,7 @@
//! \ingroup TLibEncoder
//! \{
-TEncSearch::TEncSearch()
+TEncSearch::TEncSearch() : JobProvider(0), TComPrediction()
{
m_qtTempCoeffY = NULL;
m_qtTempCoeffCb = NULL;
@@ -75,6 +75,12 @@
m_entropyCoder = NULL;
m_rdSbacCoders = NULL;
m_rdGoOnSbacCoder = NULL;
+
+ m_queuedBitmap = NULL;
+ m_completeBitmap = NULL;
+ m_queuedCU = NULL;
+ m_queuedPredYuv = NULL;
+ m_queuedbUseMRG = NULL;
}
TEncSearch::~TEncSearch()
@@ -106,18 +112,35 @@
delete[] m_qtTempCoeffCr;
delete[] m_qtTempShortYuv;
m_qtTempTransformSkipYuv.destroy();
- m_tmpYuvPred.destroy();
+ for (int i = 0; i <= m_numWords; i++)
+ {
+ m_tmpYuvPred[i].destroy();
+ }
+
+ X265_FREE((void*)m_queuedBitmap);
+ X265_FREE((void*)m_completeBitmap);
+ X265_FREE(m_queuedCU);
+ X265_FREE(m_queuedPredYuv);
+ X265_FREE(m_queuedbUseMRG);
}
bool TEncSearch::init(Encoder* cfg, TComRdCost* rdCost, TComTrQuant* trQuant)
{
+ bool ok = true;
m_cfg = cfg;
m_trQuant = trQuant;
m_rdCost = rdCost;
- initTempBuff(cfg->param->internalCsp);
- m_me.setSearchMethod(cfg->param->searchMethod);
- m_me.setSubpelRefine(cfg->param->subpelRefine);
+ m_threading = m_pool != NULL && cfg->param->rdLevel < 5 && ThreadPool::getThreadPool()->getThreadCount() >= 16;
+ m_numWords = (m_threading && cfg->param->bEnableRectInter) ? 3 : 1;
+
+ initTempBuff(cfg->param->internalCsp, m_numWords, cfg->param->maxNumReferences);
+
+ for (int i = 0; i <= m_numWords; i++)
+ {
+ m_me[i].setSearchMethod(cfg->param->searchMethod);
+ m_me[i].setSubpelRefine(cfg->param->subpelRefine);
+ }
/* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
* available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
@@ -142,7 +165,7 @@
m_qtTempCoeffY[i] = X265_MALLOC(TCoeff, g_maxCUSize * g_maxCUSize);
m_qtTempCoeffCb[i] = X265_MALLOC(TCoeff, (g_maxCUSize >> m_hChromaShift) * (g_maxCUSize >> m_vChromaShift));
m_qtTempCoeffCr[i] = X265_MALLOC(TCoeff, (g_maxCUSize >> m_hChromaShift) * (g_maxCUSize >> m_vChromaShift));
- m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
+ ok = ok && m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
}
const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
@@ -161,17 +184,39 @@
CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
- return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp) &&
- m_tmpYuvPred.create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
+ ok = ok && m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
+
+ for (int i = 0; i < m_numWords; i++)
+ {
+ ok = ok && m_tmpYuvPred[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
+ }
+
+ if (m_threading)
+ {
+ CHECKED_MALLOC(m_queuedBitmap, uint64_t, m_numWords);
+ memset((void*)m_queuedBitmap, 0, sizeof(uint64_t) * m_numWords);
+
+ CHECKED_MALLOC(m_completeBitmap, uint64_t, m_numWords);
+ memset((void*)m_completeBitmap, 0, sizeof(uint64_t) * m_numWords);
+
+ CHECKED_MALLOC(m_queuedCU, TComDataCU*, m_numWords);
+ CHECKED_MALLOC(m_queuedPredYuv, TComYuv*, m_numWords);
+ CHECKED_MALLOC(m_queuedbUseMRG, bool, m_numWords);
+ }
+
+ return ok;
fail:
- return false;
+ return false;
}
void TEncSearch::setQPLambda(int QP, double lambdaLuma, double lambdaChroma)
{
m_trQuant->setLambda(lambdaLuma, lambdaChroma);
- m_me.setQP(QP);
+ for (int i = 0; i <= m_numWords; i++)
+ {
+ m_me[i].setQP(QP);
+ }
}
void TEncSearch::xEncSubdivCbfQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, bool bLuma, bool bChroma)
@@ -2448,9 +2493,10 @@
uint32_t absPartIdx;
int width, height;
- motionCompensation(cu, &m_tmpYuvPred, REF_PIC_LIST_X, partIdx, true, false);
cu->getPartIndexAndSize(partIdx, absPartIdx, width, height);
- uint32_t cost = m_me.bufSA8D(m_tmpYuvPred.getLumaAddr(absPartIdx), m_tmpYuvPred.getStride());
+ int partSizeId = m_threading ? (int)cu->getPartitionSize(absPartIdx) : 0;
+ motionCompensation(cu, &m_tmpYuvPred[partSizeId], REF_PIC_LIST_X, partIdx, true, false);
+ uint32_t cost = m_me[partSizeId].bufSA8D(m_tmpYuvPred[partSizeId].getLumaAddr(absPartIdx), m_tmpYuvPred[partSizeId].getStride());
x265_emms();
return cost;
}
@@ -2599,11 +2645,11 @@
cu->getPartIndexAndSize(partIdx, partAddr, roiWidth, roiHeight);
Pel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
- m_me.setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
-
- cu->getMvPredLeft(m_mvPredictors[0]);
- cu->getMvPredAbove(m_mvPredictors[1]);
- cu->getMvPredAboveRight(m_mvPredictors[2]);
+ m_me[0].setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight);
+
+ cu->getMvPredLeft(m_mvPredictors[0][0]);
+ cu->getMvPredAbove(m_mvPredictors[0][1]);
+ cu->getMvPredAboveRight(m_mvPredictors[0][2]);
bool bTestNormalMC = true;
@@ -2635,12 +2681,12 @@
MV mvmin, mvmax;
xSetSearchRange(cu, mvp, merange, mvmin, mvmax);
- int satdCost = m_me.motionEstimate(m_mref[list][idx],
- mvmin, mvmax, mvp, 3, m_mvPredictors, merange, outmv);
+ int satdCost = m_me[0].motionEstimate(m_mref[list][idx],
+ mvmin, mvmax, mvp, 3, m_mvPredictors[0], merange, outmv);
/* Get total cost of partition, but only include MV bit cost once */
- bitsTemp += m_me.bitcost(outmv);
- costTemp = (satdCost - m_me.mvcost(outmv)) + m_rdCost->getCost(bitsTemp);
+ bitsTemp += m_me[0].bitcost(outmv);
+ costTemp = (satdCost - m_me[0].mvcost(outmv)) + m_rdCost->getCost(bitsTemp);
xCheckBestMVP(&amvpInfo[list][idx], mvTemp[list][idx], mvPred[list][idx], mvpIdx[list][idx], bitsTemp, costTemp);
@@ -2678,16 +2724,16 @@
::memcpy(mvpIdxBi, mvpIdx, sizeof(mvpIdx));
// Generate reference subpels
- xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_0, refIdx[0])->getPicYuvRec(), partAddr, &mv[0], roiWidth, roiHeight, &m_predYuv[0]);
- xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_1, refIdx[1])->getPicYuvRec(), partAddr, &mv[1], roiWidth, roiHeight, &m_predYuv[1]);
-
- pixel *ref0 = m_predYuv[0].getLumaAddr(partAddr);
- pixel *ref1 = m_predYuv[1].getLumaAddr(partAddr);
+ xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_0, refIdx[0])->getPicYuvRec(), partAddr, &mv[0], roiWidth, roiHeight, &m_predYuv[0][0][0]);
+ xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_1, refIdx[1])->getPicYuvRec(), partAddr, &mv[1], roiWidth, roiHeight, &m_predYuv[0][1][0]);
+
+ pixel *ref0 = m_predYuv[0][0][0].getLumaAddr(partAddr);
+ pixel *ref1 = m_predYuv[0][1][0].getLumaAddr(partAddr);
ALIGN_VAR_32(pixel, avg[MAX_CU_SIZE * MAX_CU_SIZE]);
int partEnum = partitionFromSizes(roiWidth, roiHeight);
- primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[0].getStride(), ref1, m_predYuv[1].getStride(), 32);
+ primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[0][0][0].getStride(), ref1, m_predYuv[0][1][0].getStride(), 32);
int satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
x265_emms();
bits[2] = bits[0] + bits[1] - mbBits[0] - mbBits[1] + mbBits[2];
@@ -2704,11 +2750,11 @@
x265_emms();
unsigned int bitsZero0, bitsZero1;
- m_me.setMVP(mvPredBi[0][refIdxBidir[0]]);
- bitsZero0 = bits[0] - m_me.bitcost(mv[0]) + m_me.bitcost(mvzero);
-
- m_me.setMVP(mvPredBi[1][refIdxBidir[1]]);
- bitsZero1 = bits[1] - m_me.bitcost(mv[1]) + m_me.bitcost(mvzero);
+ m_me[0].setMVP(mvPredBi[0][refIdxBidir[0]]);
+ bitsZero0 = bits[0] - m_me[0].bitcost(mv[0]) + m_me[0].bitcost(mvzero);
+
+ m_me[0].setMVP(mvPredBi[1][refIdxBidir[1]]);
+ bitsZero1 = bits[1] - m_me[0].bitcost(mv[1]) + m_me[0].bitcost(mvzero);
uint32_t costZero = satdCost + m_rdCost->getCost(bitsZero0) + m_rdCost->getCost(bitsZero1);
@@ -2868,6 +2914,338 @@
cu->m_totalBits = totalmebits;
}
+/** search of the best candidate for inter prediction, multi-thread version
+ * \param cu CU being searched; the partition size of its part 0 selects the per-partSize state
+ * \param predYuv prediction buffer handed to motionCompensation()
+ * \param id job index: below numRefL0 + numRefL1 it is one uni-directional search, equal to it is merge estimation
+ * \param bUseMRG with CU size > 8 and two partitions, disables the normal motion search path (bTestNormalMC = false)
+ * \param bLuma,bChroma forwarded unchanged to motionCompensation()
+ * \returns void
+ */
+void TEncSearch::predInterSearch(TComDataCU* cu, TComYuv* predYuv, int id, bool bUseMRG, bool bLuma, bool bChroma)
+{
+ MV mvzero(0, 0);
+ MV mv[2];
+ MV mvBidir[2];
+ MV mvPredBi[2][MAX_NUM_REF];
+ int mvpIdxBi[2][MAX_NUM_REF];
+
+ uint32_t mbBits[3] = { 1, 1, 0 };
+ int refIdx[2] = { 0, 0 }; /* If un-initialized, may cause SEGV in bi-directional prediction iterative stage. */
+ int refIdxBidir[2] = { 0, 0 };
+
+ PartSize partSize = cu->getPartitionSize(0);
+ int numPart = cu->getNumPartInter();
+ int numPredDir = cu->getSlice()->isInterP() ? 1 : 2;
+
+ uint32_t listCost[2] = { MAX_UINT, MAX_UINT };
+ uint32_t bits[3];
+ uint32_t costbi = MAX_UINT;
+ MV mvValidList1(0, 0);
+ int refIdxValidList1 = 0;
+ uint32_t bitsValidList1 = MAX_UINT;
+ uint32_t costValidList1 = MAX_UINT;
+
+ int& partIdx = m_partIdx[partSize]; /* reference: ++partIdx below advances the stored per-partSize index */
+
+ uint32_t partAddr;
+ int roiWidth, roiHeight;
+ xGetBlkBits(partSize, cu->getSlice()->isInterP(), partIdx, m_lastMode[partSize], mbBits);
+ cu->getPartIndexAndSize(partIdx, partAddr, roiWidth, roiHeight);
+ TComPicYuv *fenc = cu->getSlice()->getPic()->getPicYuvOrg();
+ Pel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
+
+ int numRefL0 = cu->getSlice()->getNumRefIdx(REF_PIC_LIST_0);
+ int numRefL1 = cu->getSlice()->getNumRefIdx(REF_PIC_LIST_1);
+
+ bool bTestNormalMC = true;
+
+ if (bUseMRG && cu->getCUSize(0) > 8 && numPart == 2)
+ {
+ bTestNormalMC = false;
+ }
+
+ if (id == (numRefL0 + numRefL1)) /* job index one past the last reference: merge estimation */
+ {
+ TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS << 1]; // double length for mv of both lists
+ UChar interDirNeighbours[MRG_MAX_NUM_CANDS];
+ int numValidMergeCand = 0;
+
+ m_mrgInterDir[partSize] = 0;
+ m_mrgMvField[partSize][0].setMvField(MV(0, 0), -1);
+ m_mrgMvField[partSize][1].setMvField(MV(0, 0), -1);
+ m_mrgIndex[partSize] = 0;
+ m_mrgBits[partSize] = 0;
+
+ /* find Merge result */
+ xMergeEstimation(cu, partIdx, m_mrgInterDir[partSize], m_mrgMvField[partSize], m_mrgIndex[partSize],
+ m_mrgCost[partSize], m_mrgBits[partSize], mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
+ }
+
+ if (id < (numRefL0 + numRefL1)) /* job index names a single reference picture */
+ {
+ /* Uni-directional prediction */
+ int list = id < numRefL0 ? 0 : 1; /* ids [0, numRefL0) are list 0, the remainder list 1 */
+ int idx = id - list * numRefL0; /* reference index inside that list */
+
+ m_bitsTemp[partSize][list][idx] = mbBits[list];
+ if (cu->getSlice()->getNumRefIdx(list) > 1)
+ {
+ m_bitsTemp[partSize][list][idx] += idx + 1;
+ if (idx == cu->getSlice()->getNumRefIdx(list) - 1) m_bitsTemp[partSize][list][idx]--;
+ }
+ uint32_t biPDistTemp = MAX_INT;
+ xEstimateMvPredAMVP(cu, partIdx, list, idx, m_mvPred[partSize][list][idx], &m_amvpInfo[partSize][list][idx], &biPDistTemp);
+
+ m_bitsTemp[partSize][list][idx] += MVP_IDX_BITS;
+ int merange = m_adaptiveRange[list][idx];
+ MV& mvp = m_mvPred[partSize][list][idx];
+ MV& outmv = m_mvTemp[partSize][list][idx];
+
+ MV mvmin, mvmax;
+ xSetSearchRange(cu, mvp, merange, mvmin, mvmax);
+ int satdCost = m_me[partSize].motionEstimate(m_mref[list][idx], mvmin, mvmax, mvp, 3, m_mvPredictors[partSize], merange, outmv);
+
+ /* Get total cost of partition, but only include MV bit cost once */
+ m_bitsTemp[partSize][list][idx] += m_me[partSize].bitcost(mvp, outmv);
+ m_costTemp[partSize][list][idx] = (satdCost - m_me[partSize].mvcost(mvp, outmv)) + m_rdCost->getCost(m_bitsTemp[partSize][list][idx]);
+
+ xCheckBestMVP(&m_amvpInfo[partSize][list][idx], m_mvTemp[partSize][list][idx], m_mvPred[partSize][list][idx],
+ m_mvpIdx[partSize][list][idx], m_bitsTemp[partSize][list][idx], m_costTemp[partSize][list][idx], partSize);
+ }
+
+ if (bTestNormalMC)
+ {
+ uint64_t oldval = ATOMIC_OR(&m_completeBitmap[partSize], 1LL << id); /* mark this job done; presumably yields the prior bitmap value - confirm */
+ oldval |= 1LL << id; /* include our own bit before testing for completion */
+
+ /* let the last finished thread do bidir */
+ uint64_t finish = 0;
+ int maxId = (cu->getPartitionSize(partAddr) == SIZE_2Nx2N) ? (numRefL0 + numRefL1 - 1) : numRefL0 + numRefL1; /* highest job bit expected; enqueueInterSearch() adds no merge job for SIZE_2Nx2N */
+ for (int i = 0; i <= maxId; i++)
+ {
+ finish |= 1LL << i;
+ }
+ if ((oldval & finish) != finish) /* some job of this partition is still outstanding */
+ {
+ return;
+ }
+ }
+
+ if (bTestNormalMC)
+ {
+ for (int list = 0; list < numPredDir; list++)
+ {
+ for (int idx = 0; idx < cu->getSlice()->getNumRefIdx(list); idx++)
+ {
+ if (m_costTemp[partSize][list][idx] < listCost[list])
+ {
+ listCost[list] = m_costTemp[partSize][list][idx];
+ bits[list] = m_bitsTemp[partSize][list][idx]; /* storing for bi-prediction */
+
+ /* set motion */
+ mv[list] = m_mvTemp[partSize][list][idx];
+ refIdx[list] = idx;
+ }
+
+ if (list == 1 && m_costTemp[partSize][list][idx] < costValidList1)
+ {
+ costValidList1 = m_costTemp[partSize][list][idx];
+ bitsValidList1 = m_bitsTemp[partSize][list][idx];
+
+ /* set motion */
+ mvValidList1 = m_mvTemp[partSize][list][idx];
+ refIdxValidList1 = idx;
+ }
+ }
+ }
+
+ /* Bi-directional prediction */
+ if ((cu->getSlice()->isInterB()) && (cu->isBipredRestriction() == false))
+ {
+ mvBidir[0] = mv[0];
+ mvBidir[1] = mv[1];
+ refIdxBidir[0] = refIdx[0];
+ refIdxBidir[1] = refIdx[1];
+
+ ::memcpy(mvPredBi, m_mvPred[partSize], sizeof(mvPredBi));
+ ::memcpy(mvpIdxBi, m_mvpIdx[partSize], sizeof(mvpIdxBi));
+
+ /* Generate reference subpels */
+ xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_0, refIdx[0])->getPicYuvRec(), partAddr, &mv[0], roiWidth, roiHeight, &m_predYuv[partSize][0][0]);
+ xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(REF_PIC_LIST_1, refIdx[1])->getPicYuvRec(), partAddr, &mv[1], roiWidth, roiHeight, &m_predYuv[partSize][1][0]);
+
+ pixel *ref0 = m_predYuv[partSize][0][0].getLumaAddr(partAddr);
+ pixel *ref1 = m_predYuv[partSize][1][0].getLumaAddr(partAddr);
+
+ ALIGN_VAR_32(pixel, avg[MAX_CU_SIZE * MAX_CU_SIZE]);
+
+ int partEnum = partitionFromSizes(roiWidth, roiHeight);
+ primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[partSize][0][0].getStride(), ref1, m_predYuv[partSize][1][0].getStride(), 32);
+ int satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
+ x265_emms();
+ bits[2] = bits[0] + bits[1] - mbBits[0] - mbBits[1] + mbBits[2];
+ costbi = satdCost + m_rdCost->getCost(bits[2]);
+
+ if (mv[0].notZero() || mv[1].notZero())
+ {
+ ref0 = m_mref[0][refIdx[0]]->fpelPlane + (pu - fenc->getLumaAddr()); //MV(0,0) of ref0
+ ref1 = m_mref[1][refIdx[1]]->fpelPlane + (pu - fenc->getLumaAddr()); //MV(0,0) of ref1
+ intptr_t refStride = m_mref[0][refIdx[0]]->lumaStride;
+
+ primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, refStride, ref1, refStride, 32);
+ satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
+ x265_emms();
+
+ unsigned int bitsZero0, bitsZero1;
+ m_me[partSize].setMVP(mvPredBi[0][refIdxBidir[0]]);
+ bitsZero0 = bits[0] - m_me[partSize].bitcost(mv[0]) + m_me[partSize].bitcost(mvzero);
+
+ m_me[partSize].setMVP(mvPredBi[1][refIdxBidir[1]]);
+ bitsZero1 = bits[1] - m_me[partSize].bitcost(mv[1]) + m_me[partSize].bitcost(mvzero);
+
+ uint32_t costZero = satdCost + m_rdCost->getCost(bitsZero0) + m_rdCost->getCost(bitsZero1);
+
+ MV mvpZero[2];
+ int mvpidxZero[2];
+ mvpZero[0] = mvPredBi[0][refIdxBidir[0]];
+ mvpidxZero[0] = mvpIdxBi[0][refIdxBidir[0]];
+ xCheckBestMVP(&m_amvpInfo[partSize][0][refIdxBidir[0]], mvzero, mvpZero[0], mvpidxZero[0], bitsZero0, costZero, partSize);
+ mvpZero[1] = mvPredBi[1][refIdxBidir[1]];
+ mvpidxZero[1] = mvpIdxBi[1][refIdxBidir[1]];
+ xCheckBestMVP(&m_amvpInfo[partSize][1][refIdxBidir[1]], mvzero, mvpZero[1], mvpidxZero[1], bitsZero1, costZero, partSize);
+
+ if (costZero < costbi)
+ {
+ costbi = costZero;
+ mvBidir[0].x = mvBidir[0].y = 0;
+ mvBidir[1].x = mvBidir[1].y = 0;
+ mvPredBi[0][refIdxBidir[0]] = mvpZero[0];
+ mvPredBi[1][refIdxBidir[1]] = mvpZero[1];
+ mvpIdxBi[0][refIdxBidir[0]] = mvpidxZero[0];
+ mvpIdxBi[1][refIdxBidir[1]] = mvpidxZero[1];
+ bits[2] = bitsZero0 + bitsZero1 - mbBits[0] - mbBits[1] + mbBits[2];
+ }
+ }
+ } /* if (B_SLICE) */
+ } /* end if bTestNormalMC */
+
+ /* Clear Motion Field */
+ cu->getCUMvField(REF_PIC_LIST_0)->setAllMvField(TComMvField(), partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(TComMvField(), partSize, partAddr, 0, partIdx);
+
+ uint32_t mebits = 0;
+ /* Set Motion Field */
+ mv[1] = mvValidList1; /* list 1 entries are replaced by the copies tracked in the loop above */
+ refIdx[1] = refIdxValidList1;
+ bits[1] = bitsValidList1;
+ listCost[1] = costValidList1;
+
+ if (bTestNormalMC)
+ {
+ if (costbi <= listCost[0] && costbi <= listCost[1])
+ {
+ m_lastMode[partSize] = 2;
+ cu->getCUMvField(REF_PIC_LIST_0)->setAllMv(mvBidir[0], partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_0)->setAllRefIdx(refIdxBidir[0], partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_1)->setAllMv(mvBidir[1], partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_1)->setAllRefIdx(refIdxBidir[1], partSize, partAddr, 0, partIdx);
+
+ MV mvtmp = mvBidir[0] - mvPredBi[0][refIdxBidir[0]];
+ cu->getCUMvField(REF_PIC_LIST_0)->setMvd(partAddr, mvtmp);
+ mvtmp = mvBidir[1] - mvPredBi[1][refIdxBidir[1]];
+ cu->getCUMvField(REF_PIC_LIST_1)->setMvd(partAddr, mvtmp);
+
+ cu->setInterDirSubParts(3, partAddr, partIdx, cu->getDepth(0));
+
+ cu->setMVPIdx(REF_PIC_LIST_0, partAddr, mvpIdxBi[0][refIdxBidir[0]]);
+ cu->setMVPIdx(REF_PIC_LIST_1, partAddr, mvpIdxBi[1][refIdxBidir[1]]);
+
+ mebits = bits[2];
+ }
+ else if (listCost[0] <= listCost[1])
+ {
+ m_lastMode[partSize] = 0;
+ cu->getCUMvField(REF_PIC_LIST_0)->setAllMv(mv[0], partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_0)->setAllRefIdx(refIdx[0], partSize, partAddr, 0, partIdx);
+
+ MV mvtmp = mv[0] - m_mvPred[partSize][0][refIdx[0]];
+ cu->getCUMvField(REF_PIC_LIST_0)->setMvd(partAddr, mvtmp);
+
+ cu->setInterDirSubParts(1, partAddr, partIdx, cu->getDepth(0));
+
+ cu->setMVPIdx(REF_PIC_LIST_0, partAddr, m_mvpIdx[partSize][0][refIdx[0]]);
+
+ mebits = bits[0];
+ }
+ else
+ {
+ m_lastMode[partSize] = 1;
+ cu->getCUMvField(REF_PIC_LIST_1)->setAllMv(mv[1], partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_1)->setAllRefIdx(refIdx[1], partSize, partAddr, 0, partIdx);
+
+ MV mvtmp = mv[1] - m_mvPred[partSize][1][refIdx[1]];
+ cu->getCUMvField(REF_PIC_LIST_1)->setMvd(partAddr, mvtmp);
+
+ cu->setInterDirSubParts(2, partAddr, partIdx, cu->getDepth(0));
+
+ cu->setMVPIdx(REF_PIC_LIST_1, partAddr, m_mvpIdx[partSize][1][refIdx[1]]);
+
+ mebits = bits[1];
+ }
+ } /* end if bTestNormalMC */
+
+ uint32_t totalbits = mebits;
+ cu->setMergeFlag(partAddr, false);
+
+ if (cu->getPartitionSize(partAddr) != SIZE_2Nx2N) /* the merge result is only compared for non-2Nx2N partitions */
+ {
+ /* calculate ME cost */
+ uint32_t meError = MAX_UINT;
+ uint32_t meCost = MAX_UINT;
+
+ if (bTestNormalMC)
+ {
+ meError = xGetInterPredictionError(cu, partIdx);
+ meCost = meError + m_rdCost->getCost(mebits);
+ }
+
+ /* compare with Merge result */
+ if (m_mrgCost[partSize] < meCost)
+ {
+ // set Merge result
+ cu->setMergeFlag(partAddr, true);
+ cu->setMergeIndex(partAddr, m_mrgIndex[partSize]);
+ cu->setInterDirSubParts(m_mrgInterDir[partSize], partAddr, partIdx, cu->getDepth(partAddr));
+ {
+ cu->getCUMvField(REF_PIC_LIST_0)->setAllMvField(m_mrgMvField[partSize][0], partSize, partAddr, 0, partIdx);
+ cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(m_mrgMvField[partSize][1], partSize, partAddr, 0, partIdx);
+ }
+ totalbits = m_mrgBits[partSize];
+ }
+ }
+
+ if (partIdx == 0) /* first partition starts the bit total, later ones accumulate */
+ {
+ cu->m_totalBits = totalbits;
+ }
+ else
+ {
+ cu->m_totalBits += totalbits;
+ }
+
+ motionCompensation(cu, predYuv, REF_PIC_LIST_X, partIdx, bLuma, bChroma);
+
+ if (++partIdx < numPart) /* more partitions remain: queue the jobs of the next one */
+ {
+ enqueueInterSearch(cu, predYuv, partSize, bUseMRG, false);
+ }
+ else
+ {
+ m_completeBitmap[partSize] |= 1LL << (MAX_NUM_REF * 2 + 1); /* done flag for this partSize, tested by jobCompleted() */
+ }
+}
+
// AMVP
void TEncSearch::xEstimateMvPredAMVP(TComDataCU* cu, uint32_t partIdx, int list, int refIdx, MV& mvPred, AMVPInfo* amvpInfo, uint32_t* distBiP)
{
@@ -2885,10 +3263,13 @@
bestMv = amvpInfo->m_mvCand[0];
+ PartSize partSize = cu->getPartitionSize(partAddr);
+ TComYuv* templateCand = m_threading ? &m_predYuv[partSize][list][refIdx] : &m_predYuv[0][0][0];
+
//-- Check Minimum Cost.
for (i = 0; i < AMVP_MAX_NUM_CANDS; i++)
{
- uint32_t cost = xGetTemplateCost(cu, partAddr, &m_predTempYuv, amvpInfo->m_mvCand[i], list, refIdx, roiWidth, roiHeight);
+ uint32_t cost = xGetTemplateCost(cu, partAddr, templateCand, amvpInfo->m_mvCand[i], list, refIdx, roiWidth, roiHeight);
if (bestCost > cost)
{
bestCost = cost;
@@ -2901,6 +3282,7 @@
// Setting Best MVP
mvPred = bestMv;
cu->setMVPIdx(list, partAddr, bestIdx);
+ m_mvpIdx[partSize][list][refIdx] = bestIdx;
}
void TEncSearch::xGetBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
@@ -2953,13 +3335,12 @@
}
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
-void TEncSearch::xCheckBestMVP(AMVPInfo* amvpInfo, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost)
+void TEncSearch::xCheckBestMVP(AMVPInfo* amvpInfo, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost, int partSizeId)
{
assert(amvpInfo->m_mvCand[outMvpIdx] == mvPred);
- m_me.setMVP(mvPred);
int bestMvpIdx = outMvpIdx;
- int mvBitsOrig = m_me.bitcost(mv) + MVP_IDX_BITS;
+ int mvBitsOrig = m_me[partSizeId].bitcost(mvPred, mv) + MVP_IDX_BITS;
int bestMvBits = mvBitsOrig;
for (int mvpIdx = 0; mvpIdx < AMVP_MAX_NUM_CANDS; mvpIdx++)
@@ -2967,8 +3348,7 @@
if (mvpIdx == outMvpIdx)
continue;
- m_me.setMVP(amvpInfo->m_mvCand[mvpIdx]);
- int mvbits = m_me.bitcost(mv) + MVP_IDX_BITS;
+ int mvbits = m_me[partSizeId].bitcost(amvpInfo->m_mvCand[mvpIdx], mv) + MVP_IDX_BITS;
if (mvbits < bestMvBits)
{
@@ -2995,10 +3375,12 @@
cu->clipMv(mvCand);
// prediction pattern
- xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mvCand, sizex, sizey, templateCand);
+ int immedIdx = m_threading ? ((list << 5) + refIdx + MAX_NUM_REF) : 0;
+ xPredInterLumaBlk(cu, cu->getSlice()->getRefPic(list, refIdx)->getPicYuvRec(), partAddr, &mvCand, sizex, sizey, templateCand, immedIdx);
// calc distortion
- uint32_t cost = m_me.bufSAD(templateCand->getLumaAddr(partAddr), templateCand->getStride());
+ int partSizeId = m_threading ? (int)cu->getPartitionSize(0) : 0;
+ uint32_t cost = m_me[partSizeId].bufSAD(templateCand->getLumaAddr(partAddr), templateCand->getStride());
x265_emms();
return m_rdCost->calcRdSADCost(cost, MVP_IDX_BITS);
}
@@ -4300,4 +4682,102 @@
}
}
+void TEncSearch::setThreadPool(ThreadPool *p) /* store the pool whose idle threads enqueueInterSearch() pokes */
+{
+ m_pool = p;
+}
+
+bool TEncSearch::threadingInterSearch() /* reports the m_threading flag */
+{
+ return m_threading;
+}
+
+bool TEncSearch::findJob() /* claim and run one queued job; true if one was run, false if none was queued */
+{
+ unsigned long id;
+
+ /* thread safe */
+ for (int w = 0; w < m_numWords; w++) /* w also indexes m_queuedCU etc., which enqueueInterSearch() fills per partSize */
+ {
+ uint64_t oldval = m_queuedBitmap[w];
+ while (oldval)
+ {
+ CTZ64(id, oldval); /* presumably sets id to the lowest set bit of oldval - confirm against the macro */
+
+ uint64_t newval = oldval & ~(1LL << id);
+ if (ATOMIC_CAS(&m_queuedBitmap[w], oldval, newval) == oldval) /* succeeds only if the word still held oldval */
+ {
+ /* we cleared the bit, do predInterSearch */
+ predInterSearch(m_queuedCU[w], m_queuedPredYuv[w], (int)id, m_queuedbUseMRG[w], true, false); /* bLuma = true, bChroma = false */
+ return true;
+ }
+ /* some other thread cleared the bit, try another bit */
+ oldval = m_queuedBitmap[w];
+ }
+ }
+
+ /* made it through the bitmap without finding any queued motion search jobs */
+ return false;
+}
+
+void TEncSearch::enqueueInterSearch(TComDataCU* cu, TComYuv* predYuv, PartSize partSize, bool bUseMRG, bool firstPart) /* queue the search jobs of partition m_partIdx[partSize] */
+{
+ if (firstPart) /* first partition: reset the per-partSize state and remember the arguments for findJob() */
+ {
+ m_partIdx[partSize] = 0;
+ m_lastMode[partSize] = 0;
+
+ m_queuedCU[partSize] = cu;
+ m_queuedPredYuv[partSize] = predYuv;
+ m_queuedbUseMRG[partSize] = bUseMRG;
+ }
+
+ uint32_t partAddr;
+ int roiWidth, roiHeight;
+ cu->getPartIndexAndSize(m_partIdx[partSize], partAddr, roiWidth, roiHeight);
+
+ TComPicYuv *fenc = cu->getSlice()->getPic()->getPicYuvOrg();
+ Pel* pu = fenc->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + partAddr);
+ m_me[partSize].setSourcePU(pu - fenc->getLumaAddr(), roiWidth, roiHeight); /* first argument is the PU offset from the start of the luma plane */
+
+ cu->getMvPredLeft(m_mvPredictors[partSize][0]);
+ cu->getMvPredAbove(m_mvPredictors[partSize][1]);
+ cu->getMvPredAboveRight(m_mvPredictors[partSize][2]);
+
+ /* reset complete bitmap */
+ m_completeBitmap[partSize] = 0;
+
+ int numRef = cu->getSlice()->getNumRefIdx(REF_PIC_LIST_0) + cu->getSlice()->getNumRefIdx(REF_PIC_LIST_1);
+ bool bTestNormalMC = true;
+ if (bUseMRG && cu->getCUSize(0) > 8 && cu->getNumPartInter() == 2) /* same condition predInterSearch() uses to clear bTestNormalMC */
+ {
+ bTestNormalMC = false;
+ }
+ uint64_t val = 0;
+ for (int bit = 0; bTestNormalMC && bit < numRef; bit++) /* one bit per reference picture of both lists */
+ {
+ /* enqueue motion estimate */
+ val |= 1LL << bit;
+ }
+ if (partSize != SIZE_2Nx2N)
+ {
+ /* enqueue merge estimation */
+ val |= 1LL << numRef;
+ }
+ ATOMIC_OR(&m_queuedBitmap[partSize], val); /* publish the jobs; findJob() claims them one bit at a time */
+
+ m_pool->pokeIdleThread();
+}
+
+bool TEncSearch::jobCompleted() /* true once SIZE_2Nx2N (and, with bEnableRectInter, SIZE_2NxN and SIZE_Nx2N) have set their done bit */
+{
+ uint64_t complete = 1LL << (MAX_NUM_REF * 2 + 1); /* same bit predInterSearch() sets after its last partition */
+ complete &= m_completeBitmap[SIZE_2Nx2N];
+ if (m_cfg->param->bEnableRectInter) /* the rectangular modes must have finished as well */
+ {
+ complete &= m_completeBitmap[SIZE_2NxN] & m_completeBitmap[SIZE_Nx2N];
+ }
+ return complete != (uint64_t)0;
+}
+
//! \}
diff -r 33b67a53b6de -r 7b757a1a9953 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.h Fri Mar 07 14:21:13 2014 +0800
@@ -50,6 +50,7 @@
#include "primitives.h"
#include "bitcost.h"
#include "motion.h"
+#include "threadpool.h"
#define MVP_IDX_BITS 1
@@ -73,16 +74,16 @@
// ====================================================================================================================
/// encoder search class
-class TEncSearch : public TComPrediction
+class TEncSearch : public TComPrediction, public JobProvider
{
public:
- MotionEstimate m_me;
+ MotionEstimate m_me[SIZE_nRx2N + 1];
MotionReference* m_mref[2][MAX_NUM_REF + 1];
protected:
- ShortYuv* m_qtTempShortYuv;
+ ShortYuv* m_qtTempShortYuv;
pixel* m_sharedPredTransformSkip[3];
TCoeff** m_qtTempCoeffY;
@@ -108,9 +109,9 @@
// ME parameters
int m_refLagPixels;
int m_adaptiveRange[2][MAX_NUM_REF];
- MV m_mvPredictors[3];
+ MV m_mvPredictors[SIZE_nRx2N + 1][3];
- TComYuv m_tmpYuvPred; // to avoid constant memory allocation/deallocation in xGetInterPredictionError()
+ TComYuv m_tmpYuvPred[SIZE_nRx2N + 1]; // to avoid constant memory allocation/deallocation in xGetInterPredictionError()
// Color space parameters
uint32_t m_section;
@@ -119,6 +120,34 @@
uint32_t m_absPartIdxStep;
uint32_t m_partOffset;
+private:
+
+ // bitmap of motion search functions queued for processing, uses atomic intrinsics
+ uint64_t volatile *m_queuedBitmap;
+ uint64_t volatile *m_completeBitmap;
+
+ // number of words in the bitmap
+ int m_numWords;
+
+ TComDataCU** m_queuedCU;
+ TComYuv** m_queuedPredYuv;
+ bool* m_queuedbUseMRG;
+
+ MV m_mvTemp[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ MV m_mvPred[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ int m_mvpIdx[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ AMVPInfo m_amvpInfo[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ uint32_t m_costTemp[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ uint32_t m_bitsTemp[SIZE_nRx2N + 1][2][MAX_NUM_REF];
+ uint32_t m_lastMode[SIZE_nRx2N + 1];
+ int m_partIdx[SIZE_nRx2N + 1];
+
+ uint32_t m_mrgInterDir[SIZE_nRx2N + 1];
+ TComMvField m_mrgMvField[SIZE_nRx2N + 1][2];
+ uint32_t m_mrgIndex[SIZE_nRx2N + 1];
+ uint32_t m_mrgCost[SIZE_nRx2N + 1];
+ uint32_t m_mrgBits[SIZE_nRx2N + 1];
+
public:
TEncSbac*** m_rdSbacCoders;
@@ -137,6 +166,20 @@
bool init(Encoder* cfg, TComRdCost* rdCost, TComTrQuant *trQuant);
+ void setThreadPool(ThreadPool *p);
+
+ bool threadingInterSearch();
+
+ int getnumWords() { return m_numWords; }
+ // TEncSearch's implementation of JobProvider::findJob.
+ bool findJob();
+
+ bool jobCompleted();
+
+ void enqueueInterSearch(TComDataCU* cu, TComYuv* predYuv, PartSize partSize, bool bUseMRG, bool firstPart = true);
+
+ void predInterSearch(TComDataCU* cu, TComYuv* predYuv, int id, bool bUseMRG = false, bool bLuma = true, bool bChroma = true);
+
protected:
uint32_t xGetInterPredictionError(TComDataCU* cu, int partIdx);
@@ -231,7 +274,7 @@
MV& mvPred, AMVPInfo* amvpInfo, uint32_t* distBiP = NULL);
void xCheckBestMVP(AMVPInfo* amvpInfo, MV cMv, MV& mvPred, int& mvpIdx,
- uint32_t& outBits, uint32_t& outCost);
+ uint32_t& outBits, uint32_t& outCost, int partSizeId = 0);
uint32_t xGetTemplateCost(TComDataCU* cu, uint32_t partAddr, TComYuv* templateCand, MV mvCand,
int picList, int refIdx, int sizex, int sizey);
diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/bitcost.h
--- a/source/encoder/bitcost.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/encoder/bitcost.h Fri Mar 07 14:21:13 2014 +0800
@@ -44,6 +44,9 @@
// return bit cost of motion vector difference, multiplied by lambda
inline uint16_t mvcost(const MV& mv) const { return m_cost_mvx[mv.x] + m_cost_mvy[mv.y]; }
+ // return bit cost of motion vector difference, multiplied by lambda; the predictor mvp is passed in, m_mvp is not read
+ inline uint16_t mvcost(const MV& mvp, const MV& mv) const { return m_cost[mv.x - mvp.x] + m_cost[mv.y - mvp.y]; }
+
// return bit cost of motion vector difference, without lambda
inline uint16_t bitcost(const MV& mv) const
{
@@ -51,6 +54,13 @@
s_bitsizes[(abs(mv.y - m_mvp.y) << 1) + !!(mv.y < m_mvp.y)] + 0.5f);
}
+ // return bit cost of motion vector difference, without lambda; the predictor mvp is passed in, m_mvp is not read
+ inline uint16_t bitcost(const MV& mvp, const MV& mv) const
+ {
+ return (uint16_t)(s_bitsizes[(abs(mv.x - mvp.x) << 1) + !!(mv.x < mvp.x)] +
+ s_bitsizes[(abs(mv.y - mvp.y) << 1) + !!(mv.y < mvp.y)] + 0.5f);
+ }
+
static void destroy();
protected:
diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/encoder/compress.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -211,6 +211,32 @@
outTempCU->m_totalCost = m_rdCost->calcRdSADCost(distortion, outTempCU->m_totalBits);
}
+/** mark outTempCU as an inter CU of the given partSize and enqueue its threaded motion search */
+void TEncCu::xComputeCostInterEnqueue(TComDataCU* outTempCU, TComYuv* outPredYuv, PartSize partSize, bool bUseMRG)
+{
+ UChar depth = outTempCU->getDepth(0);
+
+ outTempCU->setPartSizeSubParts(partSize, 0, depth);
+ outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
+ outTempCU->setCUTransquantBypassSubParts(m_cfg->m_CUTransquantBypassFlagValue, 0, depth);
+
+ //reset the bit total; TEncSearch::predInterSearch() fills it in as partitions finish
+ outTempCU->m_totalBits = 0;
+
+ m_search->enqueueInterSearch(outTempCU, outPredYuv, partSize, bUseMRG); /* queued jobs run with bLuma = true, bChroma = false (see TEncSearch::findJob) */
+}
+
+void TEncCu::xComputeDistortionCostInter(TComDataCU* outTempCU, TComYuv* outPredYuv) /* fill in luma sa8d distortion and total cost of outTempCU */
+{
+ UChar depth = outTempCU->getDepth(0);
+
+ int part = g_convertToBit[outTempCU->getCUSize(0)]; /* picks the sa8d primitive from the CU size */
+ uint32_t distortion = primitives.sa8d[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
+ outPredYuv->getLumaAddr(), outPredYuv->getStride());
+ outTempCU->m_totalDistortion = distortion;
+ outTempCU->m_totalCost = m_rdCost->calcRdSADCost(distortion, outTempCU->m_totalBits); /* m_totalBits must already hold the search result */
+}
+
void TEncCu::xComputeCostMerge2Nx2N(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
{
assert(outTempCU->getSlice()->getSliceType() != I_SLICE);
@@ -432,22 +458,53 @@
if (!earlyskip)
{
- /*Compute 2Nx2N mode costs*/
+ if (m_search->threadingInterSearch())
{
+ m_search->enqueue();
+
+ /*Compute 2Nx2N mode costs*/
+ xComputeCostInterEnqueue(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N);
+
+ /*Compute Rect costs*/
+ if (m_cfg->param->bEnableRectInter)
+ {
+ xComputeCostInterEnqueue(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
+ xComputeCostInterEnqueue(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
+ }
+
+ while (!m_search->jobCompleted())
+ {
+ m_search->findJob();
+ }
+
+ m_search->dequeue();
+
+ xComputeDistortionCostInter(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth]);
+
+ if (m_cfg->param->bEnableRectInter)
+ {
+ xComputeDistortionCostInter(m_interCU_Nx2N[depth], m_modePredYuv[1][depth]);
+ xComputeDistortionCostInter(m_interCU_2NxN[depth], m_modePredYuv[2][depth]);
+ }
+ }
+ else
+ {
+ /*Compute 2Nx2N mode costs*/
xComputeCostInter(m_interCU_2Nx2N[depth], m_modePredYuv[0][depth], SIZE_2Nx2N);
- /*Choose best mode; initialise outBestCU to 2Nx2N*/
- outBestCU = m_interCU_2Nx2N[depth];
- tempYuv = m_modePredYuv[0][depth];
- m_modePredYuv[0][depth] = m_bestPredYuv[depth];
- m_bestPredYuv[depth] = tempYuv;
+
+ /*Compute Rect costs*/
+ if (m_cfg->param->bEnableRectInter)
+ {
+ xComputeCostInter(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
+ xComputeCostInter(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
+ }
}
- /*Compute Rect costs*/
- if (m_cfg->param->bEnableRectInter)
- {
- xComputeCostInter(m_interCU_Nx2N[depth], m_modePredYuv[1][depth], SIZE_Nx2N);
- xComputeCostInter(m_interCU_2NxN[depth], m_modePredYuv[2][depth], SIZE_2NxN);
- }
+ /*Choose best mode; initialise outBestCU to 2Nx2N*/
+ outBestCU = m_interCU_2Nx2N[depth];
+ tempYuv = m_modePredYuv[0][depth];
+ m_modePredYuv[0][depth] = m_bestPredYuv[depth];
+ m_bestPredYuv[depth] = tempYuv;
if (m_interCU_Nx2N[depth]->m_totalCost < outBestCU->m_totalCost)
{
diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/encoder/frameencoder.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -108,6 +108,8 @@
m_rows = new CTURow[m_numRows];
for (int i = 0; i < m_numRows; ++i)
{
+ m_rows[i].m_search.setThreadPool(m_pool);
+
ok &= m_rows[i].create(top);
for (int list = 0; list <= 1; list++)
@@ -344,7 +346,10 @@
double chromaLambda = lambda / crWeight;
m_rows[row].m_search.setQPLambda(qp, lambda, chromaLambda);
- m_rows[row].m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
+ for (int partSize = 0; partSize <= m_rows[row].m_search.getnumWords(); partSize++)
+ {
+ m_rows[row].m_search.m_me[partSize].setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
+ }
m_rows[row].m_rdCost.setLambda(lambda);
m_rows[row].m_rdCost.setCbDistortionWeight(cbWeight);
m_rows[row].m_rdCost.setCrDistortionWeight(crWeight);
@@ -391,7 +396,10 @@
for (int i = 0; i < m_numRows; i++)
{
m_rows[i].m_search.setQPLambda(qp, lambda, chromaLambda);
- m_rows[i].m_search.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
+ for (int partSize = 0; partSize <= m_rows[i].m_search.getnumWords(); partSize++)
+ {
+ m_rows[i].m_search.m_me[partSize].setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
+ }
m_rows[i].m_rdCost.setLambda(lambda);
m_rows[i].m_rdCost.setCbDistortionWeight(cbWeight);
m_rows[i].m_rdCost.setCrDistortionWeight(crWeight);
@@ -466,9 +474,9 @@
for (int ref = 0; ref < slice->getNumRefIdx(l); ref++)
{
wpScalingParam *w = NULL;
- if ((slice->isInterP() && slice->getPPS()->getUseWP() && slice->m_weightPredTable[l][ref][0].bPresentFlag))
+ if ((slice->isInterP() && slice->getPPS()->getUseWP() && slice->m_weightPredTable[0][l][ref][0].bPresentFlag))
{
- w = slice->m_weightPredTable[l][ref];
+ w = slice->m_weightPredTable[0][l][ref];
slice->m_numWPRefs++;
}
m_mref[l][ref].init(slice->getRefPic(l, ref)->getPicYuvRec(), w);
diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/encoder/motion.cpp Fri Mar 07 14:21:13 2014 +0800
@@ -168,7 +168,7 @@
{ \
MV tmv(mx, my); \
int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
- cost += mvcost(tmv << 2); \
+ cost += mvcost(qmvp, tmv << 2); \
if (cost < bcost) { \
bcost = cost; \
bmv = tmv; \
@@ -181,7 +181,7 @@
do \
{ \
int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
- cost += mvcost(MV(mx, my) << 2); \
+ cost += mvcost(qmvp, MV(mx, my) << 2); \
COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
} while (0)
@@ -193,9 +193,9 @@
pix_base + (m1x) + (m1y) * stride, \
pix_base + (m2x) + (m2y) * stride, \
stride, costs); \
- (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
- (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
- (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
+ (costs)[0] += mvcost(qmvp, (bmv + MV(m0x, m0y)) << 2); \
+ (costs)[1] += mvcost(qmvp, (bmv + MV(m1x, m1y)) << 2); \
+ (costs)[2] += mvcost(qmvp, (bmv + MV(m2x, m2y)) << 2); \
}
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
@@ -206,10 +206,10 @@
fref + (m2x) + (m2y) * stride, \
fref + (m3x) + (m3y) * stride, \
stride, costs); \
- costs[0] += mvcost(MV(m0x, m0y) << 2); \
- costs[1] += mvcost(MV(m1x, m1y) << 2); \
- costs[2] += mvcost(MV(m2x, m2y) << 2); \
- costs[3] += mvcost(MV(m3x, m3y) << 2); \
+ costs[0] += mvcost(qmvp, MV(m0x, m0y) << 2); \
+ costs[1] += mvcost(qmvp, MV(m1x, m1y) << 2); \
+ costs[2] += mvcost(qmvp, MV(m2x, m2y) << 2); \
+ costs[3] += mvcost(qmvp, MV(m3x, m3y) << 2); \
COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
@@ -224,10 +224,10 @@
pix_base + (m2x) + (m2y) * stride, \
pix_base + (m3x) + (m3y) * stride, \
stride, costs); \
- costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
- costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
- costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
- costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
+ costs[0] += mvcost(qmvp, (omv + MV(m0x, m0y)) << 2); \
+ costs[1] += mvcost(qmvp, (omv + MV(m1x, m1y)) << 2); \
+ costs[2] += mvcost(qmvp, (omv + MV(m2x, m2y)) << 2); \
+ costs[3] += mvcost(qmvp, (omv + MV(m3x, m3y)) << 2); \
COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
@@ -243,10 +243,10 @@
pix_base + (m2x) + (m2y) * stride, \
pix_base + (m3x) + (m3y) * stride, \
stride, costs); \
- (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
- (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
- (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
- (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
+ (costs)[0] += mvcost(qmvp, (bmv + MV(m0x, m0y)) << 2); \
+ (costs)[1] += mvcost(qmvp, (bmv + MV(m1x, m1y)) << 2); \
+ (costs)[2] += mvcost(qmvp, (bmv + MV(m2x, m2y)) << 2); \
+ (costs)[3] += mvcost(qmvp, (bmv + MV(m3x, m3y)) << 2); \
}
#define DIA1_ITER(mx, my) \
@@ -284,6 +284,7 @@
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
const MV & mvmin,
const MV & mvmax,
+ const MV & qmvp,
MV & bmv,
int & bcost,
int & bPointNr,
@@ -563,13 +564,13 @@
int bcost = bprecost;
if (pmv.isSubpel())
{
- bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
+ bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(qmvp, bmv << 2);
}
// measure SAD cost at MV(0) if MVP is not zero
if (pmv.notZero())
{
- int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
+ int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(qmvp, MV(0, 0));
if (cost < bcost)
{
bcost = cost;
@@ -585,9 +586,9 @@
{
int cost;
if (ref->isLowres)
- cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
+ cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(qmvp, m);
else
- cost = subpelCompare(ref, m, sad) + mvcost(m);
+ cost = subpelCompare(ref, m, sad) + mvcost(qmvp, m);
if (cost < bprecost)
{
@@ -891,7 +892,7 @@
int bDistance = 0;
const int EarlyExitIters = 3;
- StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
+ StarPatternSearch(ref, mvmin, mvmax, qmvp, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
if (bDistance == 1)
{
// if best distance was only 1, check two missing points. If no new point is found, stop
@@ -940,16 +941,16 @@
pix_base + RasterDistance * 2,
pix_base + RasterDistance * 3,
stride, costs);
- costs[0] += mvcost(tmv << 2);
+ costs[0] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[0], bmv, tmv);
tmv.x += RasterDistance;
- costs[1] += mvcost(tmv << 2);
+ costs[1] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[1], bmv, tmv);
tmv.x += RasterDistance;
- costs[2] += mvcost(tmv << 2);
+ costs[2] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[2], bmv, tmv);
tmv.x += RasterDistance;
- costs[3] += mvcost(tmv << 3);
+ costs[3] += mvcost(qmvp, tmv << 3);
COPY2_IF_LT(bcost, costs[3], bmv, tmv);
}
else
@@ -964,7 +965,7 @@
bDistance = 0;
bPointNr = 0;
const int MaxIters = 32;
- StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
+ StarPatternSearch(ref, mvmin, mvmax, qmvp, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
if (bDistance == 1)
{
@@ -1012,16 +1013,16 @@
pix_base + 2,
pix_base + 3,
stride, costs);
- costs[0] += mvcost(tmv << 2);
+ costs[0] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[0], bmv, tmv);
tmv.x++;
- costs[1] += mvcost(tmv << 2);
+ costs[1] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[1], bmv, tmv);
tmv.x++;
- costs[2] += mvcost(tmv << 2);
+ costs[2] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[2], bmv, tmv);
tmv.x++;
- costs[3] += mvcost(tmv << 2);
+ costs[3] += mvcost(qmvp, tmv << 2);
COPY2_IF_LT(bcost, costs[3], bmv, tmv);
}
else
@@ -1057,18 +1058,18 @@
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2;
- cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+ cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmvp, qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
bmv += square1[bdir] * 2;
- bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
+ bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(qmvp, bmv);
bdir = 0;
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
- cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+ cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmvp, qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
@@ -1080,7 +1081,7 @@
if (wl.hpel_satd)
{
- bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
+ bcost = subpelCompare(ref, bmv, satd) + mvcost(qmvp, bmv);
hpelcomp = satd;
}
else
@@ -1092,7 +1093,7 @@
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2;
- cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
+ cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmvp, qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
@@ -1101,7 +1102,7 @@
/* if HPEL search used SAD, remeasure with SATD before QPEL */
if (!wl.hpel_satd)
- bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
+ bcost = subpelCompare(ref, bmv, satd) + mvcost(qmvp, bmv);
for (int iter = 0; iter < wl.qpel_iters; iter++)
{
@@ -1109,7 +1110,7 @@
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
- cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
+ cost = subpelCompare(ref, qmv, satd) + mvcost(qmvp, qmv);
COPY2_IF_LT(bcost, cost, bdir, i);
}
diff -r 33b67a53b6de -r 7b757a1a9953 source/encoder/motion.h
--- a/source/encoder/motion.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/encoder/motion.h Fri Mar 07 14:21:13 2014 +0800
@@ -99,6 +99,7 @@
inline void StarPatternSearch(ReferencePlanes *ref,
const MV & mvmin,
const MV & mvmax,
+ const MV & qmvp,
MV & bmv,
int & bcost,
int & bPointNr,
More information about the x265-devel
mailing list