[x265] refine cbf==0 path: remove clearing coeff and resi
Satoshi Nakagawa
nakagawa424 at oki.com
Mon Jun 2 04:47:58 CEST 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1401677099 -32400
# Mon Jun 02 11:44:59 2014 +0900
# Node ID 73f86312c2e0aa5a105e84b0045478e02c8a03e7
# Parent a5998df9b12ef81e48e7c5b89219a74276a75f27
refine cbf==0 path: remove clearing coeff and resi
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncEntropy.cpp
--- a/source/Lib/TLibEncoder/TEncEntropy.cpp Mon Jun 02 07:36:20 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncEntropy.cpp Mon Jun 02 11:44:59 2014 +0900
@@ -202,7 +202,6 @@
void TEncEntropy::initTUEntropySection(TComTURecurse *tuIterator, uint32_t splitMode, uint32_t absPartIdxStep, uint32_t m_absPartIdxTU)
{
- tuIterator->m_partOffset = 0;
tuIterator->m_section = 0;
tuIterator->m_absPartIdxTURelCU = m_absPartIdxTU;
tuIterator->m_splitMode = splitMode;
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncEntropy.h
--- a/source/Lib/TLibEncoder/TEncEntropy.h Mon Jun 02 07:36:20 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncEntropy.h Mon Jun 02 11:44:59 2014 +0900
@@ -66,7 +66,6 @@
uint32_t m_splitMode;
uint32_t m_absPartIdxTURelCU;
uint32_t m_absPartIdxStep;
- uint32_t m_partOffset;
};
// ====================================================================================================================
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncSbac.cpp
--- a/source/Lib/TLibEncoder/TEncSbac.cpp Mon Jun 02 07:36:20 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSbac.cpp Mon Jun 02 11:44:59 2014 +0900
@@ -2120,8 +2120,9 @@
// compute number of significant coefficients
uint32_t numSig = primitives.count_nonzero(coeff, trSize * trSize);
- if (numSig == 0)
- return;
+#if CHECKED_BUILD || _DEBUG
+ X265_CHECK(numSig > 0, "cbf check fail");
+#endif
bool beValid;
if (cu->getCUTransquantBypass(absPartIdx))
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Jun 02 07:36:20 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Mon Jun 02 11:44:59 2014 +0900
@@ -408,8 +408,8 @@
coeff_t* coeff = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
int16_t* reconQt = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
-
X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
+ const uint32_t reconQtStride = MAX_CU_SIZE;
uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
pixel* reconIPred = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
@@ -443,25 +443,29 @@
//--- set coded block flag ---
cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
- //--- inverse transform ---
if (absSum)
{
+ //--- inverse transform ---
int scalingListType = 0 + TEXT_LUMA;
- X265_CHECK(scalingListType < 6, "scalingListType is too large %d\n", scalingListType);
+ X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
+ X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
+ //===== reconstruction =====
+ primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+ //===== update distortion =====
+ outDist += primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
}
else
{
- int16_t* resiTmp = residual;
+#if CHECKED_BUILD || _DEBUG
memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
- primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
+#endif
+ //===== reconstruction =====
+ primitives.luma_copy_ps[part](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred, stride);
+ //===== update distortion =====
+ outDist += primitives.sse_pp[part](pred, stride, fenc, stride);
}
-
- X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
- //===== reconstruction =====
- primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
- //===== update distortion =====
- outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
}
void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
@@ -519,67 +523,67 @@
primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
//===== transform and quantization =====
+ //--- init rate estimation arrays for RDOQ ---
+ if (useTransformSkipChroma ? m_cfg->bEnableRDOQTS : m_cfg->bEnableRDOQ)
{
- //--- init rate estimation arrays for RDOQ ---
- if (useTransformSkipChroma ? m_cfg->bEnableRDOQTS : m_cfg->bEnableRDOQ)
- {
- m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, tuSize, ttype);
- }
- //--- transform and quantization ---
- uint32_t absSum = 0;
- int lastPos = -1;
-
- int curChromaQpOffset;
- if (ttype == TEXT_CHROMA_U)
- {
- curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
- }
- else
- {
- curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
- }
- m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
-
- m_trQuant->selectLambda(TEXT_CHROMA);
-
- absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipChroma);
-
- //--- set coded block flag ---
- cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdx, absPartIdxStep);
-
+ m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, tuSize, ttype);
+ }
+
+ //--- transform and quantization ---
+ uint32_t absSum = 0;
+ int lastPos = -1;
+
+ int curChromaQpOffset;
+ if (ttype == TEXT_CHROMA_U)
+ {
+ curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
+ }
+ else
+ {
+ curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
+ }
+ m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
+ m_trQuant->selectLambda(TEXT_CHROMA);
+
+ absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipChroma);
+
+ //--- set coded block flag ---
+ cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdx, absPartIdxStep);
+
+ uint32_t dist;
+ if (absSum)
+ {
//--- inverse transform ---
- if (absSum)
- {
- int scalingListType = 0 + ttype;
- X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);
- }
- else
- {
- int16_t* resiTmp = residual;
- memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
- primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
- }
+ int scalingListType = 0 + ttype;
+ X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);
+ X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);
+ //===== reconstruction =====
+ primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+ //===== update distortion =====
+ dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
}
-
- X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual alignment check failure\n");
- X265_CHECK(tuSize <= 32, "tuSize invalud\n");
- //===== reconstruction =====
- primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
- //===== update distortion =====
- uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
+ else
+ {
+#if CHECKED_BUILD || _DEBUG
+ memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
+#endif
+ //===== reconstruction =====
+ primitives.square_copy_ps[sizeIdx](reconQt, reconQtStride, pred, stride);
+ primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);
+ //===== update distortion =====
+ dist = primitives.sse_pp[part](pred, stride, fenc, stride);
+ }
+
+ X265_CHECK(ttype == TEXT_CHROMA_U || ttype == TEXT_CHROMA_V, "invalid ttype\n");
if (ttype == TEXT_CHROMA_U)
{
outDist += m_rdCost->scaleChromaDistCb(dist);
}
- else if (ttype == TEXT_CHROMA_V)
+ else
{
outDist += m_rdCost->scaleChromaDistCr(dist);
}
- else
- {
- outDist += dist;
- }
}
void TEncSearch::xRecurIntraCodingQT(TComDataCU* cu,
@@ -784,15 +788,15 @@
cu->setTransformSkipSubParts(bestModeId, TEXT_LUMA, absPartIdx, fullDepth);
//--- set reconstruction for next intra prediction blocks ---
- uint32_t width = cu->getCUSize(0) >> trDepth;
- uint32_t height = cu->getCUSize(0) >> trDepth;
uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
int16_t* src = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
+ const uint32_t srcstride = MAX_CU_SIZE;
pixel* dst = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
uint32_t dststride = cu->getPic()->getPicYuvRec()->getStride();
- primitives.blockcpy_ps(width, height, dst, dststride, src, MAX_CU_SIZE);
+ int sizeIdx = trSizeLog2 - 2;
+ primitives.square_copy_sp[sizeIdx](dst, dststride, src, srcstride);
}
outDistY += singleDistY;
@@ -866,25 +870,29 @@
//--- set coded block flag ---
cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
- //--- inverse transform ---
+ int part = partitionFromSize(tuSize);
+
if (absSum)
{
+ //--- inverse transform ---
int scalingListType = 0 + TEXT_LUMA;
X265_CHECK(scalingListType < 6, "scalingListType %d\n", scalingListType);
m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);
+
+ // Generate Recon
+ primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
+ primitives.luma_copy_pp[part](reconIPred, reconIPredStride, recon, stride);
}
else
{
- int16_t* resiTmp = residual;
+#if CHECKED_BUILD || _DEBUG
memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
- primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
+#endif
+
+ // Generate Recon
+ primitives.luma_copy_pp[part](recon, stride, pred, stride);
+ primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred, stride);
}
-
- //Generate Recon
- X265_CHECK(tuSize <= 32, "tuSize is too large\n");
- int part = partitionFromSize(tuSize);
- primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
- primitives.blockcpy_pp(tuSize, tuSize, reconIPred, reconIPredStride, recon, stride);
}
if (bCheckSplit && !bCheckFull)
@@ -980,8 +988,10 @@
pixel* reconIPred = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zOrder);
uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
int16_t* reconQt = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
- primitives.blockcpy_ps(trSize, trSize, reconIPred, reconIPredStride, reconQt, MAX_CU_SIZE);
X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
+ const uint32_t reconQtStride = MAX_CU_SIZE;
+ int sizeIdx = trSizeLog2 - 2;
+ primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride, reconQt, reconQtStride);
}
void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
@@ -1059,8 +1069,7 @@
}
//===== copy transform coefficients =====
- uint32_t trSizeC = 1 << trSizeCLog2;
- uint32_t numCoeffC = 1 << trSizeCLog2 * 2;
+ uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);
uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffDst = m_qtTempCoeff[chromaId][qtlayer] + coeffOffsetC;
@@ -1072,12 +1081,13 @@
m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, chromaId, splitIntoSubTUs);
uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
- uint32_t reconQtStride = m_qtTempShortYuv[qtlayer].m_cwidth;
uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
pixel* reconIPred = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
int16_t* reconQt = m_qtTempShortYuv[qtlayer].getChromaAddr(chromaId, absPartIdx);
- primitives.blockcpy_ps(trSizeC, trSizeC, reconIPred, reconIPredStride, reconQt, reconQtStride);
+ uint32_t reconQtStride = m_qtTempShortYuv[qtlayer].m_cwidth;
+ int sizeIdxC = trSizeCLog2 - 2;
+ primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride, reconQt, reconQtStride);
}
}
@@ -1387,6 +1397,7 @@
uint32_t stride = fencYuv->getCStride();
const bool splitIntoSubTUs = (chFmt == CHROMA_422);
int sizeIdx = g_convertToBit[tuSize];
+ int part = partitionFromSize(tuSize);
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
@@ -1456,28 +1467,28 @@
//--- set coded block flag ---
cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);
- //--- inverse transform ---
if (absSum)
{
+ //--- inverse transform ---
int scalingListType = 0 + ttype;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);
+
+ //===== reconstruction =====
+ // use square primitives
+ primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);
+ primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, recon, stride);
}
else
{
- int16_t* resiTmp = residual;
+#if CHECKED_BUILD || _DEBUG
memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
- primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);
+#endif
+
+ //===== reconstruction =====
+ primitives.square_copy_pp[sizeIdx](recon, stride, pred, stride);
+ primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);
}
-
- //===== reconstruction =====
- X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual alignment check failed\n");
- X265_CHECK(tuSize <= 32, "tuSize out of range\n");
-
- // use square primitive
- int part = partitionFromSize(tuSize);
- primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);
- primitives.chroma[CHROMA_444].copy_pp[part](reconIPred, reconIPredStride, recon, stride);
}
while (isNextSection(&tuIterator));
@@ -1859,7 +1870,6 @@
void TEncSearch::initSection(TComTURecurse *tuIterator, uint32_t splitMode, uint32_t absPartIdxStep, uint32_t m_absPartIdxTU)
{
- tuIterator->m_partOffset = 0;
tuIterator->m_section = 0;
tuIterator->m_absPartIdxTURelCU = m_absPartIdxTU;
tuIterator->m_splitMode = splitMode;
@@ -1874,16 +1884,21 @@
{
uint32_t depth = cu->getDepth(0);
uint32_t initTrDepth = (cu->getPartitionSize(0) != SIZE_2Nx2N) && (cu->getChromaFormat() == CHROMA_444 ? 1 : 0);
-
+ uint32_t tuSize = cu->getCUSize(0) >> initTrDepth;
uint32_t splitMode = (initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT;
uint32_t absPartIdx = (cu->getPic()->getNumPartInCU() >> (depth << 1));
+ int chFmt = cu->getChromaFormat();
+ int part = partitionFromSize(tuSize);
+
TComTURecurse tuIterator;
initSection(&tuIterator, splitMode, absPartIdx);
do
{
+ uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
+
uint32_t bestMode = 0;
uint32_t bestDist = 0;
uint64_t bestCost = MAX_INT64;
@@ -1893,9 +1908,7 @@
uint32_t maxMode = NUM_CHROMA_MODE;
uint32_t modeList[NUM_CHROMA_MODE];
- tuIterator.m_partOffset = tuIterator.m_absPartIdxTURelCU;
-
- cu->getAllowedChromaDir(tuIterator.m_partOffset, modeList);
+ cu->getAllowedChromaDir(absPartIdxC, modeList);
//----- check chroma modes -----
for (uint32_t mode = minMode; mode < maxMode; mode++)
@@ -1906,16 +1919,16 @@
//----- chroma coding -----
uint32_t dist = 0;
- cu->setChromIntraDirSubParts(modeList[mode], tuIterator.m_partOffset, depth + initTrDepth);
-
- xRecurIntraChromaCodingQT(cu, initTrDepth, tuIterator.m_absPartIdxTURelCU, fencYuv, predYuv, resiYuv, dist);
+ cu->setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
+
+ xRecurIntraChromaCodingQT(cu, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, dist);
if (cu->getSlice()->getPPS()->getUseTransformSkip())
{
m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
}
- uint32_t bits = xGetIntraBitsQT(cu, initTrDepth, tuIterator.m_absPartIdxTURelCU, tuIterator.m_absPartIdxStep, false, true);
+ uint32_t bits = xGetIntraBitsQT(cu, initTrDepth, absPartIdxC, tuIterator.m_absPartIdxStep, false, true);
uint64_t cost = m_rdCost->calcRdCost(dist, bits);
//----- compare -----
@@ -1924,37 +1937,36 @@
bestCost = cost;
bestDist = dist;
bestMode = modeList[mode];
- xSetIntraResultChromaQT(cu, initTrDepth, tuIterator.m_absPartIdxTURelCU, reconYuv);
- ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- ::memcpy(m_qtTempTransformSkipFlag[1], cu->getTransformSkip(TEXT_CHROMA_U) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- ::memcpy(m_qtTempTransformSkipFlag[2], cu->getTransformSkip(TEXT_CHROMA_V) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ xSetIntraResultChromaQT(cu, initTrDepth, absPartIdxC, reconYuv);
+ ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ ::memcpy(m_qtTempTransformSkipFlag[1], cu->getTransformSkip(TEXT_CHROMA_U) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ ::memcpy(m_qtTempTransformSkipFlag[2], cu->getTransformSkip(TEXT_CHROMA_V) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));
}
}
if (!isLastSection(&tuIterator))
{
- uint32_t compWidth = (cu->getCUSize(0) >> m_hChromaShift) >> initTrDepth;
- uint32_t compHeight = (cu->getCUSize(0) >> m_vChromaShift) >> initTrDepth;
- uint32_t zorder = cu->getZorderIdxInCU() + tuIterator.m_partOffset;
- pixel* dst = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
+ uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
uint32_t dststride = cu->getPic()->getPicYuvRec()->getCStride();
- pixel* src = reconYuv->getCbAddr(tuIterator.m_partOffset);
uint32_t srcstride = reconYuv->getCStride();
-
- primitives.blockcpy_pp(compWidth, compHeight, dst, dststride, src, srcstride);
-
- dst = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
- src = reconYuv->getCrAddr(tuIterator.m_partOffset);
- primitives.blockcpy_pp(compWidth, compHeight, dst, dststride, src, srcstride);
+ pixel *src, *dst;
+
+ dst = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);
+ src = reconYuv->getCbAddr(absPartIdxC);
+ primitives.chroma[chFmt].copy_pp[part](dst, dststride, src, srcstride);
+
+ dst = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
+ src = reconYuv->getCrAddr(absPartIdxC);
+ primitives.chroma[chFmt].copy_pp[part](dst, dststride, src, srcstride);
}
//----- set data -----
- ::memcpy(cu->getCbf(TEXT_CHROMA_U) + tuIterator.m_partOffset, m_qtTempCbf[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- ::memcpy(cu->getCbf(TEXT_CHROMA_V) + tuIterator.m_partOffset, m_qtTempCbf[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) + tuIterator.m_partOffset, m_qtTempTransformSkipFlag[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) + tuIterator.m_partOffset, m_qtTempTransformSkipFlag[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
- cu->setChromIntraDirSubParts(bestMode, tuIterator.m_partOffset, depth + initTrDepth);
+ ::memcpy(cu->getCbf(TEXT_CHROMA_U) + absPartIdxC, m_qtTempCbf[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ ::memcpy(cu->getCbf(TEXT_CHROMA_V) + absPartIdxC, m_qtTempCbf[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));
+ cu->setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
cu->m_totalDistortion += bestDist;
}
while (isNextSection(&tuIterator));
@@ -2685,9 +2697,11 @@
::memset(cu->getCbf(TEXT_LUMA), 0, qpartnum * sizeof(uint8_t));
::memset(cu->getCbf(TEXT_CHROMA_U), 0, qpartnum * sizeof(uint8_t));
::memset(cu->getCbf(TEXT_CHROMA_V), 0, qpartnum * sizeof(uint8_t));
+#if CHECKED_BUILD || _DEBUG
::memset(cu->getCoeffY(), 0, cuSize * cuSize * sizeof(coeff_t));
::memset(cu->getCoeffCb(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
::memset(cu->getCoeffCr(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));
+#endif
cu->setTransformSkipSubParts(0, 0, 0, 0, cu->getDepth(0));
}
else
@@ -2841,25 +2855,26 @@
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
m_trQuant->selectLambda(TEXT_LUMA);
- absSumY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
+ int16_t *curResiY = resiYuv->getLumaAddr(absPartIdx);
+ const uint32_t strideResiY = resiYuv->m_width;
+ const uint32_t strideResiC = resiYuv->m_cwidth;
+
+ absSumY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,
trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);
cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
if (absSumY)
{
- int16_t *curResiY = resiYuv->getLumaAddr(absPartIdx);
-
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
int scalingListType = 3 + TEXT_LUMA;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, resiYuv->m_width, coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY, coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only
}
else
{
- int16_t *ptr = resiYuv->getLumaAddr(absPartIdx);
- primitives.blockfill_s[sizeIdx](ptr, resiYuv->m_width, 0);
+ primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
}
cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
@@ -2873,6 +2888,9 @@
uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
+ int16_t *curResiU = resiYuv->getCbAddr(absPartIdxC);
+ int16_t *curResiV = resiYuv->getCrAddr(absPartIdxC);
+
cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -2881,12 +2899,12 @@
m_trQuant->selectLambda(TEXT_CHROMA);
- absSumU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,
+ absSumU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUBufferOffset,
trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);
curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
- absSumV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,
+ absSumV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUBufferOffset,
trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);
cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -2894,34 +2912,29 @@
if (absSumU)
{
- int16_t *pcResiCurrU = resiYuv->getCbAddr(absPartIdxC);
-
curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
int scalingListType = 3 + TEXT_CHROMA_U;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset, trSizeC, scalingListType, false, lastPosU);
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset, trSizeC, scalingListType, false, lastPosU);
}
else
{
- int16_t *ptr = resiYuv->getCbAddr(absPartIdxC);
- primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);
+ primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
}
if (absSumV)
{
- int16_t *curResiV = resiYuv->getCrAddr(absPartIdxC);
curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
int scalingListType = 3 + TEXT_CHROMA_V;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset, trSizeC, scalingListType, false, lastPosV);
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset, trSizeC, scalingListType, false, lastPosV);
}
else
{
- int16_t *ptr = resiYuv->getCrAddr(absPartIdxC);
- primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);
+ primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
}
cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -3027,6 +3040,8 @@
if (bCheckFull)
{
uint32_t trSizeC = 1 << trSizeCLog2;
+ int sizeIdx = trSizeLog2 - 2;
+ int sizeIdxC = trSizeCLog2 - 2;
const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
@@ -3070,7 +3085,7 @@
do
{
uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
- uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
+ uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -3112,7 +3127,8 @@
while (isNextSection(&tuIterator));
}
- const uint32_t numSamplesLuma = 1 << (trSizeLog2 << 1);
+ const uint32_t numCoeffY = 1 << (trSizeLog2 * 2);
+ const uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
@@ -3123,6 +3139,10 @@
int partSize = partitionFromSize(trSize);
uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);
+ int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
+ X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
+ const uint32_t strideResiY = MAX_CU_SIZE;
+ const uint32_t strideResiC = m_qtTempShortYuv[qtlayer].m_cwidth;
if (outZeroDist)
{
@@ -3130,16 +3150,13 @@
}
if (absSum[TEXT_LUMA][0])
{
- int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
-
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
int scalingListType = 3 + TEXT_LUMA;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE, coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
-
- const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx), MAX_CU_SIZE);
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY, coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only
+
+ const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, curResiY, strideResiY);
if (cu->isLosslessCoded(0))
{
distY = nonZeroDistY;
@@ -3154,7 +3171,9 @@
if (nullCostY < singleCostY)
{
absSum[TEXT_LUMA][0] = 0;
- ::memset(coeffCurY, 0, sizeof(coeff_t) * numSamplesLuma);
+#if CHECKED_BUILD || _DEBUG
+ ::memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
+#endif
if (checkTransformSkipY)
{
minCost[TEXT_LUMA][0] = nullCostY;
@@ -3182,10 +3201,7 @@
if (!absSum[TEXT_LUMA][0])
{
- int16_t *ptr = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
- X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
- int sizeIdx = trSizeLog2 - 2;
- primitives.blockfill_s[sizeIdx](ptr, MAX_CU_SIZE, 0);
+ primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
}
cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);
@@ -3197,13 +3213,15 @@
initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
int partSizeC = partitionFromSize(trSizeC);
- const uint32_t numSamplesChroma = trSizeC * trSizeC;
do
{
uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;
uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;
+ int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
+ int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
+
distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
if (outZeroDist)
@@ -3212,18 +3230,15 @@
}
if (absSum[TEXT_CHROMA_U][tuIterator.m_section])
{
- int16_t *pcResiCurrU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
-
int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
int scalingListType = 3 + TEXT_CHROMA_U;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,
trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
- m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
- m_qtTempShortYuv[qtlayer].m_cwidth);
+ curResiU, strideResiC);
const uint32_t nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
if (cu->isLosslessCoded(0))
@@ -3240,7 +3255,9 @@
if (nullCostU < singleCostU)
{
absSum[TEXT_CHROMA_U][tuIterator.m_section] = 0;
- ::memset(coeffCurU + subTUBufferOffset, 0, sizeof(coeff_t) * numSamplesChroma);
+#if CHECKED_BUILD || _DEBUG
+ ::memset(coeffCurU + subTUBufferOffset, 0, sizeof(coeff_t) * numCoeffC);
+#endif
if (checkTransformSkipUV)
{
minCost[TEXT_CHROMA_U][tuIterator.m_section] = nullCostU;
@@ -3268,10 +3285,7 @@
if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])
{
- int16_t *ptr = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
- const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
- int sizeIdxC = trSizeCLog2 - 2;
- primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
+ primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
}
distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));
@@ -3281,17 +3295,15 @@
}
if (absSum[TEXT_CHROMA_V][tuIterator.m_section])
{
- int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();
m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
int scalingListType = 3 + TEXT_CHROMA_V;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,
trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
- m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
- m_qtTempShortYuv[qtlayer].m_cwidth);
+ curResiV, strideResiC);
const uint32_t nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
if (cu->isLosslessCoded(0))
@@ -3308,7 +3320,9 @@
if (nullCostV < singleCostV)
{
absSum[TEXT_CHROMA_V][tuIterator.m_section] = 0;
- ::memset(coeffCurV + subTUBufferOffset, 0, sizeof(coeff_t) * numSamplesChroma);
+#if CHECKED_BUILD || _DEBUG
+ ::memset(coeffCurV + subTUBufferOffset, 0, sizeof(coeff_t) * numCoeffC);
+#endif
if (checkTransformSkipUV)
{
minCost[TEXT_CHROMA_V][tuIterator.m_section] = nullCostV;
@@ -3336,10 +3350,7 @@
if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])
{
- int16_t *ptr = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
- const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
- int sizeIdxC = trSizeCLog2 - 2;
- primitives.blockfill_s[sizeIdxC](ptr, stride, 0);
+ primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
}
cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -3354,17 +3365,11 @@
uint32_t nonZeroDistY = 0, absSumTransformSkipY;
uint64_t singleCostY = MAX_INT64;
- int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);
- X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
-
- coeff_t bestCoeffY[32 * 32];
- memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numSamplesLuma);
-
- int16_t bestResiY[32 * 32];
- for (int i = 0; i < trSize; ++i)
- {
- memcpy(bestResiY + i * trSize, curResiY + i * MAX_CU_SIZE, sizeof(int16_t) * trSize);
- }
+ coeff_t bestCoeffY[MAX_TS_SIZE * MAX_TS_SIZE];
+ memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numCoeffY);
+
+ int16_t bestResiY[MAX_TS_SIZE * MAX_TS_SIZE];
+ primitives.square_copy_ss[sizeIdx](bestResiY, trSize, curResiY, strideResiY);
m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
@@ -3393,13 +3398,11 @@
int scalingListType = 3 + TEXT_LUMA;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");
-
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE, coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
+
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY, coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);
nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,
- m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx),
- MAX_CU_SIZE);
+ curResiY, strideResiY);
singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);
}
@@ -3407,11 +3410,8 @@
if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)
{
cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
- memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numSamplesLuma);
- for (int i = 0; i < trSize; ++i)
- {
- memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i * trSize], sizeof(int16_t) * trSize);
- }
+ memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numCoeffY);
+ primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, bestResiY, trSize);
}
else
{
@@ -3435,7 +3435,6 @@
initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
int partSizeC = partitionFromSize(trSizeC);
- const uint32_t numSamplesChroma = trSizeC * trSizeC;
do
{
@@ -3444,18 +3443,14 @@
int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);
int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);
- uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;
-
- coeff_t bestCoeffU[32 * 32], bestCoeffV[32 * 32];
- memcpy(bestCoeffU, coeffCurU + subTUBufferOffset, sizeof(coeff_t) * numSamplesChroma);
- memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numSamplesChroma);
-
- int16_t bestResiU[32 * 32], bestResiV[32 * 32];
- for (int i = 0; i < trSizeC; ++i)
- {
- memcpy(&bestResiU[i * trSizeC], curResiU + i * stride, sizeof(int16_t) * trSizeC);
- memcpy(&bestResiV[i * trSizeC], curResiV + i * stride, sizeof(int16_t) * trSizeC);
- }
+
+ coeff_t bestCoeffU[MAX_TS_SIZE * MAX_TS_SIZE], bestCoeffV[MAX_TS_SIZE * MAX_TS_SIZE];
+ memcpy(bestCoeffU, coeffCurU + subTUBufferOffset, sizeof(coeff_t) * numCoeffC);
+ memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numCoeffC);
+
+ int16_t bestResiU[MAX_TS_SIZE * MAX_TS_SIZE], bestResiV[MAX_TS_SIZE * MAX_TS_SIZE];
+ primitives.square_copy_ss[sizeIdxC](bestResiU, trSizeC, curResiU, strideResiC);
+ primitives.square_copy_ss[sizeIdxC](bestResiV, trSizeC, curResiV, strideResiC);
cu->setTransformSkipPartRange(1, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
cu->setTransformSkipPartRange(1, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
@@ -3493,11 +3488,10 @@
int scalingListType = 3 + TEXT_CHROMA_U;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,
trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,
- m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),
- m_qtTempShortYuv[qtlayer].m_cwidth);
+ curResiU, strideResiC);
nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);
singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);
}
@@ -3506,11 +3500,8 @@
{
cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);
- memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numSamplesChroma);
- for (int i = 0; i < trSizeC; ++i)
- {
- memcpy(curResiU + i * stride, &bestResiU[i * trSizeC], sizeof(int16_t) * trSizeC);
- }
+ memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numCoeffC);
+ primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, bestResiU, trSizeC);
}
else
{
@@ -3530,11 +3521,10 @@
int scalingListType = 3 + TEXT_CHROMA_V;
X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);
- m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,
+ m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,
trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,
- m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),
- m_qtTempShortYuv[qtlayer].m_cwidth);
+ curResiV, strideResiC);
nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);
singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);
}
@@ -3543,11 +3533,8 @@
{
cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
- memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numSamplesChroma);
- for (int i = 0; i < trSizeC; ++i)
- {
- memcpy(curResiV + i * stride, &bestResiV[i * trSizeC], sizeof(int16_t) * trSizeC);
- }
+ memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numCoeffC);
+ primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, bestResiV, trSizeC);
}
else
{
@@ -3560,6 +3547,7 @@
cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);
}
while (isNextSection(&tuIterator));
+
}
m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);
@@ -3929,7 +3917,7 @@
if (bCodeChroma)
{
- m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, 1 << trSizeLog2, (bChromaSame && (chFmt != CHROMA_422)));
+ m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, trSize, (bChromaSame && (chFmt != CHROMA_422)));
}
}
else
diff -r a5998df9b12e -r 73f86312c2e0 source/common/primitives.cpp
--- a/source/common/primitives.cpp Mon Jun 02 07:36:20 2014 +0530
+++ b/source/common/primitives.cpp Mon Jun 02 11:44:59 2014 +0900
@@ -55,6 +55,11 @@
LUMA_4x4, LUMA_8x8, 255, LUMA_16x16, 255, 255, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_64x64
};
+extern const uint8_t lumaPartitionsFromSquareBlocksTable[] =
+{
+ LUMA_4x4, LUMA_8x8, LUMA_16x16, LUMA_32x32, LUMA_64x64
+};
+
/* the "authoritative" set of encoder primitives */
EncoderPrimitives primitives;
@@ -72,6 +77,31 @@
Setup_C_IPredPrimitives(p); // intrapred.cpp
Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
}
+
+static void Setup_Alias_Primitives(EncoderPrimitives &p)
+{
+ /* copy reusable luma primitives to chroma 4:4:4 */
+ for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
+ {
+ p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
+ p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
+ p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
+ p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
+ p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
+ p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
+ p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
+ }
+
+ for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
+ {
+ int partL = lumaPartitionsFromSquareBlocksTable[i];
+ p.sad_square[i] = p.sad[partL];
+ p.square_copy_pp[i] = p.luma_copy_pp[partL];
+ p.square_copy_ps[i] = p.luma_copy_ps[partL];
+ p.square_copy_sp[i] = p.luma_copy_sp[partL];
+ p.square_copy_ss[i] = p.luma_copy_ss[partL];
+ }
+}
}
using namespace x265;
@@ -95,6 +125,8 @@
x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
#endif
+ Setup_Alias_Primitives(primitives);
+
initROM();
}
diff -r a5998df9b12e -r 73f86312c2e0 source/common/primitives.h
--- a/source/common/primitives.h Mon Jun 02 07:36:20 2014 +0530
+++ b/source/common/primitives.h Mon Jun 02 11:44:59 2014 +0900
@@ -213,6 +213,10 @@
copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS];
pixel_sub_ps_t luma_sub_ps[NUM_LUMA_PARTITIONS];
pixel_add_ps_t luma_add_ps[NUM_LUMA_PARTITIONS];
+ copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS];
+ copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS];
+ copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS];
+ copy_ss_t square_copy_ss[NUM_SQUARE_BLOCKS];
filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
filter_hps_t luma_hps[NUM_LUMA_PARTITIONS];
diff -r a5998df9b12e -r 73f86312c2e0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jun 02 07:36:20 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jun 02 11:44:59 2014 +0900
@@ -1316,30 +1316,12 @@
}
#endif // if HIGH_BIT_DEPTH
- /* copy reusable luma primitives to chroma 4:4:4 */
- for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
- {
- p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
- p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
- p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
- p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
- p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
- p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
- p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
- }
-
primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4];
primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8];
primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];
primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];
- primitives.sad_square[BLOCK_4x4] = primitives.sad[LUMA_4x4];
- primitives.sad_square[BLOCK_8x8] = primitives.sad[LUMA_8x8];
- primitives.sad_square[BLOCK_16x16] = primitives.sad[LUMA_16x16];
- primitives.sad_square[BLOCK_32x32] = primitives.sad[LUMA_32x32];
- primitives.sad_square[BLOCK_64x64] = primitives.sad[LUMA_64x64];
-
// SA8D devolves to SATD for blocks not even multiples of 8x8
primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4];
primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8];
More information about the x265-devel
mailing list