[x265] reduce malloc calls to avoid fragmentation and overhead
Satoshi Nakagawa
nakagawa424 at oki.com
Thu May 8 11:57:34 CEST 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1399542737 -32400
# Thu May 08 18:52:17 2014 +0900
# Node ID 237ce67bc16fcea4696e119b7e5f34b640445d8c
# Parent 8e64aa56d6352a1a8cfb6fb57cb547607bcec9b1
reduce malloc call to avoid fragment and overhead
diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp Thu May 08 11:01:04 2014 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.cpp Thu May 08 18:52:17 2014 +0900
@@ -116,6 +116,9 @@
tmp = g_convertToBit[tmp] + 2;
m_unitMask = ~((1 << tmp) - 1);
+ uint32_t sizeL = cuSize * cuSize;
+ uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
+
bool ok = true;
ok &= m_cuMvField[0].create(numPartition);
ok &= m_cuMvField[1].create(numPartition);
@@ -134,25 +137,25 @@
CHECKED_MALLOC(m_interDir, uint8_t, numPartition);
CHECKED_MALLOC(m_trIdx, uint8_t, numPartition);
- CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition);
- CHECKED_MALLOC(m_transformSkip[1], uint8_t, numPartition);
- CHECKED_MALLOC(m_transformSkip[2], uint8_t, numPartition);
+ CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition * 3);
+ m_transformSkip[1] = m_transformSkip[0] + numPartition;
+ m_transformSkip[2] = m_transformSkip[0] + numPartition * 2;
- CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition);
- CHECKED_MALLOC(m_cbf[1], uint8_t, numPartition);
- CHECKED_MALLOC(m_cbf[2], uint8_t, numPartition);
+ CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition * 3);
+ m_cbf[1] = m_cbf[0] + numPartition;
+ m_cbf[2] = m_cbf[0] + numPartition * 2;
CHECKED_MALLOC(m_mvpIdx[0], uint8_t, numPartition * 2);
m_mvpIdx[1] = m_mvpIdx[0] + numPartition;
- CHECKED_MALLOC(m_trCoeffY, coeff_t, cuSize * cuSize);
- CHECKED_MALLOC(m_trCoeffCb, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
- CHECKED_MALLOC(m_trCoeffCr, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
+ CHECKED_MALLOC(m_trCoeffY, coeff_t, sizeL + sizeC * 2);
+ m_trCoeffCb = m_trCoeffY + sizeL;
+ m_trCoeffCr = m_trCoeffY + sizeL + sizeC;
CHECKED_MALLOC(m_iPCMFlags, bool, numPartition);
- CHECKED_MALLOC(m_iPCMSampleY, pixel, cuSize * cuSize);
- CHECKED_MALLOC(m_iPCMSampleCb, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
- CHECKED_MALLOC(m_iPCMSampleCr, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
+ CHECKED_MALLOC(m_iPCMSampleY, pixel, sizeL + sizeC * 2);
+ m_iPCMSampleCb = m_iPCMSampleY + sizeL;
+ m_iPCMSampleCr = m_iPCMSampleY + sizeL + sizeC;
memset(m_partSizes, SIZE_NONE, numPartition * sizeof(*m_partSizes));
return ok;
@@ -168,23 +171,15 @@
X265_FREE(m_depth);
X265_FREE(m_cuSize);
X265_FREE(m_cbf[0]);
- X265_FREE(m_cbf[1]);
- X265_FREE(m_cbf[2]);
X265_FREE(m_interDir);
X265_FREE(m_bMergeFlags);
X265_FREE(m_lumaIntraDir);
X265_FREE(m_chromaIntraDir);
X265_FREE(m_trIdx);
X265_FREE(m_transformSkip[0]);
- X265_FREE(m_transformSkip[1]);
- X265_FREE(m_transformSkip[2]);
X265_FREE(m_trCoeffY);
- X265_FREE(m_trCoeffCb);
- X265_FREE(m_trCoeffCr);
X265_FREE(m_iPCMFlags);
X265_FREE(m_iPCMSampleY);
- X265_FREE(m_iPCMSampleCb);
- X265_FREE(m_iPCMSampleCr);
X265_FREE(m_mvpIdx[0]);
X265_FREE(m_cuTransquantBypass);
X265_FREE(m_skipFlag);
@@ -258,8 +253,6 @@
memset(m_iPCMFlags, false, numElements * sizeof(*m_iPCMFlags));
}
- uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
- uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
{
m_cuMvField[0].clearMvField();
m_cuMvField[1].clearMvField();
@@ -267,6 +260,8 @@
// TODO: can be remove, but I haven't data to verify it, remove later
if (getSlice()->getSPS()->getUsePCM())
{
+ uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
+ uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
memset(m_iPCMSampleY, 0, sizeof(pixel) * y_tmp);
memset(m_iPCMSampleCb, 0, sizeof(pixel) * c_tmp);
memset(m_iPCMSampleCr, 0, sizeof(pixel) * c_tmp);
@@ -598,9 +593,8 @@
uint32_t tmp2 = partUnitIdx * tmp;
memcpy(m_trCoeffY + tmp2, cu->getCoeffY(), sizeof(coeff_t) * tmp);
memcpy(m_iPCMSampleY + tmp2, cu->getPCMSampleY(), sizeof(pixel) * tmp);
-
tmp >>= m_hChromaShift + m_vChromaShift;
- tmp2 = partUnitIdx * tmp;
+ tmp2 >>= m_hChromaShift + m_vChromaShift;
memcpy(m_trCoeffCb + tmp2, cu->getCoeffCb(), sizeof(coeff_t) * tmp);
memcpy(m_trCoeffCr + tmp2, cu->getCoeffCr(), sizeof(coeff_t) * tmp);
memcpy(m_iPCMSampleCb + tmp2, cu->getPCMSampleCb(), sizeof(pixel) * tmp);
@@ -684,7 +678,6 @@
uint32_t tmp = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
uint32_t tmp2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
memcpy(rpcCU->getCoeffY() + tmp2, m_trCoeffY, sizeof(coeff_t) * tmp);
-
tmp >>= m_hChromaShift + m_vChromaShift;
tmp2 >>= m_hChromaShift + m_vChromaShift;
memcpy(rpcCU->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
@@ -737,7 +730,6 @@
uint32_t tmp2 = partOffset << m_pic->getLog2UnitSize() * 2;
memcpy(cu->getCoeffY() + tmp2, m_trCoeffY, sizeof(coeff_t) * tmp);
memcpy(cu->getPCMSampleY() + tmp2, m_iPCMSampleY, sizeof(pixel) * tmp);
-
tmp >>= m_hChromaShift + m_vChromaShift;
tmp2 >>= m_hChromaShift + m_vChromaShift;
memcpy(cu->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu May 08 11:01:04 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu May 08 18:52:17 2014 +0900
@@ -82,24 +82,15 @@
for (uint32_t i = 0; i < numLayersToAllocate; ++i)
{
X265_FREE(m_qtTempCoeffY[i]);
- X265_FREE(m_qtTempCoeffCb[i]);
- X265_FREE(m_qtTempCoeffCr[i]);
m_qtTempShortYuv[i].destroy();
}
}
X265_FREE(m_qtTempTUCoeffY);
- X265_FREE(m_qtTempTUCoeffCb);
- X265_FREE(m_qtTempTUCoeffCr);
X265_FREE(m_qtTempTrIdx);
- for (uint32_t i = 0; i < 3; ++i)
- {
- X265_FREE(m_qtTempCbf[i]);
- X265_FREE(m_qtTempTransformSkipFlag[i]);
- }
+ X265_FREE(m_qtTempCbf[0]);
+ X265_FREE(m_qtTempTransformSkipFlag[0]);
delete[] m_qtTempCoeffY;
- delete[] m_qtTempCoeffCb;
- delete[] m_qtTempCoeffCr;
delete[] m_qtTempShortYuv;
m_qtTempTransformSkipYuv.destroy();
}
@@ -119,30 +110,32 @@
m_refLagPixels = cfg->param->frameNumThreads > 1 ? cfg->param->searchRange : cfg->param->sourceHeight;
const uint32_t numLayersToAllocate = cfg->m_quadtreeTULog2MaxSize - cfg->m_quadtreeTULog2MinSize + 1;
- m_qtTempCoeffY = new coeff_t*[numLayersToAllocate];
- m_qtTempCoeffCb = new coeff_t*[numLayersToAllocate];
- m_qtTempCoeffCr = new coeff_t*[numLayersToAllocate];
+ m_qtTempCoeffY = new coeff_t*[numLayersToAllocate * 3];
+ m_qtTempCoeffCb = m_qtTempCoeffY + numLayersToAllocate;
+ m_qtTempCoeffCr = m_qtTempCoeffY + numLayersToAllocate * 2;
m_qtTempShortYuv = new ShortYuv[numLayersToAllocate];
+ uint32_t sizeL = g_maxCUSize * g_maxCUSize;
+ uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
for (uint32_t i = 0; i < numLayersToAllocate; ++i)
{
- m_qtTempCoeffY[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize);
- m_qtTempCoeffCb[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
- m_qtTempCoeffCr[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
+ m_qtTempCoeffY[i] = X265_MALLOC(coeff_t, sizeL + sizeC * 2);
+ m_qtTempCoeffCb[i] = m_qtTempCoeffY[i] + sizeL;
+ m_qtTempCoeffCr[i] = m_qtTempCoeffY[i] + sizeL + sizeC;
m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
}
const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
CHECKED_MALLOC(m_qtTempTrIdx, uint8_t, numPartitions);
- CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
- CHECKED_MALLOC(m_qtTempCbf[1], uint8_t, numPartitions);
- CHECKED_MALLOC(m_qtTempCbf[2], uint8_t, numPartitions);
- CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
- CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
- CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
-
- CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
- CHECKED_MALLOC(m_qtTempTUCoeffCb, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
- CHECKED_MALLOC(m_qtTempTUCoeffCr, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
+ CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
+ m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
+ m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
+ CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
+ m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
+ m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+
+ CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT * 3);
+ m_qtTempTUCoeffCb = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT;
+ m_qtTempTUCoeffCr = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT * 2;
return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
More information about the x265-devel mailing list