[x265] reduce malloc calls to avoid fragmentation and overhead
Steve Borho
steve at borho.org
Thu May 8 20:29:37 CEST 2014
On Thu, May 8, 2014 at 4:57 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1399542737 -32400
> # Thu May 08 18:52:17 2014 +0900
> # Node ID 237ce67bc16fcea4696e119b7e5f34b640445d8c
> # Parent 8e64aa56d6352a1a8cfb6fb57cb547607bcec9b1
> reduce malloc call to avoid fragment and overhead
Nice; queued for validation.
>
> diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibCommon/TComDataCU.cpp
> --- a/source/Lib/TLibCommon/TComDataCU.cpp Thu May 08 11:01:04 2014 +0530
> +++ b/source/Lib/TLibCommon/TComDataCU.cpp Thu May 08 18:52:17 2014 +0900
> @@ -116,6 +116,9 @@
> tmp = g_convertToBit[tmp] + 2;
> m_unitMask = ~((1 << tmp) - 1);
>
> + uint32_t sizeL = cuSize * cuSize;
> + uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
> +
> bool ok = true;
> ok &= m_cuMvField[0].create(numPartition);
> ok &= m_cuMvField[1].create(numPartition);
> @@ -134,25 +137,25 @@
> CHECKED_MALLOC(m_interDir, uint8_t, numPartition);
>
> CHECKED_MALLOC(m_trIdx, uint8_t, numPartition);
> - CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition);
> - CHECKED_MALLOC(m_transformSkip[1], uint8_t, numPartition);
> - CHECKED_MALLOC(m_transformSkip[2], uint8_t, numPartition);
> + CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition * 3);
> + m_transformSkip[1] = m_transformSkip[0] + numPartition;
> + m_transformSkip[2] = m_transformSkip[0] + numPartition * 2;
>
> - CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition);
> - CHECKED_MALLOC(m_cbf[1], uint8_t, numPartition);
> - CHECKED_MALLOC(m_cbf[2], uint8_t, numPartition);
> + CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition * 3);
> + m_cbf[1] = m_cbf[0] + numPartition;
> + m_cbf[2] = m_cbf[0] + numPartition * 2;
>
> CHECKED_MALLOC(m_mvpIdx[0], uint8_t, numPartition * 2);
> m_mvpIdx[1] = m_mvpIdx[0] + numPartition;
>
> - CHECKED_MALLOC(m_trCoeffY, coeff_t, cuSize * cuSize);
> - CHECKED_MALLOC(m_trCoeffCb, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> - CHECKED_MALLOC(m_trCoeffCr, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> + CHECKED_MALLOC(m_trCoeffY, coeff_t, sizeL + sizeC * 2);
> + m_trCoeffCb = m_trCoeffY + sizeL;
> + m_trCoeffCr = m_trCoeffY + sizeL + sizeC;
>
> CHECKED_MALLOC(m_iPCMFlags, bool, numPartition);
> - CHECKED_MALLOC(m_iPCMSampleY, pixel, cuSize * cuSize);
> - CHECKED_MALLOC(m_iPCMSampleCb, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> - CHECKED_MALLOC(m_iPCMSampleCr, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> + CHECKED_MALLOC(m_iPCMSampleY, pixel, sizeL + sizeC * 2);
> + m_iPCMSampleCb = m_iPCMSampleY + sizeL;
> + m_iPCMSampleCr = m_iPCMSampleY + sizeL + sizeC;
>
> memset(m_partSizes, SIZE_NONE, numPartition * sizeof(*m_partSizes));
> return ok;
> @@ -168,23 +171,15 @@
> X265_FREE(m_depth);
> X265_FREE(m_cuSize);
> X265_FREE(m_cbf[0]);
> - X265_FREE(m_cbf[1]);
> - X265_FREE(m_cbf[2]);
> X265_FREE(m_interDir);
> X265_FREE(m_bMergeFlags);
> X265_FREE(m_lumaIntraDir);
> X265_FREE(m_chromaIntraDir);
> X265_FREE(m_trIdx);
> X265_FREE(m_transformSkip[0]);
> - X265_FREE(m_transformSkip[1]);
> - X265_FREE(m_transformSkip[2]);
> X265_FREE(m_trCoeffY);
> - X265_FREE(m_trCoeffCb);
> - X265_FREE(m_trCoeffCr);
> X265_FREE(m_iPCMFlags);
> X265_FREE(m_iPCMSampleY);
> - X265_FREE(m_iPCMSampleCb);
> - X265_FREE(m_iPCMSampleCr);
> X265_FREE(m_mvpIdx[0]);
> X265_FREE(m_cuTransquantBypass);
> X265_FREE(m_skipFlag);
> @@ -258,8 +253,6 @@
> memset(m_iPCMFlags, false, numElements * sizeof(*m_iPCMFlags));
> }
>
> - uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
> - uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
> {
> m_cuMvField[0].clearMvField();
> m_cuMvField[1].clearMvField();
> @@ -267,6 +260,8 @@
> // TODO: can be remove, but I haven't data to verify it, remove later
> if (getSlice()->getSPS()->getUsePCM())
> {
> + uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
> + uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
> memset(m_iPCMSampleY, 0, sizeof(pixel) * y_tmp);
> memset(m_iPCMSampleCb, 0, sizeof(pixel) * c_tmp);
> memset(m_iPCMSampleCr, 0, sizeof(pixel) * c_tmp);
> @@ -598,9 +593,8 @@
> uint32_t tmp2 = partUnitIdx * tmp;
> memcpy(m_trCoeffY + tmp2, cu->getCoeffY(), sizeof(coeff_t) * tmp);
> memcpy(m_iPCMSampleY + tmp2, cu->getPCMSampleY(), sizeof(pixel) * tmp);
> -
> tmp >>= m_hChromaShift + m_vChromaShift;
> - tmp2 = partUnitIdx * tmp;
> + tmp2 >>= m_hChromaShift + m_vChromaShift;
> memcpy(m_trCoeffCb + tmp2, cu->getCoeffCb(), sizeof(coeff_t) * tmp);
> memcpy(m_trCoeffCr + tmp2, cu->getCoeffCr(), sizeof(coeff_t) * tmp);
> memcpy(m_iPCMSampleCb + tmp2, cu->getPCMSampleCb(), sizeof(pixel) * tmp);
> @@ -684,7 +678,6 @@
> uint32_t tmp = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
> uint32_t tmp2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
> memcpy(rpcCU->getCoeffY() + tmp2, m_trCoeffY, sizeof(coeff_t) * tmp);
> -
> tmp >>= m_hChromaShift + m_vChromaShift;
> tmp2 >>= m_hChromaShift + m_vChromaShift;
> memcpy(rpcCU->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
> @@ -737,7 +730,6 @@
> uint32_t tmp2 = partOffset << m_pic->getLog2UnitSize() * 2;
> memcpy(cu->getCoeffY() + tmp2, m_trCoeffY, sizeof(coeff_t) * tmp);
> memcpy(cu->getPCMSampleY() + tmp2, m_iPCMSampleY, sizeof(pixel) * tmp);
> -
> tmp >>= m_hChromaShift + m_vChromaShift;
> tmp2 >>= m_hChromaShift + m_vChromaShift;
> memcpy(cu->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
> diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu May 08 11:01:04 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu May 08 18:52:17 2014 +0900
> @@ -82,24 +82,15 @@
> for (uint32_t i = 0; i < numLayersToAllocate; ++i)
> {
> X265_FREE(m_qtTempCoeffY[i]);
> - X265_FREE(m_qtTempCoeffCb[i]);
> - X265_FREE(m_qtTempCoeffCr[i]);
> m_qtTempShortYuv[i].destroy();
> }
> }
> X265_FREE(m_qtTempTUCoeffY);
> - X265_FREE(m_qtTempTUCoeffCb);
> - X265_FREE(m_qtTempTUCoeffCr);
> X265_FREE(m_qtTempTrIdx);
> - for (uint32_t i = 0; i < 3; ++i)
> - {
> - X265_FREE(m_qtTempCbf[i]);
> - X265_FREE(m_qtTempTransformSkipFlag[i]);
> - }
> + X265_FREE(m_qtTempCbf[0]);
> + X265_FREE(m_qtTempTransformSkipFlag[0]);
>
> delete[] m_qtTempCoeffY;
> - delete[] m_qtTempCoeffCb;
> - delete[] m_qtTempCoeffCr;
> delete[] m_qtTempShortYuv;
> m_qtTempTransformSkipYuv.destroy();
> }
> @@ -119,30 +110,32 @@
> m_refLagPixels = cfg->param->frameNumThreads > 1 ? cfg->param->searchRange : cfg->param->sourceHeight;
>
> const uint32_t numLayersToAllocate = cfg->m_quadtreeTULog2MaxSize - cfg->m_quadtreeTULog2MinSize + 1;
> - m_qtTempCoeffY = new coeff_t*[numLayersToAllocate];
> - m_qtTempCoeffCb = new coeff_t*[numLayersToAllocate];
> - m_qtTempCoeffCr = new coeff_t*[numLayersToAllocate];
> + m_qtTempCoeffY = new coeff_t*[numLayersToAllocate * 3];
> + m_qtTempCoeffCb = m_qtTempCoeffY + numLayersToAllocate;
> + m_qtTempCoeffCr = m_qtTempCoeffY + numLayersToAllocate * 2;
> m_qtTempShortYuv = new ShortYuv[numLayersToAllocate];
> + uint32_t sizeL = g_maxCUSize * g_maxCUSize;
> + uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
> for (uint32_t i = 0; i < numLayersToAllocate; ++i)
> {
> - m_qtTempCoeffY[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize);
> - m_qtTempCoeffCb[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
> - m_qtTempCoeffCr[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
> + m_qtTempCoeffY[i] = X265_MALLOC(coeff_t, sizeL + sizeC * 2);
> + m_qtTempCoeffCb[i] = m_qtTempCoeffY[i] + sizeL;
> + m_qtTempCoeffCr[i] = m_qtTempCoeffY[i] + sizeL + sizeC;
> m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
> }
>
> const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
> CHECKED_MALLOC(m_qtTempTrIdx, uint8_t, numPartitions);
> - CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
> - CHECKED_MALLOC(m_qtTempCbf[1], uint8_t, numPartitions);
> - CHECKED_MALLOC(m_qtTempCbf[2], uint8_t, numPartitions);
> - CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
> - CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
> - CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
> -
> - CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> - CHECKED_MALLOC(m_qtTempTUCoeffCb, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> - CHECKED_MALLOC(m_qtTempTUCoeffCr, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> + CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
> + m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
> + m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
> + CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
> + m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
> + m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
> +
> + CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT * 3);
> + m_qtTempTUCoeffCb = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT;
> + m_qtTempTUCoeffCr = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT * 2;
>
> return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list