[x265] reduce malloc call to avoid fragment and overhead

Steve Borho steve at borho.org
Thu May 8 20:29:37 CEST 2014


On Thu, May 8, 2014 at 4:57 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1399542737 -32400
> #      Thu May 08 18:52:17 2014 +0900
> # Node ID 237ce67bc16fcea4696e119b7e5f34b640445d8c
> # Parent  8e64aa56d6352a1a8cfb6fb57cb547607bcec9b1
> reduce malloc call to avoid fragment and overhead

Nice; queued for validation.

>
> diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibCommon/TComDataCU.cpp
> --- a/source/Lib/TLibCommon/TComDataCU.cpp      Thu May 08 11:01:04 2014 +0530
> +++ b/source/Lib/TLibCommon/TComDataCU.cpp      Thu May 08 18:52:17 2014 +0900
> @@ -116,6 +116,9 @@
>      tmp = g_convertToBit[tmp] + 2;
>      m_unitMask = ~((1 << tmp) - 1);
>
> +    uint32_t sizeL = cuSize * cuSize;
> +    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
> +
>      bool ok = true;
>      ok &= m_cuMvField[0].create(numPartition);
>      ok &= m_cuMvField[1].create(numPartition);
> @@ -134,25 +137,25 @@
>      CHECKED_MALLOC(m_interDir, uint8_t, numPartition);
>
>      CHECKED_MALLOC(m_trIdx, uint8_t, numPartition);
> -    CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition);
> -    CHECKED_MALLOC(m_transformSkip[1], uint8_t, numPartition);
> -    CHECKED_MALLOC(m_transformSkip[2], uint8_t, numPartition);
> +    CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition * 3);
> +    m_transformSkip[1] = m_transformSkip[0] + numPartition;
> +    m_transformSkip[2] = m_transformSkip[0] + numPartition * 2;
>
> -    CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition);
> -    CHECKED_MALLOC(m_cbf[1], uint8_t, numPartition);
> -    CHECKED_MALLOC(m_cbf[2], uint8_t, numPartition);
> +    CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition * 3);
> +    m_cbf[1] = m_cbf[0] + numPartition;
> +    m_cbf[2] = m_cbf[0] + numPartition * 2;
>
>      CHECKED_MALLOC(m_mvpIdx[0], uint8_t, numPartition * 2);
>      m_mvpIdx[1] = m_mvpIdx[0] + numPartition;
>
> -    CHECKED_MALLOC(m_trCoeffY, coeff_t, cuSize * cuSize);
> -    CHECKED_MALLOC(m_trCoeffCb, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> -    CHECKED_MALLOC(m_trCoeffCr, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> +    CHECKED_MALLOC(m_trCoeffY, coeff_t, sizeL + sizeC * 2);
> +    m_trCoeffCb = m_trCoeffY + sizeL;
> +    m_trCoeffCr = m_trCoeffY + sizeL + sizeC;
>
>      CHECKED_MALLOC(m_iPCMFlags, bool, numPartition);
> -    CHECKED_MALLOC(m_iPCMSampleY, pixel, cuSize * cuSize);
> -    CHECKED_MALLOC(m_iPCMSampleCb, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> -    CHECKED_MALLOC(m_iPCMSampleCr, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
> +    CHECKED_MALLOC(m_iPCMSampleY, pixel, sizeL + sizeC * 2);
> +    m_iPCMSampleCb = m_iPCMSampleY + sizeL;
> +    m_iPCMSampleCr = m_iPCMSampleY + sizeL + sizeC;
>
>      memset(m_partSizes, SIZE_NONE, numPartition * sizeof(*m_partSizes));
>      return ok;
> @@ -168,23 +171,15 @@
>      X265_FREE(m_depth);
>      X265_FREE(m_cuSize);
>      X265_FREE(m_cbf[0]);
> -    X265_FREE(m_cbf[1]);
> -    X265_FREE(m_cbf[2]);
>      X265_FREE(m_interDir);
>      X265_FREE(m_bMergeFlags);
>      X265_FREE(m_lumaIntraDir);
>      X265_FREE(m_chromaIntraDir);
>      X265_FREE(m_trIdx);
>      X265_FREE(m_transformSkip[0]);
> -    X265_FREE(m_transformSkip[1]);
> -    X265_FREE(m_transformSkip[2]);
>      X265_FREE(m_trCoeffY);
> -    X265_FREE(m_trCoeffCb);
> -    X265_FREE(m_trCoeffCr);
>      X265_FREE(m_iPCMFlags);
>      X265_FREE(m_iPCMSampleY);
> -    X265_FREE(m_iPCMSampleCb);
> -    X265_FREE(m_iPCMSampleCr);
>      X265_FREE(m_mvpIdx[0]);
>      X265_FREE(m_cuTransquantBypass);
>      X265_FREE(m_skipFlag);
> @@ -258,8 +253,6 @@
>          memset(m_iPCMFlags,          false,         numElements * sizeof(*m_iPCMFlags));
>      }
>
> -    uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
> -    uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
>      {
>          m_cuMvField[0].clearMvField();
>          m_cuMvField[1].clearMvField();
> @@ -267,6 +260,8 @@
>          // TODO: can be remove, but I haven't data to verify it, remove later
>          if (getSlice()->getSPS()->getUsePCM())
>          {
> +            uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
> +            uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
>              memset(m_iPCMSampleY, 0, sizeof(pixel) * y_tmp);
>              memset(m_iPCMSampleCb, 0, sizeof(pixel) * c_tmp);
>              memset(m_iPCMSampleCr, 0, sizeof(pixel) * c_tmp);
> @@ -598,9 +593,8 @@
>      uint32_t tmp2 = partUnitIdx * tmp;
>      memcpy(m_trCoeffY  + tmp2, cu->getCoeffY(),  sizeof(coeff_t) * tmp);
>      memcpy(m_iPCMSampleY + tmp2, cu->getPCMSampleY(), sizeof(pixel) * tmp);
> -
>      tmp  >>= m_hChromaShift + m_vChromaShift;
> -    tmp2 = partUnitIdx * tmp;
> +    tmp2 >>= m_hChromaShift + m_vChromaShift;
>      memcpy(m_trCoeffCb + tmp2, cu->getCoeffCb(), sizeof(coeff_t) * tmp);
>      memcpy(m_trCoeffCr + tmp2, cu->getCoeffCr(), sizeof(coeff_t) * tmp);
>      memcpy(m_iPCMSampleCb + tmp2, cu->getPCMSampleCb(), sizeof(pixel) * tmp);
> @@ -684,7 +678,6 @@
>      uint32_t tmp  = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
>      uint32_t tmp2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
>      memcpy(rpcCU->getCoeffY() + tmp2, m_trCoeffY, sizeof(coeff_t) * tmp);
> -
>      tmp  >>= m_hChromaShift + m_vChromaShift;
>      tmp2 >>= m_hChromaShift + m_vChromaShift;
>      memcpy(rpcCU->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
> @@ -737,7 +730,6 @@
>      uint32_t tmp2 = partOffset << m_pic->getLog2UnitSize() * 2;
>      memcpy(cu->getCoeffY()  + tmp2, m_trCoeffY,  sizeof(coeff_t) * tmp);
>      memcpy(cu->getPCMSampleY() + tmp2, m_iPCMSampleY, sizeof(pixel) * tmp);
> -
>      tmp  >>= m_hChromaShift + m_vChromaShift;
>      tmp2 >>= m_hChromaShift + m_vChromaShift;
>      memcpy(cu->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
> diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Thu May 08 11:01:04 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Thu May 08 18:52:17 2014 +0900
> @@ -82,24 +82,15 @@
>          for (uint32_t i = 0; i < numLayersToAllocate; ++i)
>          {
>              X265_FREE(m_qtTempCoeffY[i]);
> -            X265_FREE(m_qtTempCoeffCb[i]);
> -            X265_FREE(m_qtTempCoeffCr[i]);
>              m_qtTempShortYuv[i].destroy();
>          }
>      }
>      X265_FREE(m_qtTempTUCoeffY);
> -    X265_FREE(m_qtTempTUCoeffCb);
> -    X265_FREE(m_qtTempTUCoeffCr);
>      X265_FREE(m_qtTempTrIdx);
> -    for (uint32_t i = 0; i < 3; ++i)
> -    {
> -        X265_FREE(m_qtTempCbf[i]);
> -        X265_FREE(m_qtTempTransformSkipFlag[i]);
> -    }
> +    X265_FREE(m_qtTempCbf[0]);
> +    X265_FREE(m_qtTempTransformSkipFlag[0]);
>
>      delete[] m_qtTempCoeffY;
> -    delete[] m_qtTempCoeffCb;
> -    delete[] m_qtTempCoeffCr;
>      delete[] m_qtTempShortYuv;
>      m_qtTempTransformSkipYuv.destroy();
>  }
> @@ -119,30 +110,32 @@
>      m_refLagPixels = cfg->param->frameNumThreads > 1 ? cfg->param->searchRange : cfg->param->sourceHeight;
>
>      const uint32_t numLayersToAllocate = cfg->m_quadtreeTULog2MaxSize - cfg->m_quadtreeTULog2MinSize + 1;
> -    m_qtTempCoeffY   = new coeff_t*[numLayersToAllocate];
> -    m_qtTempCoeffCb  = new coeff_t*[numLayersToAllocate];
> -    m_qtTempCoeffCr  = new coeff_t*[numLayersToAllocate];
> +    m_qtTempCoeffY   = new coeff_t*[numLayersToAllocate * 3];
> +    m_qtTempCoeffCb  = m_qtTempCoeffY + numLayersToAllocate;
> +    m_qtTempCoeffCr  = m_qtTempCoeffY + numLayersToAllocate * 2;
>      m_qtTempShortYuv = new ShortYuv[numLayersToAllocate];
> +    uint32_t sizeL = g_maxCUSize * g_maxCUSize;
> +    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
>      for (uint32_t i = 0; i < numLayersToAllocate; ++i)
>      {
> -        m_qtTempCoeffY[i]  = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize);
> -        m_qtTempCoeffCb[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
> -        m_qtTempCoeffCr[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
> +        m_qtTempCoeffY[i]  = X265_MALLOC(coeff_t, sizeL + sizeC * 2);
> +        m_qtTempCoeffCb[i] = m_qtTempCoeffY[i] + sizeL;
> +        m_qtTempCoeffCr[i] = m_qtTempCoeffY[i] + sizeL + sizeC;
>          m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
>      }
>
>      const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
>      CHECKED_MALLOC(m_qtTempTrIdx, uint8_t, numPartitions);
> -    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
> -    CHECKED_MALLOC(m_qtTempCbf[1], uint8_t, numPartitions);
> -    CHECKED_MALLOC(m_qtTempCbf[2], uint8_t, numPartitions);
> -    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
> -    CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
> -    CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
> -
> -    CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> -    CHECKED_MALLOC(m_qtTempTUCoeffCb, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> -    CHECKED_MALLOC(m_qtTempTUCoeffCr, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> +    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
> +    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
> +    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
> +    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
> +    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
> +    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
> +
> +    CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT * 3);
> +    m_qtTempTUCoeffCb = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT;
> +    m_qtTempTUCoeffCr = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT * 2;
>
>      return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel



-- 
Steve Borho


More information about the x265-devel mailing list