[x265] reduce malloc calls to avoid fragmentation and overhead

Satoshi Nakagawa nakagawa424 at oki.com
Thu May 8 11:57:34 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1399542737 -32400
#      Thu May 08 18:52:17 2014 +0900
# Node ID 237ce67bc16fcea4696e119b7e5f34b640445d8c
# Parent  8e64aa56d6352a1a8cfb6fb57cb547607bcec9b1
reduce malloc calls to avoid fragmentation and overhead

diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp	Thu May 08 11:01:04 2014 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.cpp	Thu May 08 18:52:17 2014 +0900
@@ -116,6 +116,9 @@
     tmp = g_convertToBit[tmp] + 2;
     m_unitMask = ~((1 << tmp) - 1);
 
+    uint32_t sizeL = cuSize * cuSize;
+    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
+
     bool ok = true;
     ok &= m_cuMvField[0].create(numPartition);
     ok &= m_cuMvField[1].create(numPartition);
@@ -134,25 +137,25 @@
     CHECKED_MALLOC(m_interDir, uint8_t, numPartition);
 
     CHECKED_MALLOC(m_trIdx, uint8_t, numPartition);
-    CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition);
-    CHECKED_MALLOC(m_transformSkip[1], uint8_t, numPartition);
-    CHECKED_MALLOC(m_transformSkip[2], uint8_t, numPartition);
+    CHECKED_MALLOC(m_transformSkip[0], uint8_t, numPartition * 3);
+    m_transformSkip[1] = m_transformSkip[0] + numPartition;
+    m_transformSkip[2] = m_transformSkip[0] + numPartition * 2;
 
-    CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition);
-    CHECKED_MALLOC(m_cbf[1], uint8_t, numPartition);
-    CHECKED_MALLOC(m_cbf[2], uint8_t, numPartition);
+    CHECKED_MALLOC(m_cbf[0], uint8_t, numPartition * 3);
+    m_cbf[1] = m_cbf[0] + numPartition;
+    m_cbf[2] = m_cbf[0] + numPartition * 2;
 
     CHECKED_MALLOC(m_mvpIdx[0], uint8_t, numPartition * 2);
     m_mvpIdx[1] = m_mvpIdx[0] + numPartition;
 
-    CHECKED_MALLOC(m_trCoeffY, coeff_t, cuSize * cuSize);
-    CHECKED_MALLOC(m_trCoeffCb, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
-    CHECKED_MALLOC(m_trCoeffCr, coeff_t, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
+    CHECKED_MALLOC(m_trCoeffY, coeff_t, sizeL + sizeC * 2);
+    m_trCoeffCb = m_trCoeffY + sizeL;
+    m_trCoeffCr = m_trCoeffY + sizeL + sizeC;
 
     CHECKED_MALLOC(m_iPCMFlags, bool, numPartition);
-    CHECKED_MALLOC(m_iPCMSampleY, pixel, cuSize * cuSize);
-    CHECKED_MALLOC(m_iPCMSampleCb, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
-    CHECKED_MALLOC(m_iPCMSampleCr, pixel, cuSize * cuSize >> (m_hChromaShift + m_vChromaShift));
+    CHECKED_MALLOC(m_iPCMSampleY, pixel, sizeL + sizeC * 2);
+    m_iPCMSampleCb = m_iPCMSampleY + sizeL;
+    m_iPCMSampleCr = m_iPCMSampleY + sizeL + sizeC;
 
     memset(m_partSizes, SIZE_NONE, numPartition * sizeof(*m_partSizes));
     return ok;
@@ -168,23 +171,15 @@
     X265_FREE(m_depth);
     X265_FREE(m_cuSize);
     X265_FREE(m_cbf[0]);
-    X265_FREE(m_cbf[1]);
-    X265_FREE(m_cbf[2]);
     X265_FREE(m_interDir);
     X265_FREE(m_bMergeFlags);
     X265_FREE(m_lumaIntraDir);
     X265_FREE(m_chromaIntraDir);
     X265_FREE(m_trIdx);
     X265_FREE(m_transformSkip[0]);
-    X265_FREE(m_transformSkip[1]);
-    X265_FREE(m_transformSkip[2]);
     X265_FREE(m_trCoeffY);
-    X265_FREE(m_trCoeffCb);
-    X265_FREE(m_trCoeffCr);
     X265_FREE(m_iPCMFlags);
     X265_FREE(m_iPCMSampleY);
-    X265_FREE(m_iPCMSampleCb);
-    X265_FREE(m_iPCMSampleCr);
     X265_FREE(m_mvpIdx[0]);
     X265_FREE(m_cuTransquantBypass);
     X265_FREE(m_skipFlag);
@@ -258,8 +253,6 @@
         memset(m_iPCMFlags,          false,         numElements * sizeof(*m_iPCMFlags));
     }
 
-    uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
-    uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
     {
         m_cuMvField[0].clearMvField();
         m_cuMvField[1].clearMvField();
@@ -267,6 +260,8 @@
         // TODO: can be remove, but I haven't data to verify it, remove later
         if (getSlice()->getSPS()->getUsePCM())
         {
+            uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
+            uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
             memset(m_iPCMSampleY, 0, sizeof(pixel) * y_tmp);
             memset(m_iPCMSampleCb, 0, sizeof(pixel) * c_tmp);
             memset(m_iPCMSampleCr, 0, sizeof(pixel) * c_tmp);
@@ -598,9 +593,8 @@
     uint32_t tmp2 = partUnitIdx * tmp;
     memcpy(m_trCoeffY  + tmp2, cu->getCoeffY(),  sizeof(coeff_t) * tmp);
     memcpy(m_iPCMSampleY + tmp2, cu->getPCMSampleY(), sizeof(pixel) * tmp);
-
     tmp  >>= m_hChromaShift + m_vChromaShift;
-    tmp2 = partUnitIdx * tmp;
+    tmp2 >>= m_hChromaShift + m_vChromaShift;
     memcpy(m_trCoeffCb + tmp2, cu->getCoeffCb(), sizeof(coeff_t) * tmp);
     memcpy(m_trCoeffCr + tmp2, cu->getCoeffCr(), sizeof(coeff_t) * tmp);
     memcpy(m_iPCMSampleCb + tmp2, cu->getPCMSampleCb(), sizeof(pixel) * tmp);
@@ -684,7 +678,6 @@
     uint32_t tmp  = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
     uint32_t tmp2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
     memcpy(rpcCU->getCoeffY() + tmp2, m_trCoeffY, sizeof(coeff_t) * tmp);
-
     tmp  >>= m_hChromaShift + m_vChromaShift;
     tmp2 >>= m_hChromaShift + m_vChromaShift;
     memcpy(rpcCU->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
@@ -737,7 +730,6 @@
     uint32_t tmp2 = partOffset << m_pic->getLog2UnitSize() * 2;
     memcpy(cu->getCoeffY()  + tmp2, m_trCoeffY,  sizeof(coeff_t) * tmp);
     memcpy(cu->getPCMSampleY() + tmp2, m_iPCMSampleY, sizeof(pixel) * tmp);
-
     tmp  >>= m_hChromaShift + m_vChromaShift;
     tmp2 >>= m_hChromaShift + m_vChromaShift;
     memcpy(cu->getCoeffCb() + tmp2, m_trCoeffCb, sizeof(coeff_t) * tmp);
diff -r 8e64aa56d635 -r 237ce67bc16f source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu May 08 11:01:04 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu May 08 18:52:17 2014 +0900
@@ -82,24 +82,15 @@
         for (uint32_t i = 0; i < numLayersToAllocate; ++i)
         {
             X265_FREE(m_qtTempCoeffY[i]);
-            X265_FREE(m_qtTempCoeffCb[i]);
-            X265_FREE(m_qtTempCoeffCr[i]);
             m_qtTempShortYuv[i].destroy();
         }
     }
     X265_FREE(m_qtTempTUCoeffY);
-    X265_FREE(m_qtTempTUCoeffCb);
-    X265_FREE(m_qtTempTUCoeffCr);
     X265_FREE(m_qtTempTrIdx);
-    for (uint32_t i = 0; i < 3; ++i)
-    {
-        X265_FREE(m_qtTempCbf[i]);
-        X265_FREE(m_qtTempTransformSkipFlag[i]);
-    }
+    X265_FREE(m_qtTempCbf[0]);
+    X265_FREE(m_qtTempTransformSkipFlag[0]);
 
     delete[] m_qtTempCoeffY;
-    delete[] m_qtTempCoeffCb;
-    delete[] m_qtTempCoeffCr;
     delete[] m_qtTempShortYuv;
     m_qtTempTransformSkipYuv.destroy();
 }
@@ -119,30 +110,32 @@
     m_refLagPixels = cfg->param->frameNumThreads > 1 ? cfg->param->searchRange : cfg->param->sourceHeight;
 
     const uint32_t numLayersToAllocate = cfg->m_quadtreeTULog2MaxSize - cfg->m_quadtreeTULog2MinSize + 1;
-    m_qtTempCoeffY   = new coeff_t*[numLayersToAllocate];
-    m_qtTempCoeffCb  = new coeff_t*[numLayersToAllocate];
-    m_qtTempCoeffCr  = new coeff_t*[numLayersToAllocate];
+    m_qtTempCoeffY   = new coeff_t*[numLayersToAllocate * 3];
+    m_qtTempCoeffCb  = m_qtTempCoeffY + numLayersToAllocate;
+    m_qtTempCoeffCr  = m_qtTempCoeffY + numLayersToAllocate * 2;
     m_qtTempShortYuv = new ShortYuv[numLayersToAllocate];
+    uint32_t sizeL = g_maxCUSize * g_maxCUSize;
+    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
     for (uint32_t i = 0; i < numLayersToAllocate; ++i)
     {
-        m_qtTempCoeffY[i]  = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize);
-        m_qtTempCoeffCb[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
-        m_qtTempCoeffCr[i] = X265_MALLOC(coeff_t, g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift));
+        m_qtTempCoeffY[i]  = X265_MALLOC(coeff_t, sizeL + sizeC * 2);
+        m_qtTempCoeffCb[i] = m_qtTempCoeffY[i] + sizeL;
+        m_qtTempCoeffCr[i] = m_qtTempCoeffY[i] + sizeL + sizeC;
         m_qtTempShortYuv[i].create(MAX_CU_SIZE, MAX_CU_SIZE, cfg->param->internalCsp);
     }
 
     const uint32_t numPartitions = 1 << (g_maxCUDepth << 1);
     CHECKED_MALLOC(m_qtTempTrIdx, uint8_t, numPartitions);
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
-    CHECKED_MALLOC(m_qtTempCbf[1], uint8_t, numPartitions);
-    CHECKED_MALLOC(m_qtTempCbf[2], uint8_t, numPartitions);
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
-
-    CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
-    CHECKED_MALLOC(m_qtTempTUCoeffCb, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
-    CHECKED_MALLOC(m_qtTempTUCoeffCr, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT);
+    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
+    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
+    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
+    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
+    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
+    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+
+    CHECKED_MALLOC(m_qtTempTUCoeffY, coeff_t, MAX_TS_WIDTH * MAX_TS_HEIGHT * 3);
+    m_qtTempTUCoeffCb = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT;
+    m_qtTempTUCoeffCr = m_qtTempTUCoeffY + MAX_TS_WIDTH * MAX_TS_HEIGHT * 2;
 
     return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, cfg->param->internalCsp);
 


More information about the x265-devel mailing list