[x265] refine block size related, use more log2 domain.

Mon Jul 14 07:53:48 CEST 2014

# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1405317034 -32400
#      Mon Jul 14 14:50:34 2014 +0900
# Node ID fa683df9621ef79cacdf98d53d966b4bf90c6e88
# Parent  6055baa75085cd074c62ab7c52357cac64d10a7e
refine block size related, use more log2 domain.

diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComDataCU.cpp

--- a/source/Lib/TLibCommon/TComDataCU.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -71,7 +71,7 @@
     m_baseQp       = 0;
     m_DataCUMemPool.qpMemBlock             = NULL;
     m_DataCUMemPool.depthMemBlock          = NULL;
-    m_DataCUMemPool.cuSizeMemBlock         = NULL;
+    m_DataCUMemPool.log2CUSizeMemBlock     = NULL;
     m_DataCUMemPool.skipFlagMemBlock       = NULL;
     m_DataCUMemPool.partSizeMemBlock       = NULL;
     m_DataCUMemPool.predModeMemBlock       = NULL;
@@ -101,7 +101,7 @@
     CHECKED_MALLOC(m_DataCUMemPool.qpMemBlock, char,  numPartition * numBlocks);
 
     CHECKED_MALLOC(m_DataCUMemPool.depthMemBlock, uint8_t, numPartition * numBlocks);
-    CHECKED_MALLOC(m_DataCUMemPool.cuSizeMemBlock, uint8_t, numPartition * numBlocks);
+    CHECKED_MALLOC(m_DataCUMemPool.log2CUSizeMemBlock, uint8_t, numPartition * numBlocks);
     CHECKED_MALLOC(m_DataCUMemPool.skipFlagMemBlock, bool, numPartition * numBlocks);
     CHECKED_MALLOC(m_DataCUMemPool.partSizeMemBlock, char, numPartition * numBlocks);
     CHECKED_MALLOC(m_DataCUMemPool.predModeMemBlock, char, numPartition * numBlocks);
@@ -153,7 +153,7 @@
 
     m_qp                 = cu->m_DataCUMemPool.qpMemBlock             + index * numPartition;
     m_depth              = cu->m_DataCUMemPool.depthMemBlock          + index * numPartition;
-    m_cuSize             = cu->m_DataCUMemPool.cuSizeMemBlock         + index * numPartition;
+    m_log2CUSize         = cu->m_DataCUMemPool.log2CUSizeMemBlock     + index * numPartition;
     m_skipFlag           = cu->m_DataCUMemPool.skipFlagMemBlock       + index * numPartition;
     m_partSizes          = cu->m_DataCUMemPool.partSizeMemBlock       + index * numPartition;
     m_predModes          = cu->m_DataCUMemPool.predModeMemBlock       + index * numPartition;
@@ -204,10 +204,10 @@
         m_DataCUMemPool.depthMemBlock = NULL;
     }
 
-    if (m_DataCUMemPool.cuSizeMemBlock)
+    if (m_DataCUMemPool.log2CUSizeMemBlock)
     {
-        X265_FREE(m_DataCUMemPool.cuSizeMemBlock);
-        m_DataCUMemPool.cuSizeMemBlock = NULL;
+        X265_FREE(m_DataCUMemPool.log2CUSizeMemBlock);
+        m_DataCUMemPool.log2CUSizeMemBlock = NULL;
     }
 
     if (m_DataCUMemPool.cbfMemBlock)
@@ -318,8 +318,8 @@
     m_pic              = pic;
     m_slice            = pic->getSlice();
     m_cuAddr           = cuAddr;
-    m_cuPelX           = (cuAddr % pic->getFrameWidthInCU()) * g_maxCUSize;
-    m_cuPelY           = (cuAddr / pic->getFrameWidthInCU()) * g_maxCUSize;
+    m_cuPelX           = (cuAddr % pic->getFrameWidthInCU()) << g_maxLog2CUSize;
+    m_cuPelY           = (cuAddr / pic->getFrameWidthInCU()) << g_maxLog2CUSize;
     m_absIdxInLCU      = 0;
     m_psyEnergy        = 0;
     m_totalPsyCost     = MAX_INT64;
@@ -349,7 +349,7 @@
     memset(m_transformSkip[0],   0,             m_numPartitions * sizeof(*m_transformSkip[0]));
     memset(m_transformSkip[1],   0,             m_numPartitions * sizeof(*m_transformSkip[1]));
     memset(m_transformSkip[2],   0,             m_numPartitions * sizeof(*m_transformSkip[2]));
-    memset(m_cuSize,             g_maxCUSize,   m_numPartitions * sizeof(*m_cuSize));
+    memset(m_log2CUSize,         g_maxLog2CUSize, m_numPartitions * sizeof(*m_log2CUSize));
     memset(m_bMergeFlags,        false,         m_numPartitions * sizeof(*m_bMergeFlags));
     memset(m_lumaIntraDir,       DC_IDX,        m_numPartitions * sizeof(*m_lumaIntraDir));
     memset(m_chromaIntraDir,     0,             m_numPartitions * sizeof(*m_chromaIntraDir));
@@ -365,8 +365,8 @@
 
     if (getSlice()->getPPS()->getTransquantBypassEnableFlag())
     {
-        uint32_t y_tmp = g_maxCUSize * g_maxCUSize;
-        uint32_t c_tmp = g_maxCUSize * g_maxCUSize >> (m_hChromaShift + m_vChromaShift);
+        uint32_t y_tmp = 1 << (g_maxLog2CUSize * 2);
+        uint32_t c_tmp = 1 << (g_maxLog2CUSize * 2 - m_hChromaShift - m_vChromaShift);
         memset(m_tqBypassOrigYuv[0], 0, sizeof(pixel) * y_tmp);
         memset(m_tqBypassOrigYuv[1], 0, sizeof(pixel) * c_tmp);
         memset(m_tqBypassOrigYuv[2], 0, sizeof(pixel) * c_tmp);
@@ -399,7 +399,7 @@
 void TComDataCU::initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp)
 {
     X265_CHECK(partUnitIdx < 4, "part unit should be less than 4\n");
-    uint8_t cuSize = g_maxCUSize >> depth;
+    uint8_t log2CUSize = g_maxLog2CUSize - depth;
     uint32_t partOffset = (cu->getTotalNumPart() >> 2) * partUnitIdx;
 
     m_pic              = cu->getPic();
@@ -407,8 +407,8 @@
     m_cuAddr           = cu->getAddr();
     m_absIdxInLCU      = cu->getZorderIdxInCU() + partOffset;
 
-    m_cuPelX           = cu->getCUPelX() + cuSize * (partUnitIdx &  1);
-    m_cuPelY           = cu->getCUPelY() + cuSize * (partUnitIdx >> 1);
+    m_cuPelX           = cu->getCUPelX() + ((partUnitIdx &  1) << log2CUSize);
+    m_cuPelY           = cu->getCUPelY() + ((partUnitIdx >> 1) << log2CUSize);
 
     m_psyEnergy        = 0;
     m_totalPsyCost     = MAX_INT64;
@@ -441,7 +441,7 @@
     memset(m_cbf[1],             0,      sizeInChar);
     memset(m_cbf[2],             0,      sizeInChar);
     memset(m_depth,              depth,  sizeInChar);
-    memset(m_cuSize,             cuSize, sizeInChar);
+    memset(m_log2CUSize,         log2CUSize, sizeInChar);
     memset(m_partSizes,          SIZE_NONE, sizeInChar);
     memset(m_predModes,          MODE_NONE, sizeInChar);
     memset(m_skipFlag,           false, sizeInBool);
@@ -473,8 +473,8 @@
     m_cuAddr           = cu->getAddr();
     m_absIdxInLCU      = cu->getZorderIdxInCU() + partOffset;
 
-    m_cuPelX           = cu->getCUPelX() + (g_maxCUSize >> depth) * (partUnitIdx & 1);
-    m_cuPelY           = cu->getCUPelY() + (g_maxCUSize >> depth) * (partUnitIdx >> 1);
+    m_cuPelX           = cu->getCUPelX() + ((partUnitIdx &  1) << (g_maxLog2CUSize - depth));
+    m_cuPelY           = cu->getCUPelY() + ((partUnitIdx >> 1) << (g_maxLog2CUSize - depth));
 
     m_psyEnergy        = 0;
     m_totalPsyCost     = MAX_INT64;
@@ -497,7 +497,7 @@
 
     memcpy(m_lumaIntraDir, otherCU->getLumaIntraDir() + m_absIdxInLCU, sizeInChar);
     memcpy(m_depth, otherCU->getDepth() + m_absIdxInLCU, sizeInChar);
-    memcpy(m_cuSize, otherCU->getCUSize() + m_absIdxInLCU, sizeInChar);
+    memcpy(m_log2CUSize, otherCU->getLog2CUSize() + m_absIdxInLCU, sizeInChar);
 }
 
 // --------------------------------------------------------------------------------------------------------------------
@@ -544,7 +544,7 @@
     memcpy(m_cbf[2] + offset, cu->getCbf(TEXT_CHROMA_V), sizeInChar);
 
     memcpy(m_depth  + offset, cu->getDepth(),  sizeInChar);
-    memcpy(m_cuSize + offset, cu->getCUSize(), sizeInChar);
+    memcpy(m_log2CUSize + offset, cu->getLog2CUSize(), sizeInChar);
 
     memcpy(m_mvpIdx[0] + offset, cu->getMVPIdx(REF_PIC_LIST_0), sizeInChar);
     memcpy(m_mvpIdx[1] + offset, cu->getMVPIdx(REF_PIC_LIST_1), sizeInChar);
@@ -557,7 +557,7 @@
     m_cuMvField[0].copyFrom(cu->getCUMvField(REF_PIC_LIST_0), cu->getTotalNumPart(), offset);
     m_cuMvField[1].copyFrom(cu->getCUMvField(REF_PIC_LIST_1), cu->getTotalNumPart(), offset);
 
-    uint32_t tmp  = g_maxCUSize * g_maxCUSize >> (depth << 1);
+    uint32_t tmp  = 1 << ((g_maxLog2CUSize - depth) * 2);
     uint32_t tmp2 = partUnitIdx * tmp;
     memcpy(m_trCoeff[0] + tmp2, cu->getCoeffY(), sizeof(coeff_t) * tmp);
 
@@ -613,7 +613,7 @@
     memcpy(cu->getCbf(TEXT_CHROMA_V) + m_absIdxInLCU, m_cbf[2], sizeInChar);
 
     memcpy(cu->getDepth()  + m_absIdxInLCU, m_depth,  sizeInChar);
-    memcpy(cu->getCUSize() + m_absIdxInLCU, m_cuSize, sizeInChar);
+    memcpy(cu->getLog2CUSize() + m_absIdxInLCU, m_log2CUSize, sizeInChar);
 
     memcpy(cu->getMVPIdx(REF_PIC_LIST_0) + m_absIdxInLCU, m_mvpIdx[0], sizeInChar);
     memcpy(cu->getMVPIdx(REF_PIC_LIST_1) + m_absIdxInLCU, m_mvpIdx[1], sizeInChar);
@@ -621,7 +621,7 @@
     m_cuMvField[0].copyTo(cu->getCUMvField(REF_PIC_LIST_0), m_absIdxInLCU);
     m_cuMvField[1].copyTo(cu->getCUMvField(REF_PIC_LIST_1), m_absIdxInLCU);
 
-    uint32_t tmpY  = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
+    uint32_t tmpY  = 1 << ((g_maxLog2CUSize - depth) * 2);
     uint32_t tmpY2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
     memcpy(cu->getCoeffY() + tmpY2, m_trCoeff[0], sizeof(coeff_t) * tmpY);
 
@@ -632,7 +632,7 @@
 
     if (getSlice()->getPPS()->getTransquantBypassEnableFlag())
     {
-        uint32_t tmp  = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
+        uint32_t tmp  = 1 << ((g_maxLog2CUSize - depth) * 2);
         uint32_t tmp2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
         memcpy(cu->getLumaOrigYuv() + tmp2, m_tqBypassOrigYuv[0], sizeof(pixel) * tmp);
 
@@ -659,7 +659,7 @@
     memcpy(cu->getCbf(TEXT_CHROMA_U) + m_absIdxInLCU, m_cbf[1], sizeInChar);
     memcpy(cu->getCbf(TEXT_CHROMA_V) + m_absIdxInLCU, m_cbf[2], sizeInChar);
 
-    uint32_t tmpY  = (g_maxCUSize * g_maxCUSize) >> (depth << 1);
+    uint32_t tmpY  = 1 << ((g_maxLog2CUSize - depth) * 2);
     uint32_t tmpY2 = m_absIdxInLCU << m_pic->getLog2UnitSize() * 2;
     memcpy(cu->getCoeffY() + tmpY2, m_trCoeff[0], sizeof(coeff_t) * tmpY);
     tmpY  >>= m_hChromaShift + m_vChromaShift;
@@ -705,14 +705,14 @@
     memcpy(cu->getCbf(TEXT_CHROMA_V) + partOffset, m_cbf[2], sizeInChar);
 
     memcpy(cu->getDepth()  + partOffset, m_depth,  sizeInChar);
-    memcpy(cu->getCUSize() + partOffset, m_cuSize, sizeInChar);
+    memcpy(cu->getLog2CUSize() + partOffset, m_log2CUSize, sizeInChar);
 
     memcpy(cu->getMVPIdx(REF_PIC_LIST_0) + partOffset, m_mvpIdx[0], sizeInChar);
     memcpy(cu->getMVPIdx(REF_PIC_LIST_1) + partOffset, m_mvpIdx[1], sizeInChar);
     m_cuMvField[0].copyTo(cu->getCUMvField(REF_PIC_LIST_0), m_absIdxInLCU, partStart, qNumPart);
     m_cuMvField[1].copyTo(cu->getCUMvField(REF_PIC_LIST_1), m_absIdxInLCU, partStart, qNumPart);
 
-    uint32_t tmpY  = (g_maxCUSize * g_maxCUSize) >> ((depth + partDepth) << 1);
+    uint32_t tmpY  = 1 << ((g_maxLog2CUSize - depth - partDepth) * 2);
     uint32_t tmpY2 = partOffset << m_pic->getLog2UnitSize() * 2;
     memcpy(cu->getCoeffY() + tmpY2, m_trCoeff[0],  sizeof(coeff_t) * tmpY);
 
@@ -737,11 +737,11 @@
 TComDataCU* TComDataCU::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx)
 {
     uint32_t absPartIdx       = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdx   = g_zscanToRaster[m_absIdxInLCU];
     uint32_t numPartInCUSize  = m_pic->getNumPartInCUSize();
 
     if (!RasterAddress::isZeroCol(absPartIdx, numPartInCUSize))
     {
+        uint32_t absZorderCUIdx   = g_zscanToRaster[m_absIdxInLCU];
         lPartUnitIdx = g_rasterToZscan[absPartIdx - 1];
         if (RasterAddress::isEqualCol(absPartIdx, absZorderCUIdx, numPartInCUSize))
         {
@@ -761,11 +761,11 @@
 TComDataCU* TComDataCU::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtLCUBoundary)
 {
     uint32_t absPartIdx       = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdx   = g_zscanToRaster[m_absIdxInLCU];
     uint32_t numPartInCUSize  = m_pic->getNumPartInCUSize();
 
     if (!RasterAddress::isZeroRow(absPartIdx, numPartInCUSize))
     {
+        uint32_t absZorderCUIdx   = g_zscanToRaster[m_absIdxInLCU];
         aPartUnitIdx = g_rasterToZscan[absPartIdx - numPartInCUSize];
         if (RasterAddress::isEqualRow(absPartIdx, absZorderCUIdx, numPartInCUSize))
         {
@@ -788,13 +788,13 @@
 TComDataCU* TComDataCU::getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx)
 {
     uint32_t absPartIdx      = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInLCU];
     uint32_t numPartInCUSize = m_pic->getNumPartInCUSize();
 
     if (!RasterAddress::isZeroCol(absPartIdx, numPartInCUSize))
     {
         if (!RasterAddress::isZeroRow(absPartIdx, numPartInCUSize))
         {
+            uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInLCU];
             alPartUnitIdx = g_rasterToZscan[absPartIdx - numPartInCUSize - 1];
             if (RasterAddress::isEqualRowOrCol(absPartIdx, absZorderCUIdx, numPartInCUSize))
             {
@@ -823,7 +823,6 @@
 TComDataCU* TComDataCU::getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx)
 {
     uint32_t absPartIdxRT    = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInLCU] + (m_cuSize[0] >> m_pic->getLog2UnitSize()) - 1;
     uint32_t numPartInCUSize = m_pic->getNumPartInCUSize();
 
     if ((m_pic->getCU(m_cuAddr)->getCUPelX() + g_rasterToPelX[absPartIdxRT] + m_pic->getUnitSize()) >= m_slice->getSPS()->getPicWidthInLumaSamples())
@@ -837,6 +836,7 @@
         {
             if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - numPartInCUSize + 1])
             {
+                uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInLCU] + (1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize())) - 1;
                 arPartUnitIdx = g_rasterToZscan[absPartIdxRT - numPartInCUSize + 1];
                 if (RasterAddress::isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, numPartInCUSize))
                 {
@@ -866,20 +866,21 @@
 TComDataCU* TComDataCU::getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx)
 {
     uint32_t absPartIdxLB     = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInLCU] + ((m_cuSize[0] >> m_pic->getLog2UnitSize()) - 1) * m_pic->getNumPartInCUSize();
-    uint32_t numPartInCUSize  = m_pic->getNumPartInCUSize();
 
     if ((m_pic->getCU(m_cuAddr)->getCUPelY() + g_rasterToPelY[absPartIdxLB] + m_pic->getUnitSize()) >= m_slice->getSPS()->getPicHeightInLumaSamples())
     {
         return NULL;
     }
 
+    uint32_t numPartInCUSize  = m_pic->getNumPartInCUSize();
+
     if (RasterAddress::lessThanRow(absPartIdxLB, numPartInCUSize - 1, numPartInCUSize))
     {
         if (!RasterAddress::isZeroCol(absPartIdxLB, numPartInCUSize))
         {
             if (curPartUnitIdx > g_rasterToZscan[absPartIdxLB + numPartInCUSize - 1])
             {
+                uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInLCU] + ((1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize())) - 1) * m_pic->getNumPartInCUSize();
                 blPartUnitIdx = g_rasterToZscan[absPartIdxLB + numPartInCUSize - 1];
                 if (RasterAddress::isEqualRowOrCol(absPartIdxLB, absZorderCUIdxLB, numPartInCUSize))
                 {
@@ -903,21 +904,22 @@
 TComDataCU* TComDataCU::getPUBelowLeftAdi(uint32_t& blPartUnitIdx,  uint32_t curPartUnitIdx, uint32_t partUnitOffset)
 {
     uint32_t absPartIdxLB     = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInLCU] + ((m_cuSize[0] >> m_pic->getLog2UnitSize()) - 1) * m_pic->getNumPartInCUSize();
-    uint32_t numPartInCUSize  = m_pic->getNumPartInCUSize();
 
-    if ((m_pic->getCU(m_cuAddr)->getCUPelY() + g_rasterToPelY[absPartIdxLB] + (partUnitOffset << m_pic->getPicSym()->getLog2UnitSize())) >=
+    if ((m_pic->getCU(m_cuAddr)->getCUPelY() + g_rasterToPelY[absPartIdxLB] + (partUnitOffset << m_pic->getLog2UnitSize())) >=
         m_slice->getSPS()->getPicHeightInLumaSamples())
     {
         return NULL;
     }
 
+    uint32_t numPartInCUSize  = m_pic->getNumPartInCUSize();
+
     if (RasterAddress::lessThanRow(absPartIdxLB, numPartInCUSize - partUnitOffset, numPartInCUSize))
     {
         if (!RasterAddress::isZeroCol(absPartIdxLB, numPartInCUSize))
         {
             if (curPartUnitIdx > g_rasterToZscan[absPartIdxLB + partUnitOffset * numPartInCUSize - 1])
             {
+                uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInLCU] + ((1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize())) - 1) * m_pic->getNumPartInCUSize();
                 blPartUnitIdx = g_rasterToZscan[absPartIdxLB + partUnitOffset * numPartInCUSize - 1];
                 if (RasterAddress::isEqualRowOrCol(absPartIdxLB, absZorderCUIdxLB, numPartInCUSize))
                 {
@@ -945,21 +947,22 @@
 TComDataCU* TComDataCU::getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset)
 {
     uint32_t absPartIdxRT    = g_zscanToRaster[curPartUnitIdx];
-    uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInLCU] + (m_cuSize[0] >> m_pic->getLog2UnitSize()) - 1;
-    uint32_t numPartInCUSize = m_pic->getNumPartInCUSize();
 
-    if ((m_pic->getCU(m_cuAddr)->getCUPelX() + g_rasterToPelX[absPartIdxRT] + (partUnitOffset << m_pic->getPicSym()->getLog2UnitSize())) >=
+    if ((m_pic->getCU(m_cuAddr)->getCUPelX() + g_rasterToPelX[absPartIdxRT] + (partUnitOffset << m_pic->getLog2UnitSize())) >=
         m_slice->getSPS()->getPicWidthInLumaSamples())
     {
         return NULL;
     }
 
+    uint32_t numPartInCUSize = m_pic->getNumPartInCUSize();
+
     if (RasterAddress::lessThanCol(absPartIdxRT, numPartInCUSize - partUnitOffset, numPartInCUSize))
     {
         if (!RasterAddress::isZeroRow(absPartIdxRT, numPartInCUSize))
         {
             if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - numPartInCUSize + partUnitOffset])
             {
+                uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInLCU] + (1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize())) - 1;
                 arPartUnitIdx = g_rasterToZscan[absPartIdxRT - numPartInCUSize + partUnitOffset];
                 if (RasterAddress::isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, numPartInCUSize))
                 {
@@ -1207,7 +1210,7 @@
 
 uint32_t TComDataCU::getQuadtreeTULog2MinSizeInCU(uint32_t absPartIdx)
 {
-    uint32_t log2CUSize = g_convertToBit[getCUSize(absPartIdx)] + 2;
+    uint32_t log2CUSize = getLog2CUSize(absPartIdx);
     PartSize partSize   = getPartitionSize(absPartIdx);
     uint32_t quadtreeTUMaxDepth = getPredictionMode(absPartIdx) == MODE_INTRA ? m_slice->getSPS()->getQuadtreeTUMaxDepthIntra() : m_slice->getSPS()->getQuadtreeTUMaxDepthInter();
     int intraSplitFlag = (getPredictionMode(absPartIdx) == MODE_INTRA && partSize == SIZE_NxN) ? 1 : 0;
@@ -1528,7 +1531,7 @@
 
 void TComDataCU::getPartIndexAndSize(uint32_t partIdx, uint32_t& outPartAddr, int& outWidth, int& outHeight)
 {
-    int cuSize = getCUSize(0);
+    int cuSize = 1 << getLog2CUSize(0);
 
     switch (m_partSizes[0])
     {
@@ -1592,7 +1595,7 @@
 void TComDataCU::deriveLeftRightTopIdxGeneral(uint32_t absPartIdx, uint32_t partIdx, uint32_t& outPartIdxLT, uint32_t& outPartIdxRT)
 {
     outPartIdxLT = m_absIdxInLCU + absPartIdx;
-    uint32_t cuSize = m_cuSize[absPartIdx];
+    uint32_t cuSize = 1 << m_log2CUSize[absPartIdx];
     uint32_t puWidth = 0;
 
     switch (m_partSizes[absPartIdx])
@@ -1647,7 +1650,7 @@
 
 void TComDataCU::deriveLeftBottomIdxGeneral(uint32_t absPartIdx, uint32_t partIdx, uint32_t& outPartIdxLB)
 {
-    uint32_t cuSize = m_cuSize[absPartIdx];
+    uint32_t cuSize = 1 << m_log2CUSize[absPartIdx];
     uint32_t puHeight = 0;
 
     switch (m_partSizes[absPartIdx])
@@ -1703,7 +1706,7 @@
 void TComDataCU::deriveLeftRightTopIdx(uint32_t partIdx, uint32_t& ruiPartIdxLT, uint32_t& ruiPartIdxRT)
 {
     ruiPartIdxLT = m_absIdxInLCU;
-    ruiPartIdxRT = g_rasterToZscan[g_zscanToRaster[ruiPartIdxLT] + (m_cuSize[0] >> m_pic->getLog2UnitSize()) - 1];
+    ruiPartIdxRT = g_rasterToZscan[g_zscanToRaster[ruiPartIdxLT] + (1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize())) - 1];
 
     switch (m_partSizes[0])
     {
@@ -1744,7 +1747,7 @@
 
 void TComDataCU::deriveLeftBottomIdx(uint32_t partIdx, uint32_t& outPartIdxLB)
 {
-    outPartIdxLB = g_rasterToZscan[g_zscanToRaster[m_absIdxInLCU] + (((m_cuSize[0] >> m_pic->getLog2UnitSize()) >> 1) - 1) * m_pic->getNumPartInCUSize()];
+    outPartIdxLB = g_rasterToZscan[g_zscanToRaster[m_absIdxInLCU] + ((1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize() - 1)) - 1) * m_pic->getNumPartInCUSize()];
 
     switch (m_partSizes[0])
     {
@@ -1785,8 +1788,9 @@
  */
 void TComDataCU::deriveRightBottomIdx(uint32_t partIdx, uint32_t& outPartIdxRB)
 {
-    outPartIdxRB = g_rasterToZscan[g_zscanToRaster[m_absIdxInLCU] + (((m_cuSize[0] >> m_pic->getLog2UnitSize()) >> 1) - 1) *
-                                   m_pic->getNumPartInCUSize() +  (m_cuSize[0] >> m_pic->getLog2UnitSize()) - 1];
+    outPartIdxRB = g_rasterToZscan[g_zscanToRaster[m_absIdxInLCU] +
+                                   ((1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize() - 1)) - 1) * m_pic->getNumPartInCUSize() +
+                                   (1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize())) - 1];
 
     switch (m_partSizes[0])
     {
@@ -1822,7 +1826,7 @@
 
 void TComDataCU::deriveLeftRightTopIdxAdi(uint32_t& outPartIdxLT, uint32_t& outPartIdxRT, uint32_t partOffset, uint32_t partDepth)
 {
-    uint32_t numPartInWidth = m_cuSize[0] >> (m_pic->getLog2UnitSize() + partDepth);
+    uint32_t numPartInWidth = 1 << (m_log2CUSize[0] - m_pic->getLog2UnitSize() - partDepth);
 
     outPartIdxLT = m_absIdxInLCU + partOffset;
     outPartIdxRT = g_rasterToZscan[g_zscanToRaster[outPartIdxLT] + numPartInWidth - 1];
@@ -2196,7 +2200,7 @@
 {
     uint32_t col = m_cuPelX;
     uint32_t row = m_cuPelY;
-    uint32_t cuSize = getCUSize(0);
+    uint32_t cuSize = 1 << getLog2CUSize(0);
 
     switch (m_partSizes[0])
     {
@@ -2425,7 +2429,7 @@
 
 bool TComDataCU::isBipredRestriction()
 {
-    return getCUSize(0) == 8 && getPartitionSize(0) != SIZE_2Nx2N;
+    return getLog2CUSize(0) == 3 && getPartitionSize(0) != SIZE_2Nx2N;
 }
 
 void TComDataCU::clipMv(MV& outMV)
@@ -2744,8 +2748,8 @@
 
     outPartIdxCenter = m_absIdxInLCU + partAddr; // partition origin.
     outPartIdxCenter = g_rasterToZscan[g_zscanToRaster[outPartIdxCenter]
-                                       + (partHeight >> m_pic->getLog2UnitSize()) / 2 * m_pic->getNumPartInCUSize()
-                                       + (partWidth >> m_pic->getLog2UnitSize()) / 2];
+                                       + (partHeight >> (m_pic->getLog2UnitSize() + 1)) * m_pic->getNumPartInCUSize()
+                                       + (partWidth  >> (m_pic->getLog2UnitSize() + 1))];
 }
 
 uint32_t TComDataCU::getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra)
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.h	Mon Jul 14 14:50:34 2014 +0900
@@ -76,7 +76,7 @@
 {
     char*    qpMemBlock;
     uint8_t* depthMemBlock;
-    uint8_t* cuSizeMemBlock;
+    uint8_t* log2CUSizeMemBlock;
     bool*    skipFlagMemBlock;
     char*    partSizeMemBlock;
     char*    predModeMemBlock;
@@ -119,7 +119,7 @@
     uint32_t      m_cuPelX;          ///< CU position in a pixel (X)
     uint32_t      m_cuPelY;          ///< CU position in a pixel (Y)
     uint32_t      m_numPartitions;   ///< total number of minimum partitions in a CU
-    uint8_t*      m_cuSize;          ///< array of cu width/height
+    uint8_t*      m_log2CUSize;      ///< array of log2 of cu width/height
     uint8_t*      m_depth;           ///< array of depths
     int           m_chromaFormat;
     int           m_hChromaShift;
@@ -271,9 +271,9 @@
 
     void          setPredModeSubParts(PredMode eMode, uint32_t absPartIdx, uint32_t depth);
 
-    uint8_t*      getCUSize()                     { return m_cuSize; }
+    uint8_t*      getLog2CUSize()                     { return m_log2CUSize; }
 
-    uint8_t       getCUSize(uint32_t idx)         { return m_cuSize[idx]; }
+    uint8_t       getLog2CUSize(uint32_t idx) const   { return m_log2CUSize[idx]; }
 
     char*         getQP()                         { return m_qp; }
 
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComLoopFilter.cpp
--- a/source/Lib/TLibCommon/TComLoopFilter.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComLoopFilter.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -139,9 +139,9 @@
     for (uint32_t partIdx = absZOrderIdx; partIdx < absZOrderIdx + curNumParts; partIdx++)
     {
         uint32_t bsCheck;
-        if ((g_maxCUSize >> g_maxCUDepth) == 4)
+        if (g_log2UnitSize == 2)
         {
-            bsCheck = (dir == EDGE_VER && partIdx % 2 == 0) || (dir == EDGE_HOR && (partIdx - ((partIdx >> 2) << 2)) / 2 == 0);
+            bsCheck = (dir == EDGE_VER && (partIdx & 1) == 0) || (dir == EDGE_HOR && (partIdx & 2) == 0);
         }
         else
         {
@@ -154,16 +154,16 @@
         }
     }
 
-    uint32_t pelsInPart = g_maxCUSize >> g_maxCUDepth;
-    uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK / pelsInPart ? DEBLOCK_SMALLEST_BLOCK / pelsInPart : 1;
+    uint32_t log2UnitSize = g_log2UnitSize;
+    uint32_t partIdxIncr = (DEBLOCK_SMALLEST_BLOCK >> log2UnitSize) ? (DEBLOCK_SMALLEST_BLOCK >> log2UnitSize) : 1;
 
     uint32_t sizeInPU = pic->getNumPartInCUSize() >> (depth);
     uint32_t shiftFactor = (dir == EDGE_VER) ? cu->getHorzChromaShift() : cu->getVertChromaShift();
-    const bool bAlwaysDoChroma = (cu->getChromaFormat() == CHROMA_444);
+    const bool bAlwaysDoChroma = (cu->getChromaFormat() == CHROMA_444 || (1 << log2UnitSize) > DEBLOCK_SMALLEST_BLOCK);
     for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
     {
         xEdgeFilterLuma(cu, absZOrderIdx, depth, dir, e, blockingStrength);
-        if (bAlwaysDoChroma || (pelsInPart > DEBLOCK_SMALLEST_BLOCK) || (e % ((DEBLOCK_SMALLEST_BLOCK << shiftFactor) / pelsInPart)) == 0)
+        if (bAlwaysDoChroma || (e % ((DEBLOCK_SMALLEST_BLOCK << shiftFactor) >> log2UnitSize)) == 0)
         {
             xEdgeFilterChroma(cu, absZOrderIdx, depth, dir, e, blockingStrength);
         }
@@ -178,7 +178,6 @@
     }
     const uint32_t numElem = widthInBaseUnits;
     X265_CHECK(numElem > 0, "numElem edge filter check\n");
-    X265_CHECK(widthInBaseUnits > 0, "widthInBaseUnits edge filter check\n");
     for (uint32_t i = 0; i < numElem; i++)
     {
         const uint32_t bsidx = xCalcBsIdx(cu, scanIdx, dir, edgeIdx, i);
@@ -205,9 +204,7 @@
         return;
     }
 
-    int trWidth  = cu->getCUSize(absZOrderIdx) >> cu->getTransformIdx(absZOrderIdx);
-
-    uint32_t widthInBaseUnits  = trWidth / (g_maxCUSize >> g_maxCUDepth);
+    uint32_t widthInBaseUnits  = 1 << (cu->getLog2CUSize(absZOrderIdx) - cu->getTransformIdx(absZOrderIdx) - g_log2UnitSize);
 
     xSetEdgefilterMultiple(cu, absTUPartIdx, depth, dir, 0, true, edgeFilter, blockingStrength, widthInBaseUnits);
 }
@@ -451,7 +448,8 @@
     int qpQ = 0;
     uint32_t numParts = cu->getPic()->getNumPartInCUSize() >> depth;
 
-    uint32_t pelsInPart = g_maxCUSize >> g_maxCUDepth;
+    uint32_t log2UnitSize = g_log2UnitSize;
+    uint32_t blocksInPart = (log2UnitSize - 2) > 0 ? 1 << (log2UnitSize - 2) : 1;
     uint32_t bsAbsIdx = 0, bs = 0;
     int  offset, srcStep;
 
@@ -468,17 +466,18 @@
     {
         offset = 1;
         srcStep = stride;
-        tmpsrc += edge * pelsInPart;
+        tmpsrc += (edge << log2UnitSize);
     }
     else // (dir == EDGE_HOR)
     {
         offset = stride;
         srcStep = 1;
-        tmpsrc += edge * pelsInPart * stride;
+        tmpsrc += (edge << log2UnitSize) * stride;
     }
 
     for (uint32_t idx = 0; idx < numParts; idx++)
     {
+        uint32_t partOffset = idx << log2UnitSize;
         bsAbsIdx = xCalcBsIdx(cu, absZOrderIdx, dir, edge, idx);
         bs = blockingStrength[bsAbsIdx];
         if (bs)
@@ -507,13 +506,12 @@
             int sideThreshold = (beta + (beta >> 1)) >> 3;
             int thrCut = tc * 10;
 
-            uint32_t blocksInPart = pelsInPart / 4 ? pelsInPart / 4 : 1;
             for (uint32_t blkIdx = 0; blkIdx < blocksInPart; blkIdx++)
             {
-                int dp0 = xCalcDP(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 0), offset);
-                int dq0 = xCalcDQ(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 0), offset);
-                int dp3 = xCalcDP(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 3), offset);
-                int dq3 = xCalcDQ(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 3), offset);
+                int dp0 = xCalcDP(tmpsrc + srcStep * (partOffset + blkIdx * 4 + 0), offset);
+                int dq0 = xCalcDQ(tmpsrc + srcStep * (partOffset + blkIdx * 4 + 0), offset);
+                int dp3 = xCalcDP(tmpsrc + srcStep * (partOffset + blkIdx * 4 + 3), offset);
+                int dq3 = xCalcDQ(tmpsrc + srcStep * (partOffset + blkIdx * 4 + 3), offset);
                 int d0 = dp0 + dq0;
                 int d3 = dp3 + dq3;
 
@@ -533,12 +531,12 @@
                     bool bFilterP = (dp < sideThreshold);
                     bool bFilterQ = (dq < sideThreshold);
 
-                    bool sw =  xUseStrongFiltering(offset, 2 * d0, beta, tc, tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 0))
-                        && xUseStrongFiltering(offset, 2 * d3, beta, tc, tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 3));
+                    bool sw =  xUseStrongFiltering(offset, 2 * d0, beta, tc, tmpsrc + srcStep * (partOffset + blkIdx * 4 + 0))
+                        && xUseStrongFiltering(offset, 2 * d3, beta, tc, tmpsrc + srcStep * (partOffset + blkIdx * 4 + 3));
 
                     for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
                     {
-                        xPelFilterLuma(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
+                        xPelFilterLuma(tmpsrc + srcStep * (partOffset + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
                     }
                 }
             }
@@ -555,8 +553,10 @@
     int qp = 0;
     int qpP = 0;
     int qpQ = 0;
-    uint32_t  pelsInPartChromaH = g_maxCUSize >> (g_maxCUDepth + cu->getHorzChromaShift());
-    uint32_t  pelsInPartChromaV = g_maxCUSize >> (g_maxCUDepth + cu->getVertChromaShift());
+    uint32_t log2UnitSizeH = g_log2UnitSize - cu->getHorzChromaShift();
+    uint32_t log2UnitSizeV = g_log2UnitSize - cu->getVertChromaShift();
+    uint32_t unitSizeChromaH = 1 << log2UnitSizeH;
+    uint32_t unitSizeChromaV = 1 << log2UnitSizeV;
     int   offset, srcStep;
 
     const uint32_t lcuWidthInBaseUnits = cu->getPic()->getNumPartInCUSize();
@@ -573,9 +573,9 @@
     uint32_t edgeNumInLCUVert = g_zscanToRaster[absZOrderIdx] % lcuWidthInBaseUnits + edge;
     uint32_t edgeNumInLCUHor = g_zscanToRaster[absZOrderIdx] / lcuWidthInBaseUnits + edge;
 
-    if ((pelsInPartChromaH < DEBLOCK_SMALLEST_BLOCK) && (pelsInPartChromaV < DEBLOCK_SMALLEST_BLOCK) &&
-        (((edgeNumInLCUVert % (DEBLOCK_SMALLEST_BLOCK / pelsInPartChromaH)) && (dir == 0)) ||
-         ((edgeNumInLCUHor % (DEBLOCK_SMALLEST_BLOCK / pelsInPartChromaV)) && dir)))
+    if ((unitSizeChromaH < DEBLOCK_SMALLEST_BLOCK) && (unitSizeChromaV < DEBLOCK_SMALLEST_BLOCK) &&
+        (((edgeNumInLCUVert % (DEBLOCK_SMALLEST_BLOCK >> log2UnitSizeH)) && (dir == 0)) ||
+         ((edgeNumInLCUHor % (DEBLOCK_SMALLEST_BLOCK >> log2UnitSizeV)) && dir)))
     {
         return;
     }
@@ -593,17 +593,17 @@
     {
         offset     = 1;
         srcStep    = stride;
-        tmpSrcCb   += edge * pelsInPartChromaH;
-        tmpSrcCr   += edge * pelsInPartChromaH;
-        loopLength = pelsInPartChromaV;
+        tmpSrcCb   += (edge << log2UnitSizeH);
+        tmpSrcCr   += (edge << log2UnitSizeH);
+        loopLength = unitSizeChromaV;
     }
     else // (dir == EDGE_HOR)
     {
         offset     = stride;
         srcStep    = 1;
-        tmpSrcCb   += edge * stride * pelsInPartChromaV;
-        tmpSrcCr   += edge * stride * pelsInPartChromaV;
-        loopLength = pelsInPartChromaH;
+        tmpSrcCb   += edge * stride << log2UnitSizeV;
+        tmpSrcCr   += edge * stride << log2UnitSizeV;
+        loopLength = unitSizeChromaH;
     }
 
     for (uint32_t idx = 0; idx < numParts; idx++)
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComPattern.cpp
--- a/source/Lib/TLibCommon/TComPattern.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComPattern.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -68,7 +68,7 @@
 
     fillReferenceSamples(roiOrigin, picStride, adiTemp, intraNeighbors);
 
-    bool bUseFilteredPredictions = (dirMode == ALL_IDX || TComPrediction::filteringIntraReferenceSamples(dirMode, tuSize));
+    bool bUseFilteredPredictions = (dirMode == ALL_IDX || TComPrediction::filteringIntraReferenceSamples(dirMode, intraNeighbors.log2TrSize));
 
     if (bUseFilteredPredictions && 8 <= tuSize && tuSize <= 32)
     {
@@ -104,7 +104,7 @@
 
             if (bilinearLeft && bilinearAbove)
             {
-                int shift = g_convertToBit[tuSize] + 3; // log2(tuSize2)
+                int shift = intraNeighbors.log2TrSize + 1;
                 filterBufN[0] = filterBuf[0];
                 filterBufN[tuSize2] = filterBuf[tuSize2];
                 filterBufN[bufSize - 1] = filterBuf[bufSize - 1];
@@ -183,16 +183,15 @@
 
 void TComPattern::initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, TextType cType, IntraNeighbors *intraNeighbors)
 {
-    uint32_t tuSize  = cu->getCUSize(0) >> partDepth;
-    int baseUnitSize = g_maxCUSize >> g_maxCUDepth;
-    int unitWidth    = baseUnitSize;
-    int unitHeight   = baseUnitSize;
+    uint32_t log2TrSize = cu->getLog2CUSize(0) - partDepth;
+    int log2UnitWidth  = g_log2UnitSize;
+    int log2UnitHeight = g_log2UnitSize;
 
     if (cType != TEXT_LUMA)
     {
-        tuSize     >>= cu->getHorzChromaShift();
-        unitWidth  >>= cu->getHorzChromaShift();
-        unitHeight >>= cu->getVertChromaShift();
+        log2TrSize     -= cu->getHorzChromaShift();
+        log2UnitWidth  -= cu->getHorzChromaShift();
+        log2UnitHeight -= cu->getVertChromaShift();
     }
 
     int   numIntraNeighbor = 0;
@@ -202,11 +201,12 @@
 
     cu->deriveLeftRightTopIdxAdi(partIdxLT, partIdxRT, zOrderIdxInPart, partDepth);
 
-    int  partIdxStride   = cu->getPic()->getNumPartInCUSize();
-    int  tuHeightInUnits = tuSize / unitHeight;
-    int  tuWidthInUnits  = tuSize / unitWidth;
+    uint32_t tuSize  = 1 << log2TrSize;
+    int  tuWidthInUnits  = tuSize >> log2UnitWidth;
+    int  tuHeightInUnits = tuSize >> log2UnitHeight;
     int  aboveUnits      = tuWidthInUnits << 1;
     int  leftUnits       = tuHeightInUnits << 1;
+    int  partIdxStride   = cu->getPic()->getNumPartInCUSize();
     partIdxLB            = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
 
     if (!cu->getSlice()->getPPS()->getConstrainedIntraPred())
@@ -231,9 +231,10 @@
     intraNeighbors->totalUnits       = aboveUnits + leftUnits + 1;
     intraNeighbors->aboveUnits       = aboveUnits;
     intraNeighbors->leftUnits        = leftUnits;
+    intraNeighbors->unitWidth        = 1 << log2UnitWidth;
+    intraNeighbors->unitHeight       = 1 << log2UnitHeight;
     intraNeighbors->tuSize           = tuSize;
-    intraNeighbors->unitWidth        = unitWidth;
-    intraNeighbors->unitHeight       = unitHeight;
+    intraNeighbors->log2TrSize       = log2TrSize;
 }
 
 void TComPattern::fillReferenceSamples(pixel* roiOrigin, int picStride, pixel* adiTemp, const IntraNeighbors& intraNeighbors)
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComPattern.h
--- a/source/Lib/TLibCommon/TComPattern.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComPattern.h	Mon Jul 14 14:50:34 2014 +0900
@@ -59,9 +59,10 @@
     int  totalUnits;
     int  aboveUnits;
     int  leftUnits;
-    int  tuSize;
     int  unitWidth;
     int  unitHeight;
+    int  tuSize;
+    uint32_t log2TrSize;
     bool bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
 };
 
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComPicSym.cpp
--- a/source/Lib/TLibCommon/TComPicSym.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComPicSym.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -66,13 +66,13 @@
     m_saoParam        = NULL;
     m_numPartitions   = 1 << (g_maxCUDepth << 1);
 
-    m_unitSize        = g_maxCUSize >> g_maxCUDepth;
-    m_log2UnitSize    = g_convertToBit[m_unitSize] + 2;
+    m_log2UnitSize    = g_log2UnitSize;
+    m_unitSize        = 1 << m_log2UnitSize;
 
     m_numPartInCUSize = g_maxCUSize >> m_log2UnitSize;
 
-    m_widthInCU       = (param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
-    m_heightInCU      = (param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
+    m_widthInCU       = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
+    m_heightInCU      = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
 
     m_numCUsInFrame   = m_widthInCU * m_heightInCU;
 
@@ -84,7 +84,7 @@
     bool tqBypass = param->bCULossless || param->bLossless;
     for (i = 0; i < m_numCUsInFrame; i++)
     {
-        uint32_t sizeL = g_maxCUSize * g_maxCUSize;
+        uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
         uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(param->internalCsp) + CHROMA_V_SHIFT(param->internalCsp));
         if (!m_cuData[i].initialize(m_numPartitions, sizeL, sizeC, 1, tqBypass))
             return false;
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -113,29 +113,28 @@
 // Public member functions
 // ====================================================================================================================
 
-bool TComPrediction::filteringIntraReferenceSamples(uint32_t dirMode, uint32_t tuSize)
+bool TComPrediction::filteringIntraReferenceSamples(uint32_t dirMode, uint32_t log2TrSize)
 {
     bool bFilter;
 
-    if (dirMode == DC_IDX || tuSize <= 4)
+    if (dirMode == DC_IDX || log2TrSize <= 2)
     {
         bFilter = false; // no smoothing for DC
     }
     else
     {
         int diff = std::min<int>(abs((int)dirMode - HOR_IDX), abs((int)dirMode - VER_IDX));
-        uint32_t sizeIdx = g_convertToBit[tuSize];
+        uint32_t sizeIdx = log2TrSize - 2;
         bFilter = diff > intraFilterThreshold[sizeIdx];
     }
 
     return bFilter;
 }
 
-void TComPrediction::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, int tuSize)
+void TComPrediction::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize)
 {
-    X265_CHECK(tuSize >= 4 && tuSize <= 64, "intra block size is out of range\n");
-    int sizeIdx = g_convertToBit[tuSize];
-    bool bUseFilteredPredictions = TComPrediction::filteringIntraReferenceSamples(dirMode, tuSize);
+    int tuSize = 1 << log2TrSize;
+    bool bUseFilteredPredictions = filteringIntraReferenceSamples(dirMode, log2TrSize);
 
     pixel *refLft, *refAbv;
     refLft = m_refLeft + tuSize - 1;
@@ -147,31 +146,23 @@
         refAbv = m_refAboveFlt + tuSize - 1;
     }
 
-    bool bFilter = tuSize <= 16 && dirMode != PLANAR_IDX;
+    bool bFilter = log2TrSize <= 4 && dirMode != PLANAR_IDX;
+    int sizeIdx = log2TrSize - 2;
+    X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
     primitives.intra_pred[sizeIdx][dirMode](dst, stride, refLft, refAbv, dirMode, bFilter);
 }
 
 // Angular chroma
-void TComPrediction::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, int tuSize, int chFmt)
+void TComPrediction::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
 {
-    int sizeIdx = g_convertToBit[tuSize];
+    int tuSize = 1 << log2TrSizeC;
     uint32_t tuSize2 = tuSize << 1;
 
     // Create the prediction
     pixel refAbv[3 * MAX_CU_SIZE];
     pixel refLft[3 * MAX_CU_SIZE];
 
-    bool bUseFilteredPredictions = true;
-
-    if (chFmt != CHROMA_444)
-    {
-        bUseFilteredPredictions = false;
-    }
-    else
-    {
-        X265_CHECK(tuSize >= 4 && tuSize < 128, "intra prediction size is out of range\n");
-        bUseFilteredPredictions = TComPrediction::filteringIntraReferenceSamples(dirMode, tuSize);
-    }
+    bool bUseFilteredPredictions = (chFmt == CHROMA_444 && filteringIntraReferenceSamples(dirMode, log2TrSizeC));
 
     if (bUseFilteredPredictions)
     {
@@ -222,6 +213,8 @@
         }
     }
 
+    int sizeIdx = log2TrSizeC - 2;
+    X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
     primitives.intra_pred[sizeIdx][dirMode](dst, stride, refLft + tuSize - 1, refAbv + tuSize - 1, dirMode, 0);
 }
 
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComPrediction.h
--- a/source/Lib/TLibCommon/TComPrediction.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.h	Mon Jul 14 14:50:34 2014 +0900
@@ -103,9 +103,9 @@
     void motionCompensation(TComDataCU* cu, TComYuv* predYuv, int picList = REF_PIC_LIST_X, int partIdx = -1, bool bLuma = true, bool bChroma = true);
 
     // Angular Intra
-    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, int tuSize);
-    void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, int tuSize, int chFmt);
-    static bool filteringIntraReferenceSamples(uint32_t dirMode, uint32_t tuSize);
+    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
+    void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
+    static bool filteringIntraReferenceSamples(uint32_t dirMode, uint32_t log2TrSize);
 };
 }
 //! \}
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -43,7 +43,7 @@
 //! \ingroup TLibCommon
 //! \{
 // scanning order table
-uint16_t* g_scanOrder[SCAN_NUMBER_OF_GROUP_TYPES][SCAN_NUMBER_OF_TYPES][MAX_CU_DEPTH];
+uint16_t* g_scanOrder[SCAN_NUMBER_OF_GROUP_TYPES][SCAN_NUMBER_OF_TYPES][MAX_LOG2_TR_SIZE + 1];
 
 class ScanGenerator
 {
@@ -192,7 +192,7 @@
     }
 
     // initialise scan orders
-    for (uint32_t log2BlockSize = 0; log2BlockSize < MAX_CU_DEPTH; log2BlockSize++)
+    for (uint32_t log2BlockSize = 0; log2BlockSize <= MAX_LOG2_TR_SIZE; log2BlockSize++)
     {
         const uint32_t blockWidth  = 1 << log2BlockSize;
         const uint32_t blockHeight = 1 << log2BlockSize;
@@ -259,7 +259,7 @@
     {
         for (uint32_t scanOrderIndex = 0; scanOrderIndex < SCAN_NUMBER_OF_TYPES; scanOrderIndex++)
         {
-            for (uint32_t log2BlockSize = 0; log2BlockSize < MAX_CU_DEPTH; log2BlockSize++)
+            for (uint32_t log2BlockSize = 0; log2BlockSize <= MAX_LOG2_TR_SIZE; log2BlockSize++)
             {
                 X265_FREE(g_scanOrder[groupTypeIndex][scanOrderIndex][log2BlockSize]);
             }
@@ -271,9 +271,11 @@
 // Data structure related table & variable
 // ====================================================================================================================
 
+uint32_t g_maxLog2CUSize = MAX_LOG2_CU_SIZE;
 uint32_t g_maxCUSize   = MAX_CU_SIZE;
-uint32_t g_maxCUDepth  = MAX_CU_DEPTH;
-uint32_t g_addCUDepth  = 0;
+uint32_t g_maxCUDepth  = MAX_FULL_DEPTH;
+uint32_t g_addCUDepth  = 1;
+uint32_t g_log2UnitSize = 2;
 uint32_t g_zscanToRaster[MAX_NUM_SPU_W * MAX_NUM_SPU_W] = { 0, };
 uint32_t g_rasterToZscan[MAX_NUM_SPU_W * MAX_NUM_SPU_W] = { 0, };
 uint32_t g_rasterToPelX[MAX_NUM_SPU_W * MAX_NUM_SPU_W] = { 0, };
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h	Mon Jul 14 14:50:34 2014 +0900
@@ -50,13 +50,20 @@
 // Macros
 // ====================================================================================================================
 
-#define MAX_CU_DEPTH            6                           // log2(LCUSize)
-#define MAX_CU_SIZE             (1 << (MAX_CU_DEPTH))       // maximum allowable size of CU
+#define MAX_CU_DEPTH            4                           // maximum CU depth
+#define MAX_FULL_DEPTH          5                           // maximum full depth
+#define MAX_LOG2_CU_SIZE        6                           // log2(LCUSize)
+#define MAX_CU_SIZE             (1 << MAX_LOG2_CU_SIZE)     // maximum allowable size of CU
 #define MIN_PU_SIZE             4
 #define MIN_TU_SIZE             4
 #define MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line
 #define ADI_BUF_STRIDE          (2 * MAX_CU_SIZE + 1 + 15)  // alignment to 16 bytes
 
+#define MAX_LOG2_TR_SIZE 5
+#define MAX_LOG2_TS_SIZE 2 // TODO: RExt
+#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
+#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
+
 // ====================================================================================================================
 // Initialize / destroy functions
 // ====================================================================================================================
@@ -75,7 +82,7 @@
 // flexible conversion from relative to absolute index
 extern uint32_t g_zscanToRaster[MAX_NUM_SPU_W * MAX_NUM_SPU_W];
 extern uint32_t g_rasterToZscan[MAX_NUM_SPU_W * MAX_NUM_SPU_W];
-extern uint16_t*  g_scanOrder[SCAN_NUMBER_OF_GROUP_TYPES][SCAN_NUMBER_OF_TYPES][MAX_CU_DEPTH];
+extern uint16_t* g_scanOrder[SCAN_NUMBER_OF_GROUP_TYPES][SCAN_NUMBER_OF_TYPES][MAX_LOG2_TR_SIZE + 1];
 void initZscanToRaster(int maxDepth, int depth, uint32_t startVal, uint32_t*& curIdx);
 void initRasterToZscan(uint32_t maxCUSize, uint32_t maxCUDepth);
 
@@ -86,12 +93,11 @@
 void initRasterToPelXY(uint32_t maxCUSize, uint32_t maxCUDepth);
 
 // global variable (LCU width/height, max. CU depth)
+extern uint32_t g_maxLog2CUSize;
 extern uint32_t g_maxCUSize;
 extern uint32_t g_maxCUDepth;
 extern uint32_t g_addCUDepth;
-
-#define LOG2_MAX_TS_SIZE 2 // TODO: RExt
-#define MAX_TS_SIZE (1 << LOG2_MAX_TS_SIZE)
+extern uint32_t g_log2UnitSize;
 
 extern const uint32_t g_puOffset[8];
 
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComSlice.h
--- a/source/Lib/TLibCommon/TComSlice.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComSlice.h	Mon Jul 14 14:50:34 2014 +0900
@@ -954,7 +954,7 @@
     // AMP accuracy
     int       getAMPAcc(uint32_t depth) const { return m_iAMPAcc[depth]; }
 
-    void      setAMPAcc(uint32_t depth, int iAccu) { X265_CHECK(depth < g_maxCUDepth, "AMP Acc depth\n");  m_iAMPAcc[depth] = iAccu; }
+    void      setAMPAcc(uint32_t depth, int iAccu) { X265_CHECK(depth < MAX_CU_DEPTH, "AMP Acc depth\n");  m_iAMPAcc[depth] = iAccu; }
 
     // Bit-depth
     int      getBitDepthY() const { return m_bitDepthY; }
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -1365,7 +1365,7 @@
  */
 void TComTrQuant::setErrScaleCoeff(uint32_t list, uint32_t size, uint32_t qp)
 {
-    uint32_t log2TrSize = g_convertToBit[g_scalingListSizeX[size]] + 2;
+    uint32_t log2TrSize = size + 2;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
 
     uint32_t i, maxNumCoeff = g_scalingListSize[size];
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -102,6 +102,8 @@
  */
 bool TEncCu::create(uint8_t totalDepth, uint32_t maxWidth)
 {
+    X265_CHECK(totalDepth <= MAX_CU_DEPTH, "invalid totalDepth\n");
+
     m_totalDepth     = totalDepth;
 
     m_bestPredYuv = new TComYuv*[totalDepth];
@@ -494,7 +496,7 @@
         bTestMergeAMP_Ver = true;
     }
 
-    if (outBestCU->getCUSize(0) == 64)
+    if (outBestCU->getLog2CUSize(0) == 6)
     {
         bTestAMP_Hor = false;
         bTestAMP_Ver = false;
@@ -527,10 +529,11 @@
         // copy partition YUV from depth 0 CTU cache
         m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU());
 
-    uint32_t cuSize = outTempCU->getCUSize(0);
+    uint32_t log2CUSize = outTempCU->getLog2CUSize(0);
     TComSlice* slice = outTempCU->getSlice();
     if (!bInsidePicture)
     {
+        uint32_t cuSize = 1 << log2CUSize;
         uint32_t lpelx = outBestCU->getCUPelX();
         uint32_t tpely = outBestCU->getCUPelY();
         uint32_t rpelx = lpelx + cuSize;
@@ -546,7 +549,7 @@
 
         if (depth == g_maxCUDepth - g_addCUDepth)
         {
-            if (cuSize > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
+            if (log2CUSize > slice->getSPS()->getQuadtreeTULog2MinSize())
                 xCheckRDCostIntra(outBestCU, outTempCU, SIZE_NxN);
         }
 
@@ -678,10 +681,11 @@
     bool doNotBlockPu = true;
     bool earlyDetectionSkipMode = false;
 
-    uint32_t cuSize = outTempCU->getCUSize(0);
+    uint32_t log2CUSize = outTempCU->getLog2CUSize(0);
     TComSlice* slice = outTempCU->getSlice();
     if (!bInsidePicture)
     {
+        uint32_t cuSize = 1 << log2CUSize;
         uint32_t lpelx = outBestCU->getCUPelX();
         uint32_t tpely = outBestCU->getCUPelY();
         uint32_t rpelx = lpelx + cuSize;
@@ -725,7 +729,7 @@
             if (slice->getSliceType() != I_SLICE)
             {
                 // 2Nx2N, NxN
-                if (!(cuSize == 8))
+                if (!(log2CUSize == 3))
                 {
                     if (depth == g_maxCUDepth - g_addCUDepth && doNotBlockPu)
                     {
@@ -842,7 +846,7 @@
 
                 if (depth == g_maxCUDepth - g_addCUDepth)
                 {
-                    if (cuSize > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
+                    if (log2CUSize > slice->getSPS()->getQuadtreeTULog2MinSize())
                     {
                         xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_NxN);
                         outTempCU->initEstData();
@@ -984,7 +988,7 @@
     uint32_t posy = (externalAddress / pic->getFrameWidthInCU()) * g_maxCUSize + g_rasterToPelY[g_zscanToRaster[internalAddress]];
     uint32_t width = slice->getSPS()->getPicWidthInLumaSamples();
     uint32_t height = slice->getSPS()->getPicHeightInLumaSamples();
-    uint32_t cuSize = cu->getCUSize(absPartIdx);
+    uint32_t cuSize = 1 << cu->getLog2CUSize(absPartIdx);
 
     while (posx >= width || posy >= height)
     {
@@ -1110,7 +1114,7 @@
     m_sbacCoder->codePredInfo(cu, absPartIdx);
 
     // Encode Coefficients, allow codeCoeff() to modify m_bEncodeDQP
-    m_sbacCoder->codeCoeff(cu, absPartIdx, depth, cu->getCUSize(absPartIdx), m_bEncodeDQP);
+    m_sbacCoder->codeCoeff(cu, absPartIdx, depth, m_bEncodeDQP);
 
     // --- write terminating bit ---
     finishCU(cu, absPartIdx, depth);
@@ -1275,15 +1279,15 @@
     outTempCU->m_mvBits = m_sbacCoder->getNumberOfWrittenBits();
 
     // Encode Coefficients
-    bool bEncodeDQP = m_bEncodeDQP;
-    m_sbacCoder->codeCoeff(outTempCU, 0, depth, outTempCU->getCUSize(0), bEncodeDQP);
+    bool bCodeDQP = m_bEncodeDQP;
+    m_sbacCoder->codeCoeff(outTempCU, 0, depth, bCodeDQP);
     m_sbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
     outTempCU->m_totalBits = m_sbacCoder->getNumberOfWrittenBits();
     outTempCU->m_coeffBits = outTempCU->m_totalBits - outTempCU->m_mvBits;
 
     if (m_rdCost.psyRdEnabled())
     {
-        int part = g_convertToBit[outTempCU->getCUSize(0)];
+        int part = outTempCU->getLog2CUSize(0) - 2;
         outTempCU->m_psyEnergy = m_rdCost.psyCost(part, m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
                                                   m_tmpRecoYuv[depth]->getLumaAddr(), m_tmpRecoYuv[depth]->getStride());
         outTempCU->m_totalPsyCost = m_rdCost.calcPsyRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits, outTempCU->m_psyEnergy);
@@ -1325,14 +1329,14 @@
 
     // Encode Coefficients
     bool bCodeDQP = m_bEncodeDQP;
-    m_sbacCoder->codeCoeff(outTempCU, 0, depth, outTempCU->getCUSize(0), bCodeDQP);
+    m_sbacCoder->codeCoeff(outTempCU, 0, depth, bCodeDQP);
     m_sbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
     outTempCU->m_totalBits = m_sbacCoder->getNumberOfWrittenBits();
     outTempCU->m_coeffBits = outTempCU->m_totalBits - outTempCU->m_mvBits;
 
     if (m_rdCost.psyRdEnabled())
     {
-        int part = g_convertToBit[outTempCU->getCUSize(0)];
+        int part = outTempCU->getLog2CUSize(0) - 2;
         outTempCU->m_psyEnergy = m_rdCost.psyCost(part, m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
                                                   m_tmpRecoYuv[depth]->getLumaAddr(), m_tmpRecoYuv[depth]->getStride());
         outTempCU->m_totalPsyCost = m_rdCost.calcPsyRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits, outTempCU->m_psyEnergy);
@@ -1397,8 +1401,8 @@
  */
 void TEncCu::xFillOrigYUVBuffer(TComDataCU* cu, TComYuv* fencYuv)
 {
-    uint32_t width  = cu->getCUSize(0);
-    uint32_t height = cu->getCUSize(0);
+    uint32_t width  = 1 << cu->getLog2CUSize(0);
+    uint32_t height = 1 << cu->getLog2CUSize(0);
 
     pixel* srcY = fencYuv->getLumaAddr();
     pixel* dstY = cu->getLumaOrigYuv();
diff -r 6055baa75085 -r fa683df9621e source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -98,7 +98,7 @@
     m_qtTempCoeff[1] = m_qtTempCoeff[0] + m_numLayers;
     m_qtTempCoeff[2] = m_qtTempCoeff[0] + m_numLayers * 2;
     m_qtTempShortYuv = new ShortYuv[m_numLayers];
-    uint32_t sizeL = g_maxCUSize * g_maxCUSize;
+    uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
     uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
     for (uint32_t i = 0; i < m_numLayers; ++i)
     {
@@ -145,7 +145,7 @@
     uint32_t fullDepth  = cu->getDepth(0) + trDepth;
     uint32_t trMode     = cu->getTransformIdx(absPartIdx);
     uint32_t subdiv     = (trMode > trDepth ? 1 : 0);
-    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
     if (cu->getPredictionMode(0) == MODE_INTRA && cu->getPartitionSize(0) == SIZE_NxN && trDepth == 0)
     {
@@ -189,7 +189,7 @@
     uint32_t fullDepth  = cu->getDepth(0) + trDepth;
     uint32_t trMode     = cu->getTransformIdx(absPartIdx);
     uint32_t subdiv     = (trMode > trDepth ? 1 : 0);
-    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
     int      chFmt      = cu->getChromaFormat();
     if ((log2TrSize > 2) && !(chFmt == CHROMA_444))
@@ -236,7 +236,7 @@
         return;
     }
 
-    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
     uint32_t log2UnitSize = cu->getPic()->getLog2UnitSize();
     uint32_t coeffOffset = absPartIdx << (log2UnitSize * 2);
@@ -263,7 +263,7 @@
         return;
     }
 
-    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
     uint32_t log2UnitSize = cu->getPic()->getLog2UnitSize();
 
@@ -371,9 +371,10 @@
 
 uint32_t TEncSearch::xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep)
 {
+    int cuSize = 1 << cu->getLog2CUSize(absPartIdx);
     m_sbacCoder->resetBits();
     xEncIntraHeaderChroma(cu, absPartIdx);
-    xEncSubdivCbfQTChroma(cu, trDepth, absPartIdx, absPartIdxStep, cu->getCUSize(absPartIdx), cu->getCUSize(absPartIdx));
+    xEncSubdivCbfQTChroma(cu, trDepth, absPartIdx, absPartIdxStep, cuSize, cuSize);
     xEncCoeffQTChroma(cu, trDepth, absPartIdx, TEXT_CHROMA_U);
     xEncCoeffQTChroma(cu, trDepth, absPartIdx, TEXT_CHROMA_V);
     return m_sbacCoder->getNumberOfWrittenBits();
@@ -410,7 +411,6 @@
                                      uint32_t&   cbf,
                                      uint32_t&   outDist)
 {
-    uint32_t tuSize       = 1 << log2TrSize;
     uint32_t stride       = fencYuv->getStride();
     pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
     pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
@@ -420,19 +420,22 @@
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
     bool     useTransformSkip = !!cu->getTransformSkip(absPartIdx, TEXT_LUMA);
-    int      part = partitionFromSize(tuSize);
+    int      part = partitionFromLog2Size(log2TrSize);
     int      sizeIdx = log2TrSize - 2;
 
     //===== get residual signal =====
+#if CHECKED_BUILD || _DEBUG
+    uint32_t tuSize       = 1 << log2TrSize;
     X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
     X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
     X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
+#endif
     primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
 
     //===== transform and quantization =====
     //--- init rate estimation arrays for RDOQ ---
     if (m_bEnableRDOQ)
-        m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, tuSize, TEXT_LUMA);
+        m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, log2TrSize, TEXT_LUMA);
 
     //--- transform and quantization ---
     int chFmt = cu->getChromaFormat();
@@ -455,7 +458,7 @@
     else
     {
 #if CHECKED_BUILD || _DEBUG
-        memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
+        memset(coeff, 0, sizeof(coeff_t) << log2TrSize * 2);
 #endif
         //===== reconstruction =====
         primitives.square_copy_ps[sizeIdx](reconQt,    reconQtStride,    pred, stride);
@@ -479,7 +482,6 @@
                                        uint32_t    log2TrSizeC)
 {
     TextType ttype        = (TextType)chromaId;
-    uint32_t tuSize       = 1 << log2TrSizeC;
     uint32_t stride       = fencYuv->getCStride();
     pixel*   fenc         = fencYuv->getChromaAddr(chromaId, absPartIdx);
     pixel*   pred         = predYuv->getChromaAddr(chromaId, absPartIdx);
@@ -489,19 +491,22 @@
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
     bool     useTransformSkipC = !!cu->getTransformSkip(absPartIdx, ttype);
-    int      part = partitionFromSize(tuSize);
+    int      part = partitionFromLog2Size(log2TrSizeC);
     int      sizeIdxC = log2TrSizeC - 2;
 
     //===== get residual signal =====
+#if CHECKED_BUILD || _DEBUG
+    uint32_t tuSize       = 1 << log2TrSizeC;
     X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment check fail\n");
     X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment check fail\n");
     X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment check fail\n");
+#endif
     primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
 
     //===== transform and quantization =====
     //--- init rate estimation arrays for RDOQ ---
     if (m_bEnableRDOQ)
-        m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, tuSize, TEXT_CHROMA);
+        m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, log2TrSizeC, TEXT_CHROMA);
 
     //--- transform and quantization ---
     int chFmt = cu->getChromaFormat();
@@ -531,7 +536,7 @@
     else
     {
 #if CHECKED_BUILD || _DEBUG
-        memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
+        memset(coeff, 0, sizeof(coeff_t) << log2TrSizeC * 2);
 #endif
         //===== reconstruction =====
         primitives.square_copy_ps[sizeIdxC](reconQt,    reconQtStride,    pred, stride);
@@ -558,7 +563,7 @@
                                      uint64_t&   rdCost)
 {
     uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
-    uint32_t log2TrSize  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
     bool     bCheckFull  = (log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
     bool     bCheckSplit = (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
 
@@ -592,7 +597,7 @@
         uint32_t tuSize = 1 << log2TrSize;
 
         bool checkTransformSkip = (cu->getSlice()->getPPS()->getUseTransformSkip() &&
-                                   log2TrSize <= LOG2_MAX_TS_SIZE &&
+                                   log2TrSize <= MAX_LOG2_TS_SIZE &&
                                    !cu->getCUTransquantBypass(0));
         if (checkTransformSkip)
         {
@@ -611,7 +616,7 @@
         TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
 
         //===== get prediction signal =====
-        predIntraLumaAng(lumaPredMode, pred, stride, tuSize);
+        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 
         cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
 
@@ -821,7 +826,7 @@
                                              TComYuv*    reconYuv)
 {
     uint32_t fullDepth   = cu->getDepth(0) +  trDepth;
-    uint32_t log2TrSize  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
     bool     bCheckFull  = (log2TrSize <= cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize());
     bool     bCheckSplit = (log2TrSize > cu->getQuadtreeTULog2MinSizeInCU(absPartIdx));
 
@@ -838,7 +843,6 @@
 
         //----- code luma block with given intra prediction mode and store Cbf-----
         uint32_t lumaPredMode = cu->getLumaIntraDir(absPartIdx);
-        uint32_t tuSize       = 1 << log2TrSize;
         int      chFmt        = cu->getChromaFormat();
         uint32_t stride       = fencYuv->getStride();
         pixel*   fenc         = fencYuv->getLumaAddr(absPartIdx);
@@ -857,14 +861,17 @@
         //===== init availability pattern =====
         TComPattern::initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt, lumaPredMode);
         //===== get prediction signal =====
-        predIntraLumaAng(lumaPredMode, pred, stride, tuSize);
+        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 
         cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
 
         //===== get residual signal =====
+#if CHECKED_BUILD || _DEBUG
+        uint32_t tuSize       = 1 << log2TrSize;
         X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
         X265_CHECK(!((intptr_t)pred & (tuSize - 1)), "pred alignment failure\n");
         X265_CHECK(!((intptr_t)residual & (tuSize - 1)), "residual alignment failure\n");
+#endif
         int sizeIdx = log2TrSize - 2;
         primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
 
@@ -875,7 +882,7 @@
         //--- set coded block flag ---
         cu->setCbfSubParts((numSig ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
 
-        int part = partitionFromSize(tuSize);
+        int part = partitionFromLog2Size(log2TrSize);
 
         if (numSig)
         {
@@ -889,7 +896,7 @@
         else
         {
 #if CHECKED_BUILD || _DEBUG
-            memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);
+            memset(coeff, 0, sizeof(coeff_t) << log2TrSize * 2);
 #endif
 
             // Generate Recon
@@ -924,7 +931,7 @@
 
     if (trMode == trDepth)
     {
-        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
         uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
 
         //===== copy transform coefficients =====
@@ -934,7 +941,7 @@
         ::memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));
 
         //===== copy reconstruction =====
-        m_qtTempShortYuv[qtLayer].copyPartToPartLuma(reconYuv, absPartIdx, 1 << log2TrSize);
+        m_qtTempShortYuv[qtLayer].copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
     }
     else
     {
@@ -972,7 +979,7 @@
 {
     uint32_t depth = cu->getDepth(0);
     uint32_t fullDepth = depth + trDepth;
-    uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 
     uint32_t trDepthC = trDepth;
     if ((log2TrSize == 2) && !(cu->getChromaFormat() == CHROMA_444))
@@ -1018,7 +1025,7 @@
     if (trMode == trDepth)
     {
         int chFmt = cu->getChromaFormat();
-        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
 
         uint32_t trDepthC = trDepth;
@@ -1033,19 +1040,20 @@
                 return;
         }
 
+        uint32_t log2UnitSize = cu->getPic()->getLog2UnitSize();
         uint32_t tuSize = 1 << log2TrSizeC;
         uint32_t stride = fencYuv->getCStride();
         const bool splitIntoSubTUs = (chFmt == CHROMA_422);
 
         bool checkTransformSkip = (cu->getSlice()->getPPS()->getUseTransformSkip() &&
-                                   log2TrSizeC <= LOG2_MAX_TS_SIZE &&
+                                   log2TrSizeC <= MAX_LOG2_TS_SIZE &&
                                    !cu->getCUTransquantBypass(0));
 
         uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
 
         if (m_param->bEnableTSkipFast)
         {
-            checkTransformSkip &= ((cu->getCUSize(0) >> trDepth) <= 4);
+            checkTransformSkip &= (log2TrSize <= MAX_LOG2_TS_SIZE);
             if (checkTransformSkip)
             {
                 int nbLumaSkip = 0;
@@ -1079,14 +1087,14 @@
                 }
                 chromaPredMode = (chFmt == CHROMA_422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
                 //===== get prediction signal =====
-                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, tuSize, chFmt);
+                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, chFmt);
 
                 uint32_t singleCbfC     = 0;
                 uint32_t singlePsyEnergyTmp = 0;
 
                 int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdxC);
                 uint32_t reconQtStride  = m_qtTempShortYuv[qtLayer].m_cwidth;
-                uint32_t coeffOffsetC   = absPartIdxC << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
+                uint32_t coeffOffsetC   = absPartIdxC << (log2UnitSize * 2 - (m_hChromaShift + m_vChromaShift));
                 coeff_t* coeffC         = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
 
                 if (checkTransformSkip)
@@ -1222,11 +1230,10 @@
     if (trMode == trDepth)
     {
         int      chFmt      = cu->getChromaFormat();
-        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
         uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
 
-        bool bChromaSame = false;
         if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
         {
             X265_CHECK(trDepth > 0, "invalid trDepth\n");
@@ -1235,7 +1242,6 @@
             uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
             if ((absPartIdx & (qpdiv - 1)) != 0)
                 return;
-            bChromaSame = true;
         }
 
         //===== copy transform coefficients =====
@@ -1251,7 +1257,7 @@
         ::memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
 
         //===== copy reconstruction =====
-        m_qtTempShortYuv[qtLayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << log2TrSize, (bChromaSame && (chFmt != CHROMA_422)));
+        m_qtTempShortYuv[qtLayer].copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
     }
     else
     {
@@ -1275,9 +1281,8 @@
     if (trMode == trDepth)
     {
         int      chFmt     = cu->getChromaFormat();
-        uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - fullDepth;
+        uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-        uint32_t origTrDepth = trDepth;
         uint32_t trDepthC = trDepth;
         if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
         {
@@ -1290,11 +1295,12 @@
                 return;
         }
 
+        uint32_t log2UnitSize = cu->getPic()->getLog2UnitSize();
         uint32_t tuSize = 1 << log2TrSizeC;
         uint32_t stride = fencYuv->getCStride();
         const bool splitIntoSubTUs = (chFmt == CHROMA_422);
         int sizeIdxC = log2TrSizeC - 2;
-        int part = partitionFromSize(tuSize);
+        int part = partitionFromLog2Size(log2TrSizeC);
 
         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
         {
@@ -1310,7 +1316,7 @@
                 pixel*   pred           = predYuv->getChromaAddr(chromaId, absPartIdxC);
                 int16_t* residual       = resiYuv->getChromaAddr(chromaId, absPartIdxC);
                 pixel*   recon          = reconYuv->getChromaAddr(chromaId, absPartIdxC);
-                uint32_t coeffOffsetC   = absPartIdxC << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
+                uint32_t coeffOffsetC   = absPartIdxC << (log2UnitSize * 2 - (m_hChromaShift + m_vChromaShift));
                 coeff_t* coeff          = cu->getCoeff(ttype) + coeffOffsetC;
                 uint32_t zorder         = cu->getZorderIdxInCU() + absPartIdxC;
                 pixel*   reconIPred     = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
@@ -1333,7 +1339,7 @@
                 pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
 
                 //===== get prediction signal =====
-                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, tuSize, chFmt);
+                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, chFmt);
 
                 //===== get residual signal =====
                 X265_CHECK(!((intptr_t)fenc & (tuSize - 1)), "fenc alignment failure\n");
@@ -1351,7 +1357,7 @@
                 uint32_t numSig = m_trQuant.transformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTransformSkipC);
 
                 //--- set coded block flag ---
-                cu->setCbfPartRange((((numSig > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.absPartIdxStep);
+                cu->setCbfPartRange((((numSig > 0) ? 1 : 0) << trDepth), ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
                 if (numSig)
                 {
@@ -1406,13 +1412,14 @@
     uint32_t depth        = cu->getDepth(0);
     uint32_t initTrDepth  = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
     uint32_t numPU        = 1 << (2 * initTrDepth);
-    uint32_t tuSize       = cu->getCUSize(0) >> initTrDepth;
+    uint32_t log2TrSize   = cu->getLog2CUSize(0) - initTrDepth;
+    uint32_t tuSize       = 1 << log2TrSize;
     uint32_t qNumParts    = cu->getTotalNumPart() >> 2;
     uint32_t qPartNum     = cu->getPic()->getNumPartInCU() >> ((depth + initTrDepth) << 1);
     uint32_t overallDistY = 0;
     uint32_t candNum;
     uint64_t candCostList[FAST_UDI_MAX_RDMODE_NUM];
-    uint32_t sizeIdx      = g_convertToBit[tuSize]; // log2(tuSize) - 2
+    uint32_t sizeIdx      = log2TrSize - 2;
     static const uint8_t intraModeNumFast[] = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, 16x16, 32x32, 64x64
 
     //===== loop over partitions =====
@@ -1474,7 +1481,7 @@
                 scaleTuSize = 32;
                 scaleStride = 32;
                 costShift = 2;
-                sizeIdx = 5 - 2; // g_convertToBit[scaleTuSize];
+                sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
 
                 // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
                 above         = aboveScale;
@@ -1628,12 +1635,11 @@
         if (pu != numPU - 1)
         {
             uint32_t zorder      = cu->getZorderIdxInCU() + partOffset;
-            int      part        = partitionFromSize(tuSize);
             pixel*   dst         = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
             uint32_t dststride   = cu->getPic()->getPicYuvRec()->getStride();
             pixel*   src         = reconYuv->getLumaAddr(partOffset);
             uint32_t srcstride   = reconYuv->getStride();
-            primitives.luma_copy_pp[part](dst, dststride, src, srcstride);
+            primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
         }
 
         //=== update PU data ====
@@ -1671,7 +1677,8 @@
     uint32_t maxMode = NUM_CHROMA_MODE;
     uint32_t modeList[NUM_CHROMA_MODE];
 
-    uint32_t tuSize = cu->getCUSize(0) >> (trDepth + m_hChromaShift);
+    uint32_t log2TrSizeC = cu->getLog2CUSize(0) - trDepth - m_hChromaShift;
+    uint32_t tuSize = 1 << log2TrSizeC;
     int      chFmt  = cu->getChromaFormat();
     uint32_t stride = fencYuv->getCStride();
     int scaleTuSize = tuSize;
@@ -1681,8 +1688,9 @@
     {
         scaleTuSize = 32;
         costShift = 2;
+        log2TrSizeC = 5;
     }
-    int sizeIdx = g_convertToBit[scaleTuSize];
+    int sizeIdx = log2TrSizeC - 2;
     pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
 
     TComPattern::initAdiPatternChroma(cu, absPartIdx, trDepth, m_predBuf, 1);
@@ -1700,10 +1708,10 @@
         {
             pixel* fenc = fencYuv->getChromaAddr(chromaId, absPartIdx);
             pixel* pred = predYuv->getChromaAddr(chromaId, absPartIdx);
-            pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, tuSize, m_predBuf);
+            pixel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, scaleTuSize, m_predBuf);
 
             //===== get prediction signal =====
-            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, scaleTuSize, chFmt);
+            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, chFmt);
             cost += sa8d(fenc, stride, pred, stride) << costShift;
         }
 
@@ -1726,11 +1734,11 @@
 {
     uint32_t depth              = cu->getDepth(0);
     uint32_t initTrDepth        = (cu->getPartitionSize(0) != SIZE_2Nx2N) && (cu->getChromaFormat() == CHROMA_444 ? 1 : 0);
-    uint32_t tuSize             = cu->getCUSize(0) >> initTrDepth;
+    uint32_t log2TrSize         = cu->getLog2CUSize(0) - initTrDepth;
     uint32_t absPartIdx         = (cu->getPic()->getNumPartInCU() >> (depth << 1));
 
     int chFmt = cu->getChromaFormat();
-    int part = partitionFromSize(tuSize);
+    int part = partitionFromLog2Size(log2TrSize);
 
     TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartIdx, 0);
 
@@ -1843,7 +1851,7 @@
 {
     assert(cu->getPartitionSize(0) != SIZE_2Nx2N);
 
-    if (cu->getCUSize(0) <= 8 && cu->getSlice()->getPPS()->getLog2ParallelMergeLevelMinus2())
+    if (cu->getLog2CUSize(0) <= 3 && cu->getSlice()->getPPS()->getLog2ParallelMergeLevelMinus2())
     {
         if (puIdx == 0)
         {
@@ -1948,7 +1956,7 @@
             merge.height = roiHeight;
             mrgCost = xMergeEstimation(cu, partIdx, merge);
 
-            if (bMergeOnly && cu->getCUSize(0) > 8)
+            if (bMergeOnly && cu->getLog2CUSize(0) > 3)
             {
                 if (mrgCost == MAX_UINT)
                 {
@@ -2311,7 +2319,8 @@
     uint32_t bits = 0, bestBits = 0;
     uint32_t distortion = 0, bestDist = 0;
 
-    uint32_t cuSize = cu->getCUSize(0);
+    uint32_t log2CUSize = cu->getLog2CUSize(0);
+    uint32_t cuSize = 1 << log2CUSize;
     uint8_t  depth  = cu->getDepth(0);
 
     // No residual coding : SKIP mode
@@ -2321,7 +2330,7 @@
 
         predYuv->copyToPartYuv(outReconYuv, 0);
         // Luma
-        int part = partitionFromSize(cuSize);
+        int part = partitionFromLog2Size(log2CUSize);
         distortion = primitives.sse_pp[part](fencYuv->getLumaAddr(), fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
         // Chroma
         part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
@@ -2341,7 +2350,7 @@
         cu->m_totalDistortion = distortion;
         if (m_rdCost.psyRdEnabled())
         {
-            int size = g_convertToBit[cuSize];
+            int size = log2CUSize - 2;
             cu->m_psyEnergy = m_rdCost.psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(),
                                                    outReconYuv->getLumaAddr(), outReconYuv->getStride());
             cu->m_totalPsyCost = m_rdCost.calcPsyRdCost(cu->m_totalDistortion, cu->m_totalBits, cu->m_psyEnergy);
@@ -2356,7 +2365,7 @@
         return;
     }
 
-    outResiYuv->subtract(fencYuv, predYuv, cuSize);
+    outResiYuv->subtract(fencYuv, predYuv, log2CUSize);
 
     // Residual coding.
     bool bIsTQBypassEnable = false, bIsLosslessMode = false;
@@ -2394,7 +2403,7 @@
         if (m_rdCost.psyRdEnabled())
         {
             // need to check whether zero distortion is similar to psyenergy of fenc
-            int size = g_convertToBit[cuSize];
+            int size = log2CUSize - 2;
             zeroPsyEnergyY = m_rdCost.psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(), (pixel*)RDCost::zeroPel, 0);
             zeroCost = m_rdCost.calcPsyRdCost(zeroDistortion, zeroResiBits, zeroPsyEnergyY);
         }
@@ -2451,14 +2460,14 @@
             predYuv->copyToPartYuv(outReconYuv, 0);
 
         // update with clipped distortion and cost (qp estimation loop uses unclipped values)
-        int part = partitionFromSize(cuSize);
+        int part = partitionFromLog2Size(log2CUSize);
         bestDist = primitives.sse_pp[part](fencYuv->getLumaAddr(), fencYuv->getStride(), outReconYuv->getLumaAddr(), outReconYuv->getStride());
         part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
         bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->getCbAddr(), fencYuv->getCStride(), outReconYuv->getCbAddr(), outReconYuv->getCStride()));
         bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->getCrAddr(), fencYuv->getCStride(), outReconYuv->getCrAddr(), outReconYuv->getCStride()));
         if (m_rdCost.psyRdEnabled())
         {
-            int size = g_convertToBit[cuSize];
+            int size = log2CUSize - 2;
             cu->m_psyEnergy = m_rdCost.psyCost(size, fencYuv->getLumaAddr(), fencYuv->getStride(),
                                                outReconYuv->getLumaAddr(), outReconYuv->getStride());
             cu->m_totalPsyCost = m_rdCost.calcPsyRdCost(bestDist, bestBits, cu->m_psyEnergy);
@@ -2484,9 +2493,9 @@
     if (cu->getPredictionMode(0) == MODE_INTER)
     {
         residualTransformQuantInter(cu, 0, resiYuv, cu->getDepth(0), true);
-        uint32_t width  = cu->getCUSize(0);
+        uint32_t cuSize = 1 << cu->getLog2CUSize(0);
         if (cu->getQtRootCbf(0))
-            reconYuv->addClip(predYuv, resiYuv, width);
+            reconYuv->addClip(predYuv, resiYuv, cuSize);
         else
         {
             predYuv->copyToPartYuv(reconYuv, 0);
@@ -2507,7 +2516,7 @@
 {
     X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "invalid depth\n");
     const uint32_t trMode = depth - cu->getDepth(0);
-    const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
+    const uint32_t log2TrSize = g_maxLog2CUSize - depth;
     const uint32_t setCbf     = 1 << trMode;
     int chFmt                 = cu->getChromaFormat();
 
@@ -2664,7 +2673,7 @@
 {
     X265_CHECK(cu->getDepth(0) == cu->getDepth(absPartIdx), "depth not matching\n");
     const uint32_t trMode = depth - cu->getDepth(0);
-    const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
+    const uint32_t log2TrSize = g_maxLog2CUSize - depth;
     const uint32_t subTUDepth = trMode + 1;
     const uint32_t setCbf     = 1 << trMode;
     int chFmt                 = cu->getChromaFormat();
@@ -2726,13 +2735,13 @@
 
         cu->setTrIdxSubParts(depth - cu->getDepth(0), absPartIdx, depth);
         bool checkTransformSkip   = cu->getSlice()->getPPS()->getUseTransformSkip() && !cu->getCUTransquantBypass(0);
-        bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= LOG2_MAX_TS_SIZE;
-        bool checkTransformSkipUV = checkTransformSkip && log2TrSizeC <= LOG2_MAX_TS_SIZE;
+        bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= MAX_LOG2_TS_SIZE;
+        bool checkTransformSkipUV = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
 
         cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
 
         if (m_bEnableRDOQ && curuseRDOQ)
-            m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, trSize, TEXT_LUMA);
+            m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, log2TrSize, TEXT_LUMA);
 
         m_trQuant.setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
         numSigY = m_trQuant.transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,
@@ -2761,7 +2770,7 @@
                 cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
 
                 if (m_bEnableRDOQ && curuseRDOQ)
-                    m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, trSizeC, TEXT_CHROMA);
+                    m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, log2TrSizeC, TEXT_CHROMA);
 
                 //Cb transform
                 int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
@@ -2804,13 +2813,13 @@
             minCost[TEXT_CHROMA_V][subTUIndex] = MAX_INT64;
         }
 
-        int partSize = partitionFromSize(trSize);
+        int partSize = partitionFromLog2Size(log2TrSize);
         uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, 0);
         uint32_t psyEnergyY = 0;
         if (m_rdCost.psyRdEnabled())
         {
             // need to check whether zero distortion is similar to psyenergy of fenc
-            int size = g_convertToBit[trSize];
+            int size = log2TrSize - 2;
             psyEnergyY = m_rdCost.psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(), (pixel*)RDCost::zeroPel, 0);
         }
         int16_t *curResiY = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
@@ -2904,7 +2913,7 @@
         {
             TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
 
-            int partSizeC = partitionFromSize(trSizeC);
+            int partSizeC = partitionFromLog2Size(log2TrSizeC);
 
             do
             {
@@ -3103,7 +3112,7 @@
             cu->setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
 
             if (m_bEnableRDOQ)
-                m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, trSize, TEXT_LUMA);
+                m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, log2TrSize, TEXT_LUMA);
 
             m_trQuant.setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
             uint32_t numSigTSkipY = m_trQuant.transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, tsCoeffY,
@@ -3167,7 +3176,7 @@
 
             TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
 
-            int partSizeC = partitionFromSize(trSizeC);
+            int partSizeC = partitionFromLog2Size(log2TrSizeC);
 
             do
             {
@@ -3186,7 +3195,7 @@
                 cu->setTransformSkipPartRange(1, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
 
                 if (m_bEnableRDOQ)
-                    m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, trSizeC, TEXT_CHROMA);
+                    m_sbacCoder->estBit(m_trQuant.m_estBitsSbac, log2TrSizeC, TEXT_CHROMA);
 
                 int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();
                 m_trQuant.setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);
@@ -3510,7 +3519,7 @@
     const uint32_t curTrMode   = depth - cu->getDepth(0);
     const uint32_t trMode      = cu->getTransformIdx(absPartIdx);
     const bool     bSubdiv     = curTrMode != trMode;
-    const uint32_t log2TrSize  = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
+    const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
     uint32_t       log2TrSizeC = log2TrSize - m_hChromaShift;
     int            chFmt       = cu->getChromaFormat();
     const bool splitIntoSubTUs = (chFmt == CHROMA_422);
@@ -3626,12 +3635,11 @@
     if (curTrMode == trMode)
     {
         int            chFmt      = cu->getChromaFormat();
-        const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
+        const uint32_t log2TrSize = g_maxLog2CUSize - depth;
         const uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
 
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
         bool bCodeChroma = true;
-        bool bChromaSame = false;
         uint32_t trModeC = trMode;
         if ((log2TrSize == 2) && !(chFmt == CHROMA_444))
         {
@@ -3639,16 +3647,14 @@
             trModeC--;
             uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
             bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
-            bChromaSame = true;
         }
 
         if (bSpatial)
         {
-            uint32_t trSize = 1 << log2TrSize;
-            m_qtTempShortYuv[qtLayer].copyPartToPartLuma(resiYuv, absPartIdx, trSize);
+            m_qtTempShortYuv[qtLayer].copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
 
             if (bCodeChroma)
-                m_qtTempShortYuv[qtLayer].copyPartToPartChroma(resiYuv, absPartIdx, trSize, (bChromaSame && (chFmt != CHROMA_422)));
+                m_qtTempShortYuv[qtLayer].copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
         }
         else
         {
@@ -3761,7 +3767,7 @@
         m_sbacCoder->codePredInfo(cu, 0);
         bool bDummy = false;
         cu->m_mvBits = m_sbacCoder->getNumberOfWrittenBits();
-        m_sbacCoder->codeCoeff(cu, 0, cu->getDepth(0), cu->getCUSize(0), bDummy);
+        m_sbacCoder->codeCoeff(cu, 0, cu->getDepth(0), bDummy);
         int totalBits = m_sbacCoder->getNumberOfWrittenBits();
         cu->m_coeffBits = totalBits - cu->m_mvBits;
         return totalBits;
diff -r 6055baa75085 -r fa683df9621e source/common/frame.cpp
--- a/source/common/frame.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/common/frame.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -73,8 +73,8 @@
     bool isVbv = param->rc.vbvBufferSize > 0 && param->rc.vbvMaxBitrate > 0;
     if (ok && (isVbv || param->rc.aqMode))
     {
-        int numCols = (param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
-        int numRows = (param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
+        int numCols = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
+        int numRows = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
 
         if (param->rc.aqMode)
             CHECKED_MALLOC(m_qpaAq, double, numRows);
@@ -126,8 +126,8 @@
 
 void Frame::reinit(x265_param *param)
 {
-    int numCols = (param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
-    int numRows = (param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
+    int numCols = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize;
+    int numRows = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
     if (param->rc.vbvBufferSize > 0 && param->rc.vbvMaxBitrate > 0)
     {
         memset(m_rowDiagQp, 0, numRows * sizeof(double));
diff -r 6055baa75085 -r fa683df9621e source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/common/intrapred.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -99,8 +99,8 @@
     }
 }
 
-template<int width>
-void planad_pred_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int /*bFilter*/)
+template<int log2Size>
+void planar_pred_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int /*bFilter*/)
 {
     above += 1;
     left  += 1;
@@ -110,10 +110,10 @@
     int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
     // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)
     int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
-    int blkSize = width;
-    int offset2D = width;
-    int shift1D = g_convertToBit[width] + 2;
-    int shift2D = shift1D + 1;
+    const int blkSize = 1 << log2Size;
+    const int offset2D = blkSize;
+    const int shift1D = log2Size;
+    const int shift2D = shift1D + 1;
 
     // Get left and above reference column and row
     for (k = 0; k < blkSize + 1; k++)
@@ -257,14 +257,16 @@
     }
 }
 
-template<int size>
+template<int log2Size>
 void all_angs_pred_c(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
 {
+    const int size = 1 << log2Size;
+    const int sizeIdx = log2Size - 2;
     for (int mode = 2; mode <= 34; mode++)
     {
-        pixel *left = (IntraFilterType[(int)g_convertToBit[size]][mode] ? left1 : left0);
-        pixel *above = (IntraFilterType[(int)g_convertToBit[size]][mode] ? above1 : above0);
-        pixel *out = dest + (mode - 2) * (size * size);
+        pixel *left = (IntraFilterType[sizeIdx][mode] ? left1 : left0);
+        pixel *above = (IntraFilterType[sizeIdx][mode] ? above1 : above0);
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
 
         intra_pred_ang_c<size>(out, size, left, above, mode, bLuma);
 
@@ -293,10 +295,10 @@
 
 void Setup_C_IPredPrimitives(EncoderPrimitives& p)
 {
-    p.intra_pred[BLOCK_4x4][0] = planad_pred_c<4>;
-    p.intra_pred[BLOCK_8x8][0] = planad_pred_c<8>;
-    p.intra_pred[BLOCK_16x16][0] = planad_pred_c<16>;
-    p.intra_pred[BLOCK_32x32][0] = planad_pred_c<32>;
+    p.intra_pred[BLOCK_4x4][0] = planar_pred_c<2>;
+    p.intra_pred[BLOCK_8x8][0] = planar_pred_c<3>;
+    p.intra_pred[BLOCK_16x16][0] = planar_pred_c<4>;
+    p.intra_pred[BLOCK_32x32][0] = planar_pred_c<5>;
 
     // Intra Prediction DC
     p.intra_pred[BLOCK_4x4][1] = intra_pred_dc_c<4>;
@@ -311,9 +313,9 @@
         p.intra_pred[BLOCK_32x32][i] = intra_pred_ang_c<32>;
     }
 
-    p.intra_pred_allangs[BLOCK_4x4] = all_angs_pred_c<4>;
-    p.intra_pred_allangs[BLOCK_8x8] = all_angs_pred_c<8>;
-    p.intra_pred_allangs[BLOCK_16x16] = all_angs_pred_c<16>;
-    p.intra_pred_allangs[BLOCK_32x32] = all_angs_pred_c<32>;
+    p.intra_pred_allangs[BLOCK_4x4] = all_angs_pred_c<2>;
+    p.intra_pred_allangs[BLOCK_8x8] = all_angs_pred_c<3>;
+    p.intra_pred_allangs[BLOCK_16x16] = all_angs_pred_c<4>;
+    p.intra_pred_allangs[BLOCK_32x32] = all_angs_pred_c<5>;
 }
 }
diff -r 6055baa75085 -r fa683df9621e source/common/param.cpp
--- a/source/common/param.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/common/param.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -875,7 +875,8 @@
         return check_failed;
 
     uint32_t maxCUDepth = (uint32_t)g_convertToBit[param->maxCUSize];
-    uint32_t tuQTMaxLog2Size = maxCUDepth + 2 - 1;
+    uint32_t maxLog2CUSize = maxCUDepth + 2;
+    uint32_t tuQTMaxLog2Size = maxLog2CUSize - 1;
     uint32_t tuQTMinLog2Size = 2; //log2(4)
 
     CHECK((param->maxCUSize >> maxCUDepth) < 4,
@@ -947,16 +948,16 @@
     CHECK(param->crQpOffset < -12, "Min. Chroma Cr QP Offset is -12");
     CHECK(param->crQpOffset >  12, "Max. Chroma Cr QP Offset is  12");
 
-    CHECK((1u << tuQTMaxLog2Size) > param->maxCUSize,
+    CHECK(tuQTMaxLog2Size > maxLog2CUSize,
           "QuadtreeTULog2MaxSize must be log2(maxCUSize) or smaller.");
 
     CHECK(param->tuQTMaxInterDepth < 1 || param->tuQTMaxInterDepth > 4,
           "QuadtreeTUMaxDepthInter must be greater than 0 and less than 5");
-    CHECK(param->maxCUSize < (1u << (tuQTMinLog2Size + param->tuQTMaxInterDepth - 1)),
+    CHECK(maxLog2CUSize < tuQTMinLog2Size + param->tuQTMaxInterDepth - 1,
           "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1");
     CHECK(param->tuQTMaxIntraDepth < 1 || param->tuQTMaxIntraDepth > 4,
           "QuadtreeTUMaxDepthIntra must be greater 0 and less than 5");
-    CHECK(param->maxCUSize < (1u << (tuQTMinLog2Size + param->tuQTMaxIntraDepth - 1)),
+    CHECK(maxLog2CUSize < tuQTMinLog2Size + param->tuQTMaxIntraDepth - 1,
           "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1");
 
     CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater.");
@@ -1087,17 +1088,15 @@
     {
         // set max CU width & height
         g_maxCUSize = param->maxCUSize;
+        g_maxLog2CUSize = maxCUDepth + 2;
 
         // compute actual CU depth with respect to config depth and max transform size
-        g_addCUDepth = 0;
-        while ((param->maxCUSize >> maxCUDepth) > (1u << (tuQTMinLog2Size + g_addCUDepth)))
-        {
-            g_addCUDepth++;
-        }
+        g_addCUDepth = g_maxLog2CUSize - maxCUDepth - tuQTMinLog2Size;
 
         maxCUDepth += g_addCUDepth;
         g_addCUDepth++;
         g_maxCUDepth = maxCUDepth;
+        g_log2UnitSize = g_maxLog2CUSize - g_maxCUDepth;
 
         // initialize partition order
         uint32_t* tmp = &g_zscanToRaster[0];
diff -r 6055baa75085 -r fa683df9621e source/common/primitives.h
--- a/source/common/primitives.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/common/primitives.h	Mon Jul 14 14:50:34 2014 +0900
@@ -120,6 +120,13 @@
     return part;
 }
 
+inline int partitionFromLog2Size(int log2Size) // looks up the luma partition index for a square block given log2 of its size
+{
+    X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n"); // accepted log2Size range is 2..6, i.e. block sizes 4..64
+    extern const uint8_t lumaPartitionsFromSquareBlocksTable[]; // only declared here; the definition is not visible in this hunk
+    return (int)lumaPartitionsFromSquareBlocksTable[log2Size - 2]; // table entry 0 corresponds to log2Size == 2
+}
+
 typedef int  (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
 typedef int  (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
diff -r 6055baa75085 -r fa683df9621e source/common/shortyuv.cpp
--- a/source/common/shortyuv.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/common/shortyuv.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -85,9 +85,9 @@
     ::memset(m_buf[2], 0, (m_cwidth * m_cheight) * sizeof(int16_t));
 }
 
-void ShortYuv::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t partSize)
+void ShortYuv::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t log2Size)
 {
-    int part = partitionFromSize(partSize);
+    int part = partitionFromLog2Size(log2Size);
 
     pixel* srcY0 = srcYuv0->getLumaAddr();
     pixel* srcY1 = srcYuv1->getLumaAddr();
@@ -119,67 +119,37 @@
     primitives.pixeladd_ss(cpartSize, cpartSize, getCrAddr(), m_cwidth, srcV0, srcV1, srcYuv0->m_cwidth, srcYuv1->m_cwidth);
 }
 
-void ShortYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t partSize)
+void ShortYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t log2Size)
 {
-    int part = partitionFromSize(partSize);
     int16_t* src = getLumaAddr(partIdx);
     int16_t* dst = dstPicYuv->getLumaAddr(partIdx);
 
-    primitives.luma_copy_ss[part](dst, dstPicYuv->m_width, src, m_width);
+    primitives.square_copy_ss[log2Size - 2](dst, dstPicYuv->m_width, src, m_width);
 }
 
-void ShortYuv::copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t partSize)
+void ShortYuv::copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t log2Size)
 {
-    int part = partitionFromSize(partSize);
     int16_t* src = getLumaAddr(partIdx);
     pixel* dst = dstPicYuv->getLumaAddr(partIdx);
 
-    primitives.luma_copy_sp[part](dst, dstPicYuv->getStride(), src, m_width);
+    primitives.square_copy_sp[log2Size - 2](dst, dstPicYuv->getStride(), src, m_width);
 }
 
-void ShortYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
+void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t log2SizeL)
 {
-    int part = partitionFromSizes(width, height);
-    int16_t* src = getLumaAddr(partIdx);
-    int16_t* dst = dstPicYuv->getLumaAddr(partIdx);
-
-    primitives.luma_copy_ss[part](dst, dstPicYuv->m_width, src, m_width);
-}
-
-void ShortYuv::copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
-{
-    int part = partitionFromSizes(width, height);
-    int16_t* src = getLumaAddr(partIdx);
-    pixel* dst = dstPicYuv->getLumaAddr(partIdx);
-
-    primitives.luma_copy_sp[part](dst, dstPicYuv->getStride(), src, m_width);
-}
-
-void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
-{
-    int part = partitionFromSize(lumaSize);
-
-    part = ((part == 0) && (m_csp == CHROMA_422)) ? 1 : part;
+    int part = partitionFromLog2Size(log2SizeL);
     int16_t* srcU = getCbAddr(partIdx);
     int16_t* srcV = getCrAddr(partIdx);
     int16_t* dstU = dstPicYuv->getCbAddr(partIdx);
     int16_t* dstV = dstPicYuv->getCrAddr(partIdx);
 
-    if (bChromaSame)
-    {
-        primitives.luma_copy_ss[part](dstU, dstPicYuv->m_cwidth, srcU, m_cwidth);
-        primitives.luma_copy_ss[part](dstV, dstPicYuv->m_cwidth, srcV, m_cwidth);
-    }
-    else
-    {
-        primitives.chroma[m_csp].copy_ss[part](dstU, dstPicYuv->m_cwidth, srcU, m_cwidth);
-        primitives.chroma[m_csp].copy_ss[part](dstV, dstPicYuv->m_cwidth, srcV, m_cwidth);
-    }
+    primitives.chroma[m_csp].copy_ss[part](dstU, dstPicYuv->m_cwidth, srcU, m_cwidth);
+    primitives.chroma[m_csp].copy_ss[part](dstV, dstPicYuv->m_cwidth, srcV, m_cwidth);
 }
 
-void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
+void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t log2SizeL)
 {
-    int part = partitionFromSize(lumaSize);
+    int part = partitionFromLog2Size(log2SizeL);
     int16_t* srcU = getCbAddr(partIdx);
     int16_t* srcV = getCrAddr(partIdx);
     pixel* dstU = dstPicYuv->getCbAddr(partIdx);
@@ -188,16 +158,8 @@
     uint32_t srcStride = m_cwidth;
     uint32_t dstStride = dstPicYuv->getCStride();
 
-    if (bChromaSame)
-    {
-        primitives.luma_copy_sp[part](dstU, dstStride, srcU, srcStride);
-        primitives.luma_copy_sp[part](dstV, dstStride, srcV, srcStride);
-    }
-    else
-    {
-        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
-        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
-    }
+    primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
+    primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
 }
 
 void ShortYuv::copyPartToPartShortChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId)
diff -r 6055baa75085 -r fa683df9621e source/common/shortyuv.h
--- a/source/common/shortyuv.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/common/shortyuv.h	Mon Jul 14 14:50:34 2014 +0900
@@ -89,17 +89,15 @@
 
     int16_t* getChromaAddr(uint32_t chromaId, uint32_t partUnitIdx) { return m_buf[chromaId] + getChromaAddrOffset(partUnitIdx, m_cwidth); }
 
-    void subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t partSize);
+    void subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t log2Size);
     void addClip(ShortYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partSize);
 
-    void copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t partSize);
-    void copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
-    void copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame);
+    void copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t log2Size);
+    void copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t log2SizeL);
     void copyPartToPartShortChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId);
 
-    void copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t partSize);
-    void copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
-    void copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame);
+    void copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t log2Size);
+    void copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t log2SizeL);
 
     // -------------------------------------------------------------------------------------------------------------------
     // member functions to support multiple color space formats
diff -r 6055baa75085 -r fa683df9621e source/encoder/compress.cpp
--- a/source/encoder/compress.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/encoder/compress.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -69,14 +69,14 @@
 
     // Encode Coefficients
     bool bCodeDQP = m_bEncodeDQP;
-    m_sbacCoder->codeCoeff(cu, 0, depth, cu->getCUSize(0), bCodeDQP);
+    m_sbacCoder->codeCoeff(cu, 0, depth, bCodeDQP);
     m_sbacCoder->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
 
     cu->m_totalBits = m_sbacCoder->getNumberOfWrittenBits();
     cu->m_coeffBits = cu->m_totalBits - cu->m_mvBits;
     if (m_rdCost.psyRdEnabled())
     {
-        int part = g_convertToBit[cu->getCUSize(0)];
+        int part = cu->getLog2CUSize(0) - 2;
         cu->m_psyEnergy = m_rdCost.psyCost(part, m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
             m_tmpRecoYuv[depth]->getLumaAddr(), m_tmpRecoYuv[depth]->getStride());
         cu->m_totalPsyCost = m_rdCost.calcPsyRdCost(cu->m_totalDistortion, cu->m_totalBits, cu->m_psyEnergy);
@@ -94,7 +94,8 @@
     cu->setCUTransquantBypassSubParts(!!m_param->bLossless, 0, depth);
 
     uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;
-    uint32_t tuSize      = cu->getCUSize(0) >> initTrDepth;
+    uint32_t log2TrSize  = cu->getLog2CUSize(0) - initTrDepth;
+    uint32_t tuSize      = 1 << log2TrSize;
     const uint32_t partOffset  = 0;
 
     // Reference sample smoothing
@@ -116,7 +117,8 @@
     ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
     int scaleTuSize = tuSize;
     int scaleStride = stride;
-    int costMultiplier = 1;
+    int costShift = 0;
+    int sizeIdx = log2TrSize - 2;
 
     if (tuSize > 32)
     {
@@ -137,7 +139,8 @@
 
         scaleTuSize = 32;
         scaleStride = 32;
-        costMultiplier = 4;
+        costShift = 2;
+        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
 
         // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
         above         = aboveScale;
@@ -146,7 +149,6 @@
         leftFiltered  = leftScale;
     }
 
-    int sizeIdx = g_convertToBit[scaleTuSize];
     pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
 
     uint32_t preds[3];
@@ -157,7 +159,7 @@
 
     // DC
     primitives.intra_pred[sizeIdx][DC_IDX](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
-    bsad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
+    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
     bmode = mode = DC_IDX;
     bbits = !(mpms & ((uint64_t)1 << mode)) ? rbits : xModeBitsIntra(cu, mode, partOffset, depth);
     bcost = m_rdCost.calcRdSADCost(bsad, bbits);
@@ -173,7 +175,7 @@
 
     // PLANAR
     primitives.intra_pred[sizeIdx][PLANAR_IDX](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
-    sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
+    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
     mode = PLANAR_IDX;
     bits = !(mpms & ((uint64_t)1 << mode)) ? rbits : xModeBitsIntra(cu, mode, partOffset, depth);
     cost = m_rdCost.calcRdSADCost(sad, bits);
@@ -189,7 +191,7 @@
         bool modeHor = (mode < 18);
         pixel *cmp = (modeHor ? buf_trans : fenc);
         intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
-        sad  = costMultiplier * sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize);
+        sad  = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
         bits = !(mpms & ((uint64_t)1 << mode)) ? rbits : xModeBitsIntra(cu, mode, partOffset, depth);
         cost = m_rdCost.calcRdSADCost(sad, bits);
         COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
@@ -216,7 +218,7 @@
     outTempCU->m_totalBits = 0;
     if (predInterSearch(outTempCU, outPredYuv, bUseMRG, false))
     {
-        int sizeIdx = g_convertToBit[outTempCU->getCUSize(0)];
+        int sizeIdx = outTempCU->getLog2CUSize(0) - 2;
         uint32_t distortion = primitives.sa8d[sizeIdx](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
                                                        outPredYuv->getLumaAddr(), outPredYuv->getStride());
         outTempCU->m_totalDistortion = distortion;
@@ -248,7 +250,7 @@
     outBestCU->setPredModeSubParts(MODE_INTER, 0, depth);
     outBestCU->setMergeFlag(0, true);
 
-    int sizeIdx = g_convertToBit[outTempCU->getCUSize(0)];
+    int sizeIdx = outTempCU->getLog2CUSize(0) - 2;
     int bestMergeCand = -1;
 
     for (uint32_t mergeCand = 0; mergeCand < maxNumMergeCand; ++mergeCand)
@@ -353,10 +355,11 @@
     TComSlice* slice = outTempCU->getSlice();
     if (!bInsidePicture)
     {
+        int cuSize = 1 << outTempCU->getLog2CUSize(0);
         uint32_t lpelx = outTempCU->getCUPelX();
         uint32_t tpely = outTempCU->getCUPelY();
-        uint32_t rpelx = lpelx + outTempCU->getCUSize(0);
-        uint32_t bpely = tpely + outTempCU->getCUSize(0);
+        uint32_t rpelx = lpelx + cuSize;
+        uint32_t bpely = tpely + cuSize;
         bInsidePicture = (rpelx <= slice->getSPS()->getPicWidthInLumaSamples() &&
                           bpely <= slice->getSPS()->getPicHeightInLumaSamples());
     }
@@ -555,7 +558,7 @@
                         for (int partIdx = 0; partIdx < numPart; partIdx++)
                             motionCompensation(outBestCU, m_bestPredYuv[depth], REF_PIC_LIST_X, partIdx, false, true);
 
-                        m_tmpResiYuv[depth]->subtract(m_origYuv[depth], m_bestPredYuv[depth], outBestCU->getCUSize(0));
+                        m_tmpResiYuv[depth]->subtract(m_origYuv[depth], m_bestPredYuv[depth], outBestCU->getLog2CUSize(0));
                         generateCoeffRecon(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth], false);
                     }
                     else
@@ -851,7 +854,7 @@
             uint32_t src2stride = m_bestPredYuv[0]->getStride();
             uint32_t src1stride = m_origYuv[0]->getStride();
             uint32_t dststride = m_tmpResiYuv[depth]->m_width;
-            int part = partitionFromSize(cu->getCUSize(0));
+            int part = partitionFromLog2Size(cu->getLog2CUSize(0));
             primitives.luma_sub_ps[part](dst, dststride, src1, src2, src1stride, src2stride);
 
             src2 = m_bestPredYuv[0]->getCbAddr(absPartIdx);
@@ -910,7 +913,7 @@
 
         // Generate Recon
         TComPicYuv* rec = pic->getPicYuvRec();
-        int part = partitionFromSize(cu->getCUSize(0));
+        int part = partitionFromLog2Size(cu->getLog2CUSize(0));
         pixel* src = m_bestPredYuv[0]->getLumaAddr(absPartIdx);
         pixel* dst = rec->getLumaAddr(cu->getAddr(), absPartIdx);
         uint32_t srcstride = m_bestPredYuv[0]->getStride();
diff -r 6055baa75085 -r fa683df9621e source/encoder/cturow.h
--- a/source/encoder/cturow.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/encoder/cturow.h	Mon Jul 14 14:50:34 2014 +0900
@@ -61,7 +61,7 @@
 
     SBac            m_sbacCoder;
     SBac            m_bufferSbacCoder;
-    SBac            m_rdSbacCoders[MAX_CU_DEPTH + 1][CI_NUM];
+    SBac            m_rdSbacCoders[MAX_FULL_DEPTH + 1][CI_NUM];
 
     // to compute stats for 2 pass
     double          m_iCuCnt;
diff -r 6055baa75085 -r fa683df9621e source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/encoder/encoder.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -1063,16 +1063,10 @@
     sps->setMaxCUSize(g_maxCUSize);
     sps->setMaxCUDepth(g_maxCUDepth);
 
-    int minCUSize = sps->getMaxCUSize() >> (sps->getMaxCUDepth() - g_addCUDepth);
-    int log2MinCUSize = 0;
-    while (minCUSize > 1)
-    {
-        minCUSize >>= 1;
-        log2MinCUSize++;
-    }
+    int log2MinCUSize = g_maxLog2CUSize - (g_maxCUDepth - g_addCUDepth);
 
     sps->setLog2MinCodingBlockSize(log2MinCUSize);
-    sps->setLog2DiffMaxMinCodingBlockSize(sps->getMaxCUDepth() - g_addCUDepth);
+    sps->setLog2DiffMaxMinCodingBlockSize(g_maxCUDepth - g_addCUDepth);
 
     sps->setQuadtreeTULog2MaxSize(m_quadtreeTULog2MaxSize);
     sps->setQuadtreeTULog2MinSize(m_quadtreeTULog2MinSize);
@@ -1219,7 +1213,8 @@
 
     setThreadPool(ThreadPool::allocThreadPool(p->poolNumThreads));
     int poolThreadCount = ThreadPool::getThreadPool()->getThreadCount();
-    int rows = (p->sourceHeight + p->maxCUSize - 1) / p->maxCUSize;
+    uint32_t maxLog2CUSize = g_convertToBit[p->maxCUSize] + 2;
+    int rows = (p->sourceHeight + p->maxCUSize - 1) >> maxLog2CUSize;
 
     if (p->frameNumThreads == 0)
     {
@@ -1362,7 +1357,7 @@
 
     //====== Coding Tools ========
 
-    uint32_t tuQTMaxLog2Size = g_convertToBit[p->maxCUSize] + 2 - 1;
+    uint32_t tuQTMaxLog2Size = maxLog2CUSize - 1;
     m_quadtreeTULog2MaxSize = tuQTMaxLog2Size;
     uint32_t tuQTMinLog2Size = 2; //log2(4)
     m_quadtreeTULog2MinSize = tuQTMinLog2Size;
diff -r 6055baa75085 -r fa683df9621e source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/encoder/entropy.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -68,10 +68,10 @@
 }
 
 void SBac::encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx,
-                              uint32_t absPartIdxStep, uint32_t depth, uint32_t tuSize, uint32_t trIdx, bool& bCodeDQP)
+                              uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, bool& bCodeDQP)
 {
     const bool subdiv = cu->getTransformIdx(absPartIdx) + cu->getDepth(absPartIdx) > (uint8_t)depth;
-    const uint32_t log2TrSize = cu->getSlice()->getSPS()->getLog2MaxCodingBlockSize() - depth;
+    // log2TrSize is now passed in by the caller; it is no longer derived here from the SPS max coding block size and depth.
     uint32_t hChromaShift = cu->getHorzChromaShift();
     uint32_t vChromaShift = cu->getVertChromaShift();
     uint32_t cbfY = cu->getCbf(absPartIdx, TEXT_LUMA, trIdx);
@@ -133,12 +133,13 @@
     const bool bFirstCbfOfCU = trDepthCurr == 0;
 
     bool mCodeAll = true;
-    const uint32_t numPels = (tuSize * tuSize) >> (hChromaShift + vChromaShift);
+    const uint32_t numPels = 1 << (log2TrSize * 2 - hChromaShift - vChromaShift);
     if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
         mCodeAll = false;
 
     if (bFirstCbfOfCU || mCodeAll)
     {
+        uint32_t tuSize = 1 << log2TrSize;
         if (bFirstCbfOfCU || cu->getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1))
             codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_U, trDepthCurr, (subdiv == 0));
         if (bFirstCbfOfCU || cu->getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1))
@@ -152,30 +153,30 @@
 
     if (subdiv)
     {
-        tuSize >>= 1;
-        uint32_t numCoeff  = tuSize * tuSize;
+        log2TrSize--;
+        uint32_t numCoeff  = 1 << (log2TrSize * 2);
         uint32_t numCoeffC = (numCoeff >> (hChromaShift + vChromaShift));
         trIdx++;
         ++depth;
         absPartIdxStep >>= 2;
         const uint32_t partNum = cu->getPic()->getNumPartInCU() >> (depth << 1);
 
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, tuSize, trIdx, bCodeDQP);
+        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP);
 
         absPartIdx += partNum;
         offsetLuma += numCoeff;
         offsetChroma += numCoeffC;
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, tuSize, trIdx, bCodeDQP);
+        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP);
 
         absPartIdx += partNum;
         offsetLuma += numCoeff;
         offsetChroma += numCoeffC;
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, tuSize, trIdx, bCodeDQP);
+        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP);
 
         absPartIdx += partNum;
         offsetLuma += numCoeff;
         offsetChroma += numCoeffC;
-        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, tuSize, trIdx, bCodeDQP);
+        encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP);
     }
     else
     {
@@ -324,11 +325,8 @@
     codeRefFrmIdx(cu, absPartIdx, list);
 }
 
-void SBac::codeCoeff(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, uint32_t cuSize, bool& bCodeDQP)
+void SBac::codeCoeff(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP)
 {
-    uint32_t lumaOffset   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
-    uint32_t chromaOffset = lumaOffset >> (cu->getHorzChromaShift() + cu->getVertChromaShift());
-
     if (!cu->isIntra(absPartIdx))
     {
         if (!(cu->getMergeFlag(absPartIdx) && cu->getPartitionSize(absPartIdx) == SIZE_2Nx2N))
@@ -337,9 +335,12 @@
             return;
     }
 
+    uint32_t log2CUSize   = cu->getLog2CUSize(absPartIdx);
+    uint32_t lumaOffset   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
+    uint32_t chromaOffset = lumaOffset >> (cu->getHorzChromaShift() + cu->getVertChromaShift());
     uint32_t absPartIdxStep = cu->getPic()->getNumPartInCU() >> (depth << 1);
     CoeffCodeState state;
-    encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, cuSize, 0, bCodeDQP);
+    encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP);
 }
 
 void SBac::codeSaoOffset(SaoLcuParam* saoLcuParam, uint32_t compIdx)
@@ -1711,7 +1712,7 @@
     case SIZE_nRx2N:
         encodeBin(0, m_contextModels[OFF_PART_SIZE_CTX + 0]);
         encodeBin(0, m_contextModels[OFF_PART_SIZE_CTX + 1]);
-        if (depth == g_maxCUDepth - g_addCUDepth && !(cu->getCUSize(absPartIdx) == 8))
+        if (depth == g_maxCUDepth - g_addCUDepth && !(cu->getLog2CUSize(absPartIdx) == 3))
             encodeBin(1, m_contextModels[OFF_PART_SIZE_CTX + 2]);
         if (cu->getSlice()->getSPS()->getAMPAcc(depth))
         {
@@ -1722,7 +1723,7 @@
         break;
 
     case SIZE_NxN:
-        if (depth == g_maxCUDepth - g_addCUDepth && !(cu->getCUSize(absPartIdx) == 8))
+        if (depth == g_maxCUDepth - g_addCUDepth && !(cu->getLog2CUSize(absPartIdx) == 3))
         {
             encodeBin(0, m_contextModels[OFF_PART_SIZE_CTX + 0]);
             encodeBin(0, m_contextModels[OFF_PART_SIZE_CTX + 1]);
@@ -1892,7 +1893,7 @@
     const uint32_t interDir = cu->getInterDir(absPartIdx) - 1;
     const uint32_t ctx      = cu->getCtxInterDir(absPartIdx);
 
-    if (cu->getPartitionSize(absPartIdx) == SIZE_2Nx2N || cu->getCUSize(absPartIdx) != 8)
+    if (cu->getPartitionSize(absPartIdx) == SIZE_2Nx2N || cu->getLog2CUSize(absPartIdx) != 3)
         encodeBin(interDir == 2 ? 1 : 0, m_contextModels[OFF_INTER_DIR_CTX + ctx]);
     if (interDir < 2)
         encodeBin(interDir, m_contextModels[OFF_INTER_DIR_CTX + 4]);
@@ -2326,14 +2327,14 @@
 }
 
 /* estimate bit cost for CBP, significant map and significant coefficients */
-void SBac::estBit(EstBitsSbac* estBitsSbac, int trSize, TextType ttype)
+void SBac::estBit(EstBitsSbac* estBitsSbac, uint32_t log2TrSize, TextType ttype)
 {
     estCBFBit(estBitsSbac);
 
     estSignificantCoeffGroupMapBit(estBitsSbac, ttype);
 
     // encode significance map
-    estSignificantMapBit(estBitsSbac, trSize, ttype);
+    estSignificantMapBit(estBitsSbac, log2TrSize, ttype);
 
     // encode significant coefficients
     estSignificantCoefficientsBit(estBitsSbac, ttype);
@@ -2371,16 +2372,16 @@
 }
 
 /* estimate SAMBAC bit cost for significant coefficient map */
-void SBac::estSignificantMapBit(EstBitsSbac* estBitsSbac, int trSize, TextType ttype)
+void SBac::estSignificantMapBit(EstBitsSbac* estBitsSbac, uint32_t log2TrSize, TextType ttype)
 {
     int firstCtx = 1, numCtx = 8;
 
-    if (trSize >= 16)
+    if (log2TrSize >= 4)
     {
         firstCtx = (ttype == TEXT_LUMA) ? 21 : 12;
         numCtx = (ttype == TEXT_LUMA) ? 6 : 3;
     }
-    else if (trSize == 8)
+    else if (log2TrSize == 3)
     {
         firstCtx = 9;
         numCtx = (ttype == TEXT_LUMA) ? 12 : 3;
@@ -2418,7 +2419,6 @@
     }
     int bitsX = 0, bitsY = 0;
 
-    uint32_t log2TrSize = g_convertToBit[trSize] + 2;
     int blkSizeOffset = ttype ? NUM_CTX_LAST_FLAG_XY_LUMA : ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2));
     int ctxShift = ttype ? log2TrSize - 2 : ((log2TrSize + 1) >> 2);
     uint32_t maxGroupIdx = log2TrSize * 2 - 1;
diff -r 6055baa75085 -r fa683df9621e source/encoder/entropy.h
--- a/source/encoder/entropy.h	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/encoder/entropy.h	Mon Jul 14 14:50:34 2014 +0900
@@ -155,17 +155,17 @@
     void codeQtCbf(TComDataCU* cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth);
     void codeQtCbfZero(TComDataCU* cu, TextType ttype, uint32_t trDepth);
     void codeQtRootCbfZero(TComDataCU* cu);
-    void codeCoeff(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, uint32_t cuSize, bool& bCodeDQP);
+    void codeCoeff(TComDataCU* cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP);
     void codeCoeffNxN(TComDataCU* cu, coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
 
     void codeIntraDirLumaAng(TComDataCU* cu, uint32_t absPartIdx, bool isMultiple);
     void codeIntraDirChroma(TComDataCU* cu, uint32_t absPartIdx);
 
     // RDO functions
-    void estBit(EstBitsSbac* estBitsSbac, int trSize, TextType ttype);
+    void estBit(EstBitsSbac* estBitsSbac, uint32_t log2TrSize, TextType ttype);
     void estCBFBit(EstBitsSbac* estBitsSbac);
     void estSignificantCoeffGroupMapBit(EstBitsSbac* estBitsSbac, TextType ttype);
-    void estSignificantMapBit(EstBitsSbac* estBitsSbac, int trSize, TextType ttype);
+    void estSignificantMapBit(EstBitsSbac* estBitsSbac, uint32_t log2TrSize, TextType ttype);
     void estSignificantCoefficientsBit(EstBitsSbac* estBitsSbac, TextType ttype);
 
 private:
@@ -215,7 +215,7 @@
         uint32_t bakAbsPartIdxCU;
     };
 
-    void encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t tuSize, uint32_t uiTrIdx, bool& bCodeDQP);
+    void encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t uiTrIdx, bool& bCodeDQP);
 
     void copyFrom(SBac& src);
     void copyContextsFrom(SBac& src);
diff -r 6055baa75085 -r fa683df9621e source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Jul 14 10:53:01 2014 +0530
+++ b/source/encoder/slicetype.cpp	Mon Jul 14 14:50:34 2014 +0900
@@ -1605,7 +1605,7 @@
     }
     if (!fenc->bIntraCalculated)
     {
-        int sizeIdx = g_convertToBit[cuSize]; // partition size
+        const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
 
         pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
         pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
@@ -1653,7 +1653,7 @@
         // calculate 35 satd costs, keep least cost
         ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
         primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
-        pixelcmp_t satd = primitives.satd[partitionFromSize(cuSize)];
+        pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
         int icost = m_me.COST_MAX, cost;
         for (uint32_t mode = 0; mode < 35; mode++)
         {