[x265] more use CUGeom

Sat Jan 17 10:36:30 CET 2015

# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1421487172 -32400
#      Sat Jan 17 18:32:52 2015 +0900
# Node ID 270c9786681069d34c8eb709b74412843e37373a
# Parent  65e71f08c55a0e9303d51691b3435cb5fdf6c6a1
more use CUGeom

diff -r 65e71f08c55a -r 270c97866810 source/common/cudata.cpp

--- a/source/common/cudata.cpp	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/common/cudata.cpp	Sat Jan 17 18:32:52 2015 +0900
@@ -57,51 +57,51 @@
 void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
 
 /* Check whether 2 addresses point to the same column */
-inline bool isEqualCol(int addrA, int addrB, int numUnitsPerRow)
+inline bool isEqualCol(int addrA, int addrB, int numUnits)
 {
-    // addrA % numUnitsPerRow == addrB % numUnitsPerRow
-    return ((addrA ^ addrB) &  (numUnitsPerRow - 1)) == 0;
+    // addrA % numUnits == addrB % numUnits
+    return ((addrA ^ addrB) &  (numUnits - 1)) == 0;
 }
 
 /* Check whether 2 addresses point to the same row */
-inline bool isEqualRow(int addrA, int addrB, int numUnitsPerRow)
+inline bool isEqualRow(int addrA, int addrB, int numUnits)
 {
-    // addrA / numUnitsPerRow == addrB / numUnitsPerRow
-    return ((addrA ^ addrB) & ~(numUnitsPerRow - 1)) == 0;
+    // addrA / numUnits == addrB / numUnits
+    return ((addrA ^ addrB) & ~(numUnits - 1)) == 0;
 }
 
 /* Check whether 2 addresses point to the same row or column */
-inline bool isEqualRowOrCol(int addrA, int addrB, int numUnitsPerRow)
+inline bool isEqualRowOrCol(int addrA, int addrB, int numUnits)
 {
-    return isEqualCol(addrA, addrB, numUnitsPerRow) | isEqualRow(addrA, addrB, numUnitsPerRow);
+    return isEqualCol(addrA, addrB, numUnits) | isEqualRow(addrA, addrB, numUnits);
 }
 
 /* Check whether one address points to the first column */
-inline bool isZeroCol(int addr, int numUnitsPerRow)
+inline bool isZeroCol(int addr, int numUnits)
 {
-    // addr % numUnitsPerRow == 0
-    return (addr & (numUnitsPerRow - 1)) == 0;
+    // addr % numUnits == 0
+    return (addr & (numUnits - 1)) == 0;
 }
 
 /* Check whether one address points to the first row */
-inline bool isZeroRow(int addr, int numUnitsPerRow)
+inline bool isZeroRow(int addr, int numUnits)
 {
-    // addr / numUnitsPerRow == 0
-    return (addr & ~(numUnitsPerRow - 1)) == 0;
+    // addr / numUnits == 0
+    return (addr & ~(numUnits - 1)) == 0;
 }
 
 /* Check whether one address points to a column whose index is smaller than a given value */
-inline bool lessThanCol(int addr, int val, int numUnitsPerRow)
+inline bool lessThanCol(int addr, int val, int numUnits)
 {
-    // addr % numUnitsPerRow < val
-    return (addr & (numUnitsPerRow - 1)) < val;
+    // addr % numUnits < val
+    return (addr & (numUnits - 1)) < val;
 }
 
 /* Check whether one address points to a row whose index is smaller than a given value */
-inline bool lessThanRow(int addr, int val, int numUnitsPerRow)
+inline bool lessThanRow(int addr, int val, int numUnits)
 {
-    // addr / numUnitsPerRow < val
-    return addr < val * numUnitsPerRow;
+    // addr / numUnits < val
+    return addr < val * numUnits;
 }
 
 inline MV scaleMv(MV mv, int scale)
@@ -1533,17 +1533,17 @@
             m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
         {
             uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
-            uint32_t numPartInCUSize = s_numPartInCUSize;
-            bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU
-            bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row    of CTU
+            uint32_t numUnits = s_numPartInCUSize;
+            bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1, numUnits); // is not at the last column of CTU
+            bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1, numUnits); // is not at the last row    of CTU
 
             if (bNotLastCol && bNotLastRow)
             {
-                absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1];
+                absPartAddr = g_rasterToZscan[absPartIdxRB + numUnits + 1];
                 ctuIdx = m_cuAddr;
             }
             else if (bNotLastCol)
-                absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)];
+                absPartAddr = g_rasterToZscan[(absPartIdxRB + numUnits + 1) & (numUnits - 1)];
             else if (bNotLastRow)
             {
                 absPartAddr = g_rasterToZscan[absPartIdxRB + 1];
@@ -1760,17 +1760,17 @@
             m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
         {
             uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
-            uint32_t numPartInCUSize = s_numPartInCUSize;
-            bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU
-            bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row    of CTU
+            uint32_t numUnits = s_numPartInCUSize;
+            bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1, numUnits); // is not at the last column of CTU
+            bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1, numUnits); // is not at the last row    of CTU
 
             if (bNotLastCol && bNotLastRow)
             {
-                absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1];
+                absPartAddr = g_rasterToZscan[absPartIdxRB + numUnits + 1];
                 ctuIdx = m_cuAddr;
             }
             else if (bNotLastCol)
-                absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)];
+                absPartAddr = g_rasterToZscan[(absPartIdxRB + numUnits + 1) & (numUnits - 1)];
             else if (bNotLastRow)
             {
                 absPartAddr = g_rasterToZscan[absPartIdxRB + 1];
diff -r 65e71f08c55a -r 270c97866810 source/common/deblock.cpp
--- a/source/common/deblock.cpp	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/common/deblock.cpp	Sat Jan 17 18:32:52 2015 +0900
@@ -33,13 +33,13 @@
 #define DEBLOCK_SMALLEST_BLOCK  8
 #define DEFAULT_INTRA_TC_OFFSET 2
 
-void Deblock::deblockCTU(const CUData* ctu, int32_t dir)
+void Deblock::deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir)
 {
     uint8_t blockStrength[MAX_NUM_PARTITIONS];
 
-    memset(blockStrength, 0, sizeof(uint8_t) * m_numPartitions);
+    memset(blockStrength, 0, sizeof(uint8_t) * cuGeom.numPartitions);
 
-    deblockCU(ctu, 0, 0, dir, blockStrength);
+    deblockCU(ctu, cuGeom, dir, blockStrength);
 }
 
 static inline uint8_t bsCuEdge(const CUData* cu, uint32_t absPartIdx, int32_t dir)
@@ -68,32 +68,31 @@
 
 /* Deblocking filter process in CU-based (the same function as conventional's)
  * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */
-void Deblock::deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[])
+void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[])
 {
+    uint32_t absPartIdx = cuGeom.encodeIdx;
+    uint32_t depth = cuGeom.depth;
     if (cu->m_predMode[absPartIdx] == MODE_NONE)
         return;
 
-    uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
-
-    const SPS& sps = *cu->m_slice->m_sps;
-
     if (cu->m_cuDepth[absPartIdx] > depth)
     {
-        uint32_t qNumParts   = curNumParts >> 2;
-        uint32_t xmax = sps.picWidthInLumaSamples  - cu->m_cuPelX;
-        uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY;
-        for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
-            if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
-                deblockCU(cu, absPartIdx, depth + 1, dir, blockStrength);
+        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                deblockCU(cu, childGeom, dir, blockStrength);
+        }
         return;
     }
 
-    const uint32_t numUnits  = sps.numPartInCUSize >> depth;
+    uint32_t numUnits = 1 << (cuGeom.log2CUSize - LOG2_UNIT_SIZE);
     setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits);
-    setEdgefilterTU(cu, absPartIdx, depth, dir, blockStrength);
+    setEdgefilterTU(cu, absPartIdx, 0, dir, blockStrength);
     setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
 
-    for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++)
+    uint32_t numParts = cuGeom.numPartitions;
+    for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + numParts; partIdx++)
     {
         uint32_t bsCheck = !(partIdx & (1 << dir));
 
@@ -102,12 +101,11 @@
     }
 
     const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE;
-    uint32_t sizeInPU = sps.numPartInCUSize >> depth;
     uint32_t shiftFactor = (dir == EDGE_VER) ? cu->m_hChromaShift : cu->m_vChromaShift;
     uint32_t chromaMask = ((DEBLOCK_SMALLEST_BLOCK << shiftFactor) >> LOG2_UNIT_SIZE) - 1;
     uint32_t e0 = (dir == EDGE_VER ? g_zscanToPelX[absPartIdx] : g_zscanToPelY[absPartIdx]) >> LOG2_UNIT_SIZE;
         
-    for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
+    for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
     {
         edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
         if (!((e0 + e) & chromaMask))
@@ -117,12 +115,12 @@
 
 static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
 {
-    uint32_t numPartInCUSize = cu->m_slice->m_sps->numPartInCUSize;
+    uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize;
 
     if (dir)
-        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numPartInCUSize + baseUnitIdx];
+        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numUnits + baseUnitIdx];
     else
-        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numPartInCUSize + edgeIdx];
+        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numUnits + edgeIdx];
 }
 
 void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
@@ -135,19 +133,18 @@
     }
 }
 
-void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[])
+void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[])
 {
-    if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth)
+    uint32_t log2TrSize = cu->m_log2CUSize[absPartIdx] - tuDepth;
+    if (cu->m_tuDepth[absPartIdx] > tuDepth)
     {
-        const uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
-        const uint32_t qNumParts   = curNumParts >> 2;
-
-        for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
-            setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockStrength);
+        uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE - 1) * 2;
+        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+            setEdgefilterTU(cu, absPartIdx, tuDepth + 1, dir, blockStrength);
         return;
     }
 
-    uint32_t numUnits  = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
+    uint32_t numUnits  = 1 << (log2TrSize - LOG2_UNIT_SIZE);
     setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits);
 }
 
@@ -501,7 +498,6 @@
     srcChroma[1] = reconPic->m_picOrg[2] + srcOffset;
 
     uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
-
     for (uint32_t idx = 0; idx < numUnits; idx++)
     {
         uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift);
diff -r 65e71f08c55a -r 270c97866810 source/common/deblock.h
--- a/source/common/deblock.h	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/common/deblock.h	Sat Jan 17 18:32:52 2015 +0900
@@ -30,27 +30,22 @@
 // private namespace
 
 class CUData;
+struct CUGeom;
 
 class Deblock
 {
 public:
     enum { EDGE_VER, EDGE_HOR };
 
-    uint32_t m_numPartitions;
-
-    Deblock() : m_numPartitions(0) {}
-
-    void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); }
-
-    void deblockCTU(const CUData* ctu, int32_t dir);
+    void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
 
 protected:
 
     // CU-level deblocking function
-    void deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[]);
+    void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
 
     // set filtering functions
-    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[]);
+    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
     void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
     void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
 
diff -r 65e71f08c55a -r 270c97866810 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/encoder/framefilter.cpp	Sat Jan 17 18:32:52 2015 +0900
@@ -63,8 +63,6 @@
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
     m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
 
-    m_deblock.init();
-
     if (m_param->bEnableSAO)
         if (!m_sao.create(m_param))
             m_param->bEnableSAO = 0;
@@ -96,22 +94,24 @@
 
     if (m_param->bEnableLoopFilter)
     {
+        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+
         for (uint32_t col = 0; col < numCols; col++)
         {
             uint32_t cuAddr = lineStartCUAddr + col;
             const CUData* ctu = encData.getPicCTU(cuAddr);
-
-            m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);
+            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
 
             if (col > 0)
             {
                 const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
-                m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
+                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
             }
         }
 
         const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
-        m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
+        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
     }
 
     // SAO
@@ -394,23 +394,24 @@
 }
 
 /* restore original YUV samples to recon after SAO (if lossless) */
-static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
 {
-    int size = g_maxLog2CUSize - depth - 2;
+    int size = cu->m_log2CUSize[absPartIdx] - 2;
+    uint32_t cuAddr = cu->m_cuAddr;
 
     PicYuv* reconPic = frame.m_reconPic;
     PicYuv* fencPic  = frame.m_fencPic;
 
-    pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
-    pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
+    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
+    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
 
     primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
    
-    pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
-    pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
+    pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
+    pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
 
-    pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
-    pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
+    pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
+    pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
 
     int csp = fencPic->m_picCsp;
     primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
@@ -418,34 +419,29 @@
 }
 
 /* Original YUV restoration for CU in lossless coding */
-static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
 {
-    if (cu->m_cuDepth[absPartIdx] > depth)
+    uint32_t absPartIdx = cuGeom.encodeIdx;
+    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
     {
-        /* TODO: this could use cuGeom.numPartition and flags */
-        uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
-        uint32_t qNumParts   = curNumParts >> 2;
-        uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples  - cu->m_cuPelX;
-        uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
-
-        /* process four split sub-cu at next depth */
-        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
+        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
-            if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
-                origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                origCUSampleRestoration(cu, childGeom, frame);
         }
-
         return;
     }
 
     // restore original YUV samples
     if (cu->m_tqBypass[absPartIdx])
-        restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
+        restoreOrigLosslessYuv(cu, frame, absPartIdx);
 }
 
 void FrameFilter::processSao(int row)
 {
-    SAOParam* saoParam = m_frame->m_encData->m_saoParam;
+    FrameData& encData = *m_frame->m_encData;
+    SAOParam* saoParam = encData.m_saoParam;
 
     if (saoParam->bSaoFlag[0])
         m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
@@ -456,12 +452,19 @@
         m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
     }
 
-    if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
+    if (encData.m_slice->m_pps->bTransquantBypassEnabled)
     {
-        uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
+        uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
         uint32_t lineStartCUAddr = row * numCols;
 
+        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
+        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+
         for (uint32_t col = 0; col < numCols; col++)
-            origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
+        {
+            uint32_t cuAddr = lineStartCUAddr + col;
+            const CUData* ctu = encData.getPicCTU(cuAddr);
+            origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frame);
+        }
     }
 }
diff -r 65e71f08c55a -r 270c97866810 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/encoder/framefilter.h	Sat Jan 17 18:32:52 2015 +0900
@@ -39,7 +39,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter
+class FrameFilter : public Deblock
 {
 public:
 
@@ -50,7 +50,6 @@
     int           m_vChromaShift;
     int           m_pad[2];
 
-    Deblock       m_deblock;
     SAO           m_sao;
     int           m_numRows;
     int           m_saoRowDelay;
diff -r 65e71f08c55a -r 270c97866810 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/encoder/search.cpp	Sat Jan 17 18:32:52 2015 +0900
@@ -63,6 +63,7 @@
 
 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
 {
+    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
     m_param = &param;
     m_bEnableRDOQ = param.rdLevel >= 4;
     m_bFrameParallel = param.frameNumThreads > 1;
@@ -81,9 +82,9 @@
      * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
     m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
 
-    uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
+    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
     uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    uint32_t numPartitions = NUM_CU_PARTITIONS;
+    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
 
     /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
@@ -167,9 +168,8 @@
 
 void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
 {
-    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
     uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
 
     if (!(log2TrSize - m_hChromaShift < 2))
     {
@@ -192,8 +192,7 @@
     if (!cu.getCbf(absPartIdx, ttype, tuDepth))
         return;
 
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
 
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
@@ -241,8 +240,8 @@
 void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t fullDepth  = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     uint32_t qtLayer    = log2TrSize - 2;
     uint32_t sizeIdx    = log2TrSize - 2;
     bool mightNotSplit  = log2TrSize <= depthRange[1];
@@ -317,7 +316,7 @@
                 m_entropyCoder.codePredMode(cu.m_predMode[0]);
             }
 
-            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
+            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
         }
         if (cu.m_partSize[0] == SIZE_2Nx2N)
         {
@@ -434,8 +433,8 @@
 
 void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
 {
-    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t fullDepth = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     uint32_t tuSize = 1 << log2TrSize;
 
     X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
@@ -528,7 +527,7 @@
                 m_entropyCoder.codePredMode(cu.m_predMode[0]);
             }
 
-            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
+            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
         }
         if (cu.m_partSize[0] == SIZE_2Nx2N)
         {
@@ -604,8 +603,8 @@
 void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t fullDepth  = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     bool     bCheckFull = log2TrSize <= depthRange[1];
 
     X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
@@ -675,8 +674,7 @@
 
 void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
 {
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
 
     if (tuDepth == cu.m_tuDepth[absPartIdx])
     {
@@ -709,9 +707,7 @@
 /* 4:2:2 post-TU split processing */
 void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
 {
-    uint32_t depth = cu.m_cuDepth[0];
-    uint32_t fullDepth = depth + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
 
     if (log2TrSize == 2)
     {
@@ -735,8 +731,7 @@
 uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
 
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
@@ -782,7 +777,7 @@
     const uint32_t sizeIdxC = log2TrSizeC - 2;
     uint32_t outDist = 0;
 
-    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
     TURecurse tuIterator(splitType, curPartNum, absPartIdx);
@@ -858,8 +853,8 @@
 uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
 {
     CUData& cu = mode.cu;
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t fullDepth  = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     const uint32_t log2TrSizeC = 2;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t outDist = 0;
@@ -872,7 +867,7 @@
     ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
     ALIGN_VAR_32(pixel,   tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
 
-    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
     TURecurse tuIterator(splitType, curPartNum, absPartIdx);
@@ -1006,9 +1001,8 @@
 
 void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
 {
-    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
     uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
-    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
 
     if (tuDepthL == tuDepth || log2TrSizeC == 2)
@@ -1075,7 +1069,7 @@
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
 
-    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
     TURecurse tuIterator(splitType, curPartNum, absPartIdx);
@@ -1184,13 +1178,13 @@
 void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
 {
     CUData& cu = intraMode.cu;
-    uint32_t depth = cu.m_cuDepth[0];
+    uint32_t depth = cuGeom.depth;
 
     cu.setPartSizeSubParts(SIZE_2Nx2N);
     cu.setPredModeSubParts(MODE_INTRA);
 
     const uint32_t initTuDepth = 0;
-    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
     uint32_t tuSize = 1 << log2TrSize;
     const uint32_t absPartIdx = 0;
 
@@ -1403,10 +1397,10 @@
     Yuv* predYuv = &intraMode.predYuv;
     const Yuv* fencYuv = intraMode.fencYuv;
 
-    uint32_t depth        = cu.m_cuDepth[0];
+    uint32_t depth        = cuGeom.depth;
     uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
     uint32_t numPU        = 1 << (2 * initTuDepth);
-    uint32_t log2TrSize   = cu.m_log2CUSize[0] - initTuDepth;
+    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
     uint32_t tuSize       = 1 << log2TrSize;
     uint32_t qNumParts    = cuGeom.numPartitions >> 2;
     uint32_t sizeIdx      = log2TrSize - 2;
@@ -1657,7 +1651,7 @@
         }
     }
 
-    cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
+    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
 }
 
 uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
@@ -1665,10 +1659,10 @@
     CUData& cu = intraMode.cu;
     Yuv& reconYuv = intraMode.reconYuv;
 
-    uint32_t depth       = cu.m_cuDepth[0];
+    uint32_t depth       = cuGeom.depth;
     uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
-    uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTuDepth;
-    uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
+    uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
+    uint32_t absPartStep = cuGeom.numPartitions;
     uint32_t totalDistortion = 0;
 
     int size = partitionFromLog2Size(log2TrSize);
@@ -2490,13 +2484,13 @@
     CUData& cu = interMode.cu;
     Yuv* reconYuv = &interMode.reconYuv;
     Yuv* predYuv = &interMode.predYuv;
-    ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
+    uint32_t depth = cuGeom.depth;
+    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
     const Yuv* fencYuv = interMode.fencYuv;
 
     X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
 
-    uint32_t log2CUSize = cu.m_log2CUSize[0];
-    uint32_t depth = cu.m_cuDepth[0];
+    uint32_t log2CUSize = cuGeom.log2CUSize;
     int sizeIdx = log2CUSize - 2;
 
     m_quant.setQPforQuant(interMode.cu);
@@ -2509,7 +2503,7 @@
     m_entropyCoder.load(m_rqt[depth].cur);
 
     Cost costs;
-    estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
+    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
 
     if (!cu.m_tqBypass[0])
     {
@@ -2541,7 +2535,7 @@
     }
 
     if (cu.getQtRootCbf(0))
-        saveResidualQTData(cu, *resiYuv, 0, depth);
+        saveResidualQTData(cu, *resiYuv, 0, 0);
 
     /* calculate signal bits for inter/merge/skip coded CU */
     m_entropyCoder.load(m_rqt[depth].cur);
@@ -2567,7 +2561,7 @@
             m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
         m_entropyCoder.codeSkipFlag(cu, 0);
         m_entropyCoder.codePredMode(cu.m_predMode[0]);
-        m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
+        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
         m_entropyCoder.codePredInfo(cu, 0);
         uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
 
@@ -2603,9 +2597,7 @@
 {
     uint32_t depth = cuGeom.depth + tuDepth;
     CUData& cu = mode.cu;
-    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
-
-    uint32_t log2TrSize = g_maxLog2CUSize - depth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
 
     bool bCheckFull = log2TrSize <= depthRange[1];
     if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
@@ -2625,7 +2617,7 @@
             bCodeChroma = !(absPartIdx & 3);
         }
 
-        uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+        uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
         uint32_t setCbf = 1 << tuDepth;
 
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
@@ -2633,7 +2625,7 @@
 
         uint32_t sizeIdx  = log2TrSize  - 2;
 
-        cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
+        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
 
         ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
@@ -2744,22 +2736,21 @@
         return m_rdCost.calcRdCost(dist, nullBits);
 }
 
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
 {
     CUData& cu = mode.cu;
-    uint32_t log2TrSize = g_maxLog2CUSize - depth;
+    uint32_t depth = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
     bool bSplitPresentFlag = bCheckSplit && bCheckFull;
 
-    if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
+    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
         bCheckFull = false;
 
     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
-    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
-
-    uint32_t tuDepth = depth - cu.m_cuDepth[0];
+
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
     bool bCodeChroma = true;
     uint32_t tuDepthC = tuDepth;
@@ -2787,7 +2778,7 @@
 
     uint32_t trSize = 1 << log2TrSize;
     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
-    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
+    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
     const Yuv* fencYuv = mode.fencYuv;
 
     // code full block
@@ -2804,7 +2795,7 @@
         bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= MAX_LOG2_TS_SIZE;
         bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
 
-        cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
+        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
 
         if (m_bEnableRDOQ)
@@ -3215,7 +3206,7 @@
         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
+            estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
             ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
@@ -3234,7 +3225,7 @@
         m_entropyCoder.load(m_rqt[depth].rqtRoot);
         m_entropyCoder.resetBits();
 
-        codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
+        codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
         uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
         splitCost.bits += splitCbfBits;
 
@@ -3307,14 +3298,12 @@
     outCosts.energy     += fullCost.energy;
 }
 
-void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
+void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
 {
-    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
     X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
 
-    const uint32_t tuDepth     = depth - cu.m_cuDepth[0];
-    const bool     bSubdiv     = tuDepth != cu.m_tuDepth[absPartIdx];
-    const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
+    const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
+    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
 
     if (!(log2TrSize - m_hChromaShift < 2))
     {
@@ -3337,102 +3326,19 @@
     {
         uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
         for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
-            codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
+            codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange);
     }
 }
 
-void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
+void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
 {
-    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
-    X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");
-
-    const uint32_t curTuDepth  = depth - cu.m_cuDepth[0];
-    const uint32_t tuDepth     = cu.m_tuDepth[absPartIdx];
-    const bool     bSubdiv     = curTuDepth != tuDepth;
-    const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
-
-    if (bSubdiv)
-    {
-        if (cu.getCbf(absPartIdx, ttype, curTuDepth))
-        {
-            uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
-            for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
-                encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
-        }
-        return;
-    }
-    else
-    {
-        const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
-        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
-        // Luma
-        const uint32_t qtLayer = log2TrSize - 2;
-        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
-        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
-
-        // Chroma
-        bool bCodeChroma = true;
-        uint32_t tuDepthC = tuDepth;
-        if (log2TrSize == 2 && m_csp != X265_CSP_I444)
-        {
-            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
-            log2TrSizeC++;
-            tuDepthC--;
-            bCodeChroma = !(absPartIdx & 3);
-        }
-
-        if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
-            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
-
-        if (bCodeChroma)
-        {
-            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
-            coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
-            coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
-
-            if (!splitIntoSubTUs)
-            {
-                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
-                    m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
-                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
-                    m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
-            }
-            else
-            {
-                uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
-                uint32_t subTUSize = 1 << (log2TrSizeC * 2);
-                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
-                {
-                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
-                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
-                }
-                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
-                {
-                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
-                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
-                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
-                }
-            }
-        }
-    }
-}
-
-void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
-{
-    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
-    const uint32_t curTrMode = depth - cu.m_cuDepth[0];
-    const uint32_t tuDepth   = cu.m_tuDepth[absPartIdx];
-    const uint32_t log2TrSize = g_maxLog2CUSize - depth;
-
-    if (curTrMode < tuDepth)
+    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
+
+    if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
         for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
-            saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
+            saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
         return;
     }
 
diff -r 65e71f08c55a -r 270c97866810 source/encoder/search.h
--- a/source/encoder/search.h	Sat Jan 17 10:12:34 2015 +0530
+++ b/source/encoder/search.h	Sat Jan 17 18:32:52 2015 +0900
@@ -201,7 +201,7 @@
     bool          m_bJobsQueued;
     void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
 
-    void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth);
+    void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
     uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
@@ -210,7 +210,7 @@
     uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
 
     void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
-    void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]);
+    void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
     void     codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
 
     struct Cost
@@ -225,9 +225,6 @@
     uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
 
-    // estimate bit cost of residual QT
-    void     encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, TextType ttype, const uint32_t depthRange[2]);
-
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
     void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
     void     codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);