[x265] more use CUGeom

Sat Jan 17 12:48:02 CET 2015

On 01/17, Satoshi Nakagawa wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1421487172 -32400
> #      Sat Jan 17 18:32:52 2015 +0900
> # Node ID 270c9786681069d34c8eb709b74412843e37373a
> # Parent  65e71f08c55a0e9303d51691b3435cb5fdf6c6a1
> more use CUGeom

looks good, queued

> diff -r 65e71f08c55a -r 270c97866810 source/common/cudata.cpp
> --- a/source/common/cudata.cpp	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/common/cudata.cpp	Sat Jan 17 18:32:52 2015 +0900
> @@ -57,51 +57,51 @@
>  void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
>  
>  /* Check whether 2 addresses point to the same column */
> -inline bool isEqualCol(int addrA, int addrB, int numUnitsPerRow)
> +inline bool isEqualCol(int addrA, int addrB, int numUnits)
>  {
> -    // addrA % numUnitsPerRow == addrB % numUnitsPerRow
> -    return ((addrA ^ addrB) &  (numUnitsPerRow - 1)) == 0;
> +    // addrA % numUnits == addrB % numUnits
> +    return ((addrA ^ addrB) &  (numUnits - 1)) == 0;
>  }
>  
>  /* Check whether 2 addresses point to the same row */
> -inline bool isEqualRow(int addrA, int addrB, int numUnitsPerRow)
> +inline bool isEqualRow(int addrA, int addrB, int numUnits)
>  {
> -    // addrA / numUnitsPerRow == addrB / numUnitsPerRow
> -    return ((addrA ^ addrB) & ~(numUnitsPerRow - 1)) == 0;
> +    // addrA / numUnits == addrB / numUnits
> +    return ((addrA ^ addrB) & ~(numUnits - 1)) == 0;
>  }
>  
>  /* Check whether 2 addresses point to the same row or column */
> -inline bool isEqualRowOrCol(int addrA, int addrB, int numUnitsPerRow)
> +inline bool isEqualRowOrCol(int addrA, int addrB, int numUnits)
>  {
> -    return isEqualCol(addrA, addrB, numUnitsPerRow) | isEqualRow(addrA, addrB, numUnitsPerRow);
> +    return isEqualCol(addrA, addrB, numUnits) | isEqualRow(addrA, addrB, numUnits);
>  }
>  
>  /* Check whether one address points to the first column */
> -inline bool isZeroCol(int addr, int numUnitsPerRow)
> +inline bool isZeroCol(int addr, int numUnits)
>  {
> -    // addr % numUnitsPerRow == 0
> -    return (addr & (numUnitsPerRow - 1)) == 0;
> +    // addr % numUnits == 0
> +    return (addr & (numUnits - 1)) == 0;
>  }
>  
>  /* Check whether one address points to the first row */
> -inline bool isZeroRow(int addr, int numUnitsPerRow)
> +inline bool isZeroRow(int addr, int numUnits)
>  {
> -    // addr / numUnitsPerRow == 0
> -    return (addr & ~(numUnitsPerRow - 1)) == 0;
> +    // addr / numUnits == 0
> +    return (addr & ~(numUnits - 1)) == 0;
>  }
>  
>  /* Check whether one address points to a column whose index is smaller than a given value */
> -inline bool lessThanCol(int addr, int val, int numUnitsPerRow)
> +inline bool lessThanCol(int addr, int val, int numUnits)
>  {
> -    // addr % numUnitsPerRow < val
> -    return (addr & (numUnitsPerRow - 1)) < val;
> +    // addr % numUnits < val
> +    return (addr & (numUnits - 1)) < val;
>  }
>  
>  /* Check whether one address points to a row whose index is smaller than a given value */
> -inline bool lessThanRow(int addr, int val, int numUnitsPerRow)
> +inline bool lessThanRow(int addr, int val, int numUnits)
>  {
> -    // addr / numUnitsPerRow < val
> -    return addr < val * numUnitsPerRow;
> +    // addr / numUnits < val
> +    return addr < val * numUnits;
>  }
>  
>  inline MV scaleMv(MV mv, int scale)
> @@ -1533,17 +1533,17 @@
>              m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
>          {
>              uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
> -            uint32_t numPartInCUSize = s_numPartInCUSize;
> -            bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU
> -            bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row    of CTU
> +            uint32_t numUnits = s_numPartInCUSize;
> +            bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1, numUnits); // is not at the last column of CTU
> +            bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1, numUnits); // is not at the last row    of CTU
>  
>              if (bNotLastCol && bNotLastRow)
>              {
> -                absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1];
> +                absPartAddr = g_rasterToZscan[absPartIdxRB + numUnits + 1];
>                  ctuIdx = m_cuAddr;
>              }
>              else if (bNotLastCol)
> -                absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)];
> +                absPartAddr = g_rasterToZscan[(absPartIdxRB + numUnits + 1) & (numUnits - 1)];
>              else if (bNotLastRow)
>              {
>                  absPartAddr = g_rasterToZscan[absPartIdxRB + 1];
> @@ -1760,17 +1760,17 @@
>              m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
>          {
>              uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];
> -            uint32_t numPartInCUSize = s_numPartInCUSize;
> -            bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU
> -            bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row    of CTU
> +            uint32_t numUnits = s_numPartInCUSize;
> +            bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1, numUnits); // is not at the last column of CTU
> +            bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1, numUnits); // is not at the last row    of CTU
>  
>              if (bNotLastCol && bNotLastRow)
>              {
> -                absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1];
> +                absPartAddr = g_rasterToZscan[absPartIdxRB + numUnits + 1];
>                  ctuIdx = m_cuAddr;
>              }
>              else if (bNotLastCol)
> -                absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)];
> +                absPartAddr = g_rasterToZscan[(absPartIdxRB + numUnits + 1) & (numUnits - 1)];
>              else if (bNotLastRow)
>              {
>                  absPartAddr = g_rasterToZscan[absPartIdxRB + 1];
> diff -r 65e71f08c55a -r 270c97866810 source/common/deblock.cpp
> --- a/source/common/deblock.cpp	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/common/deblock.cpp	Sat Jan 17 18:32:52 2015 +0900
> @@ -33,13 +33,13 @@
>  #define DEBLOCK_SMALLEST_BLOCK  8
>  #define DEFAULT_INTRA_TC_OFFSET 2
>  
> -void Deblock::deblockCTU(const CUData* ctu, int32_t dir)
> +void Deblock::deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir)
>  {
>      uint8_t blockStrength[MAX_NUM_PARTITIONS];
>  
> -    memset(blockStrength, 0, sizeof(uint8_t) * m_numPartitions);
> +    memset(blockStrength, 0, sizeof(uint8_t) * cuGeom.numPartitions);
>  
> -    deblockCU(ctu, 0, 0, dir, blockStrength);
> +    deblockCU(ctu, cuGeom, dir, blockStrength);
>  }
>  
>  static inline uint8_t bsCuEdge(const CUData* cu, uint32_t absPartIdx, int32_t dir)
> @@ -68,32 +68,31 @@
>  
>  /* Deblocking filter process in CU-based (the same function as conventional's)
>   * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */
> -void Deblock::deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[])
> +void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[])
>  {
> +    uint32_t absPartIdx = cuGeom.encodeIdx;
> +    uint32_t depth = cuGeom.depth;
>      if (cu->m_predMode[absPartIdx] == MODE_NONE)
>          return;
>  
> -    uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
> -
> -    const SPS& sps = *cu->m_slice->m_sps;
> -
>      if (cu->m_cuDepth[absPartIdx] > depth)
>      {
> -        uint32_t qNumParts   = curNumParts >> 2;
> -        uint32_t xmax = sps.picWidthInLumaSamples  - cu->m_cuPelX;
> -        uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY;
> -        for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
> -            if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
> -                deblockCU(cu, absPartIdx, depth + 1, dir, blockStrength);
> +        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
> +        {
> +            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
> +            if (childGeom.flags & CUGeom::PRESENT)
> +                deblockCU(cu, childGeom, dir, blockStrength);
> +        }
>          return;
>      }
>  
> -    const uint32_t numUnits  = sps.numPartInCUSize >> depth;
> +    uint32_t numUnits = 1 << (cuGeom.log2CUSize - LOG2_UNIT_SIZE);
>      setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits);
> -    setEdgefilterTU(cu, absPartIdx, depth, dir, blockStrength);
> +    setEdgefilterTU(cu, absPartIdx, 0, dir, blockStrength);
>      setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
>  
> -    for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++)
> +    uint32_t numParts = cuGeom.numPartitions;
> +    for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + numParts; partIdx++)
>      {
>          uint32_t bsCheck = !(partIdx & (1 << dir));
>  
> @@ -102,12 +101,11 @@
>      }
>  
>      const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE;
> -    uint32_t sizeInPU = sps.numPartInCUSize >> depth;
>      uint32_t shiftFactor = (dir == EDGE_VER) ? cu->m_hChromaShift : cu->m_vChromaShift;
>      uint32_t chromaMask = ((DEBLOCK_SMALLEST_BLOCK << shiftFactor) >> LOG2_UNIT_SIZE) - 1;
>      uint32_t e0 = (dir == EDGE_VER ? g_zscanToPelX[absPartIdx] : g_zscanToPelY[absPartIdx]) >> LOG2_UNIT_SIZE;
>          
> -    for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
> +    for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
>      {
>          edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
>          if (!((e0 + e) & chromaMask))
> @@ -117,12 +115,12 @@
>  
>  static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
>  {
> -    uint32_t numPartInCUSize = cu->m_slice->m_sps->numPartInCUSize;
> +    uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize;
>  
>      if (dir)
> -        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numPartInCUSize + baseUnitIdx];
> +        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numUnits + baseUnitIdx];
>      else
> -        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numPartInCUSize + edgeIdx];
> +        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numUnits + edgeIdx];
>  }
>  
>  void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
> @@ -135,19 +133,18 @@
>      }
>  }
>  
> -void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[])
> +void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[])
>  {
> -    if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth)
> +    uint32_t log2TrSize = cu->m_log2CUSize[absPartIdx] - tuDepth;
> +    if (cu->m_tuDepth[absPartIdx] > tuDepth)
>      {
> -        const uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
> -        const uint32_t qNumParts   = curNumParts >> 2;
> -
> -        for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
> -            setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockStrength);
> +        uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE - 1) * 2;
> +        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
> +            setEdgefilterTU(cu, absPartIdx, tuDepth + 1, dir, blockStrength);
>          return;
>      }
>  
> -    uint32_t numUnits  = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
> +    uint32_t numUnits  = 1 << (log2TrSize - LOG2_UNIT_SIZE);
>      setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits);
>  }
>  
> @@ -501,7 +498,6 @@
>      srcChroma[1] = reconPic->m_picOrg[2] + srcOffset;
>  
>      uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
> -
>      for (uint32_t idx = 0; idx < numUnits; idx++)
>      {
>          uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift);
> diff -r 65e71f08c55a -r 270c97866810 source/common/deblock.h
> --- a/source/common/deblock.h	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/common/deblock.h	Sat Jan 17 18:32:52 2015 +0900
> @@ -30,27 +30,22 @@
>  // private namespace
>  
>  class CUData;
> +struct CUGeom;
>  
>  class Deblock
>  {
>  public:
>      enum { EDGE_VER, EDGE_HOR };
>  
> -    uint32_t m_numPartitions;
> -
> -    Deblock() : m_numPartitions(0) {}
> -
> -    void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); }
> -
> -    void deblockCTU(const CUData* ctu, int32_t dir);
> +    void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
>  
>  protected:
>  
>      // CU-level deblocking function
> -    void deblockCU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockStrength[]);
> +    void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
>  
>      // set filtering functions
> -    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockStrength[]);
> +    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
>      void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
>      void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
>  
> diff -r 65e71f08c55a -r 270c97866810 source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/encoder/framefilter.cpp	Sat Jan 17 18:32:52 2015 +0900
> @@ -63,8 +63,6 @@
>      m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
>      m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
>  
> -    m_deblock.init();
> -
>      if (m_param->bEnableSAO)
>          if (!m_sao.create(m_param))
>              m_param->bEnableSAO = 0;
> @@ -96,22 +94,24 @@
>  
>      if (m_param->bEnableLoopFilter)
>      {
> +        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> +        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> +
>          for (uint32_t col = 0; col < numCols; col++)
>          {
>              uint32_t cuAddr = lineStartCUAddr + col;
>              const CUData* ctu = encData.getPicCTU(cuAddr);
> -
> -            m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);
> +            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
>  
>              if (col > 0)
>              {
>                  const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
> -                m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
> +                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
>              }
>          }
>  
>          const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
> -        m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
> +        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
>      }
>  
>      // SAO
> @@ -394,23 +394,24 @@
>  }
>  
>  /* restore original YUV samples to recon after SAO (if lossless) */
> -static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
> +static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
>  {
> -    int size = g_maxLog2CUSize - depth - 2;
> +    int size = cu->m_log2CUSize[absPartIdx] - 2;
> +    uint32_t cuAddr = cu->m_cuAddr;
>  
>      PicYuv* reconPic = frame.m_reconPic;
>      PicYuv* fencPic  = frame.m_fencPic;
>  
> -    pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
> -    pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
> +    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
> +    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
>  
>      primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
>     
> -    pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
> -    pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
> +    pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
> +    pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
>  
> -    pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
> -    pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
> +    pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
> +    pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
>  
>      int csp = fencPic->m_picCsp;
>      primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
> @@ -418,34 +419,29 @@
>  }
>  
>  /* Original YUV restoration for CU in lossless coding */
> -static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
> +static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
>  {
> -    if (cu->m_cuDepth[absPartIdx] > depth)
> +    uint32_t absPartIdx = cuGeom.encodeIdx;
> +    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
>      {
> -        /* TODO: this could use cuGeom.numPartition and flags */
> -        uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
> -        uint32_t qNumParts   = curNumParts >> 2;
> -        uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples  - cu->m_cuPelX;
> -        uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
> -
> -        /* process four split sub-cu at next depth */
> -        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
> +        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
>          {
> -            if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
> -                origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
> +            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
> +            if (childGeom.flags & CUGeom::PRESENT)
> +                origCUSampleRestoration(cu, childGeom, frame);
>          }
> -
>          return;
>      }
>  
>      // restore original YUV samples
>      if (cu->m_tqBypass[absPartIdx])
> -        restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
> +        restoreOrigLosslessYuv(cu, frame, absPartIdx);
>  }
>  
>  void FrameFilter::processSao(int row)
>  {
> -    SAOParam* saoParam = m_frame->m_encData->m_saoParam;
> +    FrameData& encData = *m_frame->m_encData;
> +    SAOParam* saoParam = encData.m_saoParam;
>  
>      if (saoParam->bSaoFlag[0])
>          m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
> @@ -456,12 +452,19 @@
>          m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
>      }
>  
> -    if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
> +    if (encData.m_slice->m_pps->bTransquantBypassEnabled)
>      {
> -        uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
> +        uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
>          uint32_t lineStartCUAddr = row * numCols;
>  
> +        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
> +        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
> +
>          for (uint32_t col = 0; col < numCols; col++)
> -            origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
> +        {
> +            uint32_t cuAddr = lineStartCUAddr + col;
> +            const CUData* ctu = encData.getPicCTU(cuAddr);
> +            origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frame);
> +        }
>      }
>  }
> diff -r 65e71f08c55a -r 270c97866810 source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/encoder/framefilter.h	Sat Jan 17 18:32:52 2015 +0900
> @@ -39,7 +39,7 @@
>  struct ThreadLocalData;
>  
>  // Manages the processing of a single frame loopfilter
> -class FrameFilter
> +class FrameFilter : public Deblock
>  {
>  public:
>  
> @@ -50,7 +50,6 @@
>      int           m_vChromaShift;
>      int           m_pad[2];
>  
> -    Deblock       m_deblock;
>      SAO           m_sao;
>      int           m_numRows;
>      int           m_saoRowDelay;
> diff -r 65e71f08c55a -r 270c97866810 source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/encoder/search.cpp	Sat Jan 17 18:32:52 2015 +0900
> @@ -63,6 +63,7 @@
>  
>  bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
>  {
> +    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
>      m_param = &param;
>      m_bEnableRDOQ = param.rdLevel >= 4;
>      m_bFrameParallel = param.frameNumThreads > 1;
> @@ -81,9 +82,9 @@
>       * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
>      m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
>  
> -    uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
> +    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
>      uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
> -    uint32_t numPartitions = NUM_CU_PARTITIONS;
> +    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
>  
>      /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
>       * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
> @@ -167,9 +168,8 @@
>  
>  void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
>  {
> -    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
>      uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>  
>      if (!(log2TrSize - m_hChromaShift < 2))
>      {
> @@ -192,8 +192,7 @@
>      if (!cu.getCbf(absPartIdx, ttype, tuDepth))
>          return;
>  
> -    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>  
>      if (tuDepth < cu.m_tuDepth[absPartIdx])
>      {
> @@ -241,8 +240,8 @@
>  void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
>  {
>      CUData& cu = mode.cu;
> -    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t fullDepth  = cuGeom.depth + tuDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>      uint32_t qtLayer    = log2TrSize - 2;
>      uint32_t sizeIdx    = log2TrSize - 2;
>      bool mightNotSplit  = log2TrSize <= depthRange[1];
> @@ -317,7 +316,7 @@
>                  m_entropyCoder.codePredMode(cu.m_predMode[0]);
>              }
>  
> -            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
> +            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
>          }
>          if (cu.m_partSize[0] == SIZE_2Nx2N)
>          {
> @@ -434,8 +433,8 @@
>  
>  void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
>  {
> -    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t fullDepth = cuGeom.depth + tuDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>      uint32_t tuSize = 1 << log2TrSize;
>  
>      X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
> @@ -528,7 +527,7 @@
>                  m_entropyCoder.codePredMode(cu.m_predMode[0]);
>              }
>  
> -            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
> +            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
>          }
>          if (cu.m_partSize[0] == SIZE_2Nx2N)
>          {
> @@ -604,8 +603,8 @@
>  void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
>  {
>      CUData& cu = mode.cu;
> -    uint32_t fullDepth  = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t fullDepth  = cuGeom.depth + tuDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>      bool     bCheckFull = log2TrSize <= depthRange[1];
>  
>      X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
> @@ -675,8 +674,7 @@
>  
>  void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
>  {
> -    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>  
>      if (tuDepth == cu.m_tuDepth[absPartIdx])
>      {
> @@ -709,9 +707,7 @@
>  /* 4:2:2 post-TU split processing */
>  void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
>  {
> -    uint32_t depth = cu.m_cuDepth[0];
> -    uint32_t fullDepth = depth + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>  
>      if (log2TrSize == 2)
>      {
> @@ -735,8 +731,7 @@
>  uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
>  {
>      CUData& cu = mode.cu;
> -    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>  
>      if (tuDepth < cu.m_tuDepth[absPartIdx])
>      {
> @@ -782,7 +777,7 @@
>      const uint32_t sizeIdxC = log2TrSizeC - 2;
>      uint32_t outDist = 0;
>  
> -    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
> +    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
>      const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
>  
>      TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> @@ -858,8 +853,8 @@
>  uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
>  {
>      CUData& cu = mode.cu;
> -    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t fullDepth  = cuGeom.depth + tuDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>      const uint32_t log2TrSizeC = 2;
>      uint32_t qtLayer = log2TrSize - 2;
>      uint32_t outDist = 0;
> @@ -872,7 +867,7 @@
>      ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
>      ALIGN_VAR_32(pixel,   tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
>  
> -    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
> +    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
>      const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
>  
>      TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> @@ -1006,9 +1001,8 @@
>  
>  void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
>  {
> -    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
>      uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
> -    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>      uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
>  
>      if (tuDepthL == tuDepth || log2TrSizeC == 2)
> @@ -1075,7 +1069,7 @@
>      uint32_t stride = mode.fencYuv->m_csize;
>      const uint32_t sizeIdxC = log2TrSizeC - 2;
>  
> -    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
> +    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
>      const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
>  
>      TURecurse tuIterator(splitType, curPartNum, absPartIdx);
> @@ -1184,13 +1178,13 @@
>  void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
>  {
>      CUData& cu = intraMode.cu;
> -    uint32_t depth = cu.m_cuDepth[0];
> +    uint32_t depth = cuGeom.depth;
>  
>      cu.setPartSizeSubParts(SIZE_2Nx2N);
>      cu.setPredModeSubParts(MODE_INTRA);
>  
>      const uint32_t initTuDepth = 0;
> -    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
>      uint32_t tuSize = 1 << log2TrSize;
>      const uint32_t absPartIdx = 0;
>  
> @@ -1403,10 +1397,10 @@
>      Yuv* predYuv = &intraMode.predYuv;
>      const Yuv* fencYuv = intraMode.fencYuv;
>  
> -    uint32_t depth        = cu.m_cuDepth[0];
> +    uint32_t depth        = cuGeom.depth;
>      uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
>      uint32_t numPU        = 1 << (2 * initTuDepth);
> -    uint32_t log2TrSize   = cu.m_log2CUSize[0] - initTuDepth;
> +    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
>      uint32_t tuSize       = 1 << log2TrSize;
>      uint32_t qNumParts    = cuGeom.numPartitions >> 2;
>      uint32_t sizeIdx      = log2TrSize - 2;
> @@ -1657,7 +1651,7 @@
>          }
>      }
>  
> -    cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
> +    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
>  }
>  
>  uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
> @@ -1665,10 +1659,10 @@
>      CUData& cu = intraMode.cu;
>      Yuv& reconYuv = intraMode.reconYuv;
>  
> -    uint32_t depth       = cu.m_cuDepth[0];
> +    uint32_t depth       = cuGeom.depth;
>      uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
> -    uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTuDepth;
> -    uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
> +    uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
> +    uint32_t absPartStep = cuGeom.numPartitions;
>      uint32_t totalDistortion = 0;
>  
>      int size = partitionFromLog2Size(log2TrSize);
> @@ -2490,13 +2484,13 @@
>      CUData& cu = interMode.cu;
>      Yuv* reconYuv = &interMode.reconYuv;
>      Yuv* predYuv = &interMode.predYuv;
> -    ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
> +    uint32_t depth = cuGeom.depth;
> +    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
>      const Yuv* fencYuv = interMode.fencYuv;
>  
>      X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
>  
> -    uint32_t log2CUSize = cu.m_log2CUSize[0];
> -    uint32_t depth = cu.m_cuDepth[0];
> +    uint32_t log2CUSize = cuGeom.log2CUSize;
>      int sizeIdx = log2CUSize - 2;
>  
>      m_quant.setQPforQuant(interMode.cu);
> @@ -2509,7 +2503,7 @@
>      m_entropyCoder.load(m_rqt[depth].cur);
>  
>      Cost costs;
> -    estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
> +    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
>  
>      if (!cu.m_tqBypass[0])
>      {
> @@ -2541,7 +2535,7 @@
>      }
>  
>      if (cu.getQtRootCbf(0))
> -        saveResidualQTData(cu, *resiYuv, 0, depth);
> +        saveResidualQTData(cu, *resiYuv, 0, 0);
>  
>      /* calculate signal bits for inter/merge/skip coded CU */
>      m_entropyCoder.load(m_rqt[depth].cur);
> @@ -2567,7 +2561,7 @@
>              m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
>          m_entropyCoder.codeSkipFlag(cu, 0);
>          m_entropyCoder.codePredMode(cu.m_predMode[0]);
> -        m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
> +        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
>          m_entropyCoder.codePredInfo(cu, 0);
>          uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
>  
> @@ -2603,9 +2597,7 @@
>  {
>      uint32_t depth = cuGeom.depth + tuDepth;
>      CUData& cu = mode.cu;
> -    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
> -
> -    uint32_t log2TrSize = g_maxLog2CUSize - depth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>  
>      bool bCheckFull = log2TrSize <= depthRange[1];
>      if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
> @@ -2625,7 +2617,7 @@
>              bCodeChroma = !(absPartIdx & 3);
>          }
>  
> -        uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
> +        uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
>          uint32_t setCbf = 1 << tuDepth;
>  
>          uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
> @@ -2633,7 +2625,7 @@
>  
>          uint32_t sizeIdx  = log2TrSize  - 2;
>  
> -        cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
> +        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
>          cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
>  
>          ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
> @@ -2744,22 +2736,21 @@
>          return m_rdCost.calcRdCost(dist, nullBits);
>  }
>  
> -void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
> +void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
>  {
>      CUData& cu = mode.cu;
> -    uint32_t log2TrSize = g_maxLog2CUSize - depth;
> +    uint32_t depth = cuGeom.depth + tuDepth;
> +    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
>  
>      bool bCheckSplit = log2TrSize > depthRange[0];
>      bool bCheckFull = log2TrSize <= depthRange[1];
>      bool bSplitPresentFlag = bCheckSplit && bCheckFull;
>  
> -    if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
> +    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
>          bCheckFull = false;
>  
>      X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
> -    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
> -
> -    uint32_t tuDepth = depth - cu.m_cuDepth[0];
> +
>      uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
>      bool bCodeChroma = true;
>      uint32_t tuDepthC = tuDepth;
> @@ -2787,7 +2778,7 @@
>  
>      uint32_t trSize = 1 << log2TrSize;
>      const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
> -    uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
> +    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
>      const Yuv* fencYuv = mode.fencYuv;
>  
>      // code full block
> @@ -2804,7 +2795,7 @@
>          bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= MAX_LOG2_TS_SIZE;
>          bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
>  
> -        cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
> +        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
>          cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
>  
>          if (m_bEnableRDOQ)
> @@ -3215,7 +3206,7 @@
>          uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
>          for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
>          {
> -            estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
> +            estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
>              ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
>              ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
>              vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
> @@ -3234,7 +3225,7 @@
>          m_entropyCoder.load(m_rqt[depth].rqtRoot);
>          m_entropyCoder.resetBits();
>  
> -        codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
> +        codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
>          uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
>          splitCost.bits += splitCbfBits;
>  
> @@ -3307,14 +3298,12 @@
>      outCosts.energy     += fullCost.energy;
>  }
>  
> -void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
> +void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
>  {
> -    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
>      X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
>  
> -    const uint32_t tuDepth     = depth - cu.m_cuDepth[0];
> -    const bool     bSubdiv     = tuDepth != cu.m_tuDepth[absPartIdx];
> -    const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
> +    const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
> +    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
>  
>      if (!(log2TrSize - m_hChromaShift < 2))
>      {
> @@ -3337,102 +3326,19 @@
>      {
>          uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
>          for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
> -            codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
> +            codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange);
>      }
>  }
>  
> -void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
> +void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
>  {
> -    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
> -    X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");
> -
> -    const uint32_t curTuDepth  = depth - cu.m_cuDepth[0];
> -    const uint32_t tuDepth     = cu.m_tuDepth[absPartIdx];
> -    const bool     bSubdiv     = curTuDepth != tuDepth;
> -    const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
> -
> -    if (bSubdiv)
> -    {
> -        if (cu.getCbf(absPartIdx, ttype, curTuDepth))
> -        {
> -            uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
> -            for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
> -                encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
> -        }
> -        return;
> -    }
> -    else
> -    {
> -        const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
> -        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
> -
> -        // Luma
> -        const uint32_t qtLayer = log2TrSize - 2;
> -        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
> -        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
> -
> -        // Chroma
> -        bool bCodeChroma = true;
> -        uint32_t tuDepthC = tuDepth;
> -        if (log2TrSize == 2 && m_csp != X265_CSP_I444)
> -        {
> -            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
> -            log2TrSizeC++;
> -            tuDepthC--;
> -            bCodeChroma = !(absPartIdx & 3);
> -        }
> -
> -        if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
> -            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
> -
> -        if (bCodeChroma)
> -        {
> -            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
> -            coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
> -            coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
> -
> -            if (!splitIntoSubTUs)
> -            {
> -                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
> -                    m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
> -                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
> -                    m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
> -            }
> -            else
> -            {
> -                uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
> -                uint32_t subTUSize = 1 << (log2TrSizeC * 2);
> -                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
> -                {
> -                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
> -                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
> -                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
> -                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
> -                }
> -                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
> -                {
> -                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
> -                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
> -                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
> -                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
> -                }
> -            }
> -        }
> -    }
> -}
> -
> -void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
> -{
> -    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
> -    const uint32_t curTrMode = depth - cu.m_cuDepth[0];
> -    const uint32_t tuDepth   = cu.m_tuDepth[absPartIdx];
> -    const uint32_t log2TrSize = g_maxLog2CUSize - depth;
> -
> -    if (curTrMode < tuDepth)
> +    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
> +
> +    if (tuDepth < cu.m_tuDepth[absPartIdx])
>      {
>          uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
>          for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
> -            saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
> +            saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
>          return;
>      }
>  
> diff -r 65e71f08c55a -r 270c97866810 source/encoder/search.h
> --- a/source/encoder/search.h	Sat Jan 17 10:12:34 2015 +0530
> +++ b/source/encoder/search.h	Sat Jan 17 18:32:52 2015 +0900
> @@ -201,7 +201,7 @@
>      bool          m_bJobsQueued;
>      void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
>  
> -    void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth);
> +    void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
>  
>      // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
>      uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
> @@ -210,7 +210,7 @@
>      uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
>  
>      void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
> -    void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]);
> +    void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
>      void     codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
>  
>      struct Cost
> @@ -225,9 +225,6 @@
>      uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
>      void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
>  
> -    // estimate bit cost of residual QT
> -    void     encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, TextType ttype, const uint32_t depthRange[2]);
> -
>      // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
>      void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
>      void     codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
> 

-- 
Steve Borho