[x265] sao: refine, fix sao-non-deblock [CHANGES OUTPUT (RExt, sao-non-deblock)]

Steve Borho steve at borho.org
Sun Oct 5 23:11:35 CEST 2014


On 10/05, Satoshi Nakagawa wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1412500756 -32400
> #      Sun Oct 05 18:19:16 2014 +0900
> # Node ID 64ea900398eb29ddd1c12df8126fa9866a280c81
> # Parent  b6d49505b179cb509aa76f3a065192f0b4926579
> sao: refine, fix sao-non-deblock [CHANGES OUTPUT (RExt, sao-non-deblock)]

Looks really good, but unfortunately it conflicts with a number of
refactors I was also working on. Could you rebase it onto the current
tip and resend?

> diff -r b6d49505b179 -r 64ea900398eb source/common/common.h
> --- a/source/common/common.h	Thu Oct 02 16:47:55 2014 -0500
> +++ b/source/common/common.h	Sun Oct 05 18:19:16 2014 +0900
> @@ -132,6 +132,12 @@
>      return std::min<T>(std::max<T>(minVal, a), maxVal);
>  }
>  
> +template<typename T>
> +inline T x265_min(T a, T b) { return a < b ? a : b; }
> +
> +template<typename T>
> +inline T x265_max(T a, T b) { return a > b ? a : b; }
> +
>  typedef int16_t  coeff_t;      // transform coefficient
>  
>  #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
> @@ -224,17 +230,15 @@
>      bool mergeUpFlag;
>      bool mergeLeftFlag;
>      int  typeIdx;
> -    int  subTypeIdx;    // indicates EO class or BO band position
> +    uint32_t bandPos;    // BO band position
>      int  offset[SAO_NUM_OFFSET];
> -    int  partIdx;
> -    int  partIdxTmp;
>  
>      void reset()
>      {
>          mergeUpFlag = false;
>          mergeLeftFlag = false;
>          typeIdx = -1;
> -        subTypeIdx = 0;
> +        bandPos = 0;
>          offset[0] = 0;
>          offset[1] = 0;
>          offset[2] = 0;
> @@ -246,7 +250,6 @@
>  {
>      SaoCtuParam* ctuParam[3];
>      bool         bSaoFlag[2];
> -    int          numCuInHeight;
>      int          numCuInWidth;
>  
>      SAOParam()
> @@ -254,6 +257,7 @@
>          for (int i = 0; i < 3; i++)
>              ctuParam[i] = NULL;
>      }
> +
>      ~SAOParam()
>      {
>          delete[] ctuParam[0];
> diff -r b6d49505b179 -r 64ea900398eb source/encoder/entropy.cpp
> --- a/source/encoder/entropy.cpp	Thu Oct 02 16:47:55 2014 -0500
> +++ b/source/encoder/entropy.cpp	Sun Oct 05 18:19:16 2014 +0900
> @@ -511,7 +511,7 @@
>      }
>  
>      // We need to split, so don't try these modes.
> -    if (cuSplitFlag) 
> +    if (cuSplitFlag)
>          codeSplitFlag(ctu, absPartIdx, depth);
>  
>      if (depth < ctu->getDepth(absPartIdx) && depth < g_maxCUDepth)
> @@ -863,74 +863,40 @@
>      encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange);
>  }
>  
> -void Entropy::codeSaoOffset(SaoCtuParam* saoLcuParam, uint32_t compIdx)
> +void Entropy::codeSaoOffset(const SaoCtuParam* saoLcuParam, int plane)
>  {
> -    uint32_t symbol;
> -    int i;
> +    int typeIdx = saoLcuParam->typeIdx;
>  
> -    symbol = saoLcuParam->typeIdx + 1;
> -    if (compIdx != 2)
> -        codeSaoTypeIdx(symbol);
> +    if (plane != 2)
> +    {
> +        encodeBin(typeIdx >= 0, m_contextState[OFF_SAO_TYPE_IDX_CTX]);
> +        if (typeIdx >= 0)
> +            encodeBinEP(typeIdx < SAO_BO ? 1 : 0);
> +    }
>  
> -    if (symbol)
> +    if (typeIdx >= 0)
>      {
> -        if (saoLcuParam->typeIdx < SAO_BO && compIdx != 2)
> -            saoLcuParam->subTypeIdx = saoLcuParam->typeIdx;
> +        enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
>  
> -        int offsetTh = 1 << X265_MIN(X265_DEPTH - 5, 5);
> -        if (saoLcuParam->typeIdx == SAO_BO)
> +        if (typeIdx == SAO_BO)
>          {
> -            for (i = 0; i < SAO_BO_LEN; i++)
> -            {
> -                uint32_t absOffset = ((saoLcuParam->offset[i] < 0) ? -saoLcuParam->offset[i] : saoLcuParam->offset[i]);
> -                codeSaoMaxUvlc(absOffset, offsetTh - 1);
> -            }
> +            for (int i = 0; i < SAO_BO_LEN; i++)
> +                codeSaoMaxUvlc(abs(saoLcuParam->offset[i]), OFFSET_THRESH - 1);
>  
> -            for (i = 0; i < SAO_BO_LEN; i++)
> -            {
> +            for (int i = 0; i < SAO_BO_LEN; i++)
>                  if (saoLcuParam->offset[i] != 0)
> -                {
> -                    uint32_t sign = (saoLcuParam->offset[i] < 0) ? 1 : 0;
> -                    codeSAOSign(sign);
> -                }
> -            }
> +                    encodeBinEP(saoLcuParam->offset[i] < 0);
>  
> -            symbol = (uint32_t)(saoLcuParam->subTypeIdx);
> -            codeSaoUflc(5, symbol);
> +            encodeBinsEP(saoLcuParam->bandPos, 5);
>          }
> -        else // if (saoLcuParam->typeIdx < SAO_BO)
> +        else // if (typeIdx < SAO_BO)
>          {
> -            codeSaoMaxUvlc(saoLcuParam->offset[0], offsetTh - 1);
> -            codeSaoMaxUvlc(saoLcuParam->offset[1], offsetTh - 1);
> -            codeSaoMaxUvlc(-saoLcuParam->offset[2], offsetTh - 1);
> -            codeSaoMaxUvlc(-saoLcuParam->offset[3], offsetTh - 1);
> -            if (compIdx != 2)
> -            {
> -                symbol = (uint32_t)(saoLcuParam->subTypeIdx);
> -                codeSaoUflc(2, symbol);
> -            }
> -        }
> -    }
> -}
> -
> -void Entropy::codeSaoUnitInterleaving(int compIdx, bool saoFlag, int rx, int ry, SaoCtuParam* saoLcuParam, int cuAddrInSlice, int cuAddrUpInSlice, int allowMergeLeft, int allowMergeUp)
> -{
> -    if (saoFlag)
> -    {
> -        if (rx > 0 && cuAddrInSlice != 0 && allowMergeLeft)
> -            codeSaoMerge(saoLcuParam->mergeLeftFlag);
> -        else
> -            saoLcuParam->mergeLeftFlag = 0;
> -
> -        if (!saoLcuParam->mergeLeftFlag)
> -        {
> -            if ((ry > 0) && (cuAddrUpInSlice >= 0) && allowMergeUp)
> -                codeSaoMerge(saoLcuParam->mergeUpFlag);
> -            else
> -                saoLcuParam->mergeUpFlag = 0;
> -
> -            if (!saoLcuParam->mergeUpFlag)
> -                codeSaoOffset(saoLcuParam, compIdx);
> +            codeSaoMaxUvlc(saoLcuParam->offset[0], OFFSET_THRESH - 1);
> +            codeSaoMaxUvlc(saoLcuParam->offset[1], OFFSET_THRESH - 1);
> +            codeSaoMaxUvlc(-saoLcuParam->offset[2], OFFSET_THRESH - 1);
> +            codeSaoMaxUvlc(-saoLcuParam->offset[3], OFFSET_THRESH - 1);
> +            if (plane != 2)
> +                encodeBinsEP((uint32_t)(typeIdx), 2);
>          }
>      }
>  }
> @@ -1584,7 +1550,7 @@
>  
>      if (cu->m_slice->m_pps->bTransformSkipEnabled)
>          codeTransformSkipFlags(cu, absPartIdx, trSize, ttype);
> -    
> +
>      bool bIsLuma = ttype == TEXT_LUMA;
>  
>      // select scans
> @@ -1758,12 +1724,12 @@
>  {
>      X265_CHECK(maxSymbol > 0, "maxSymbol too small\n");
>  
> -    uint32_t isCodeLast = (maxSymbol > code) ? 1 : 0;
> -    uint32_t isCodeNonZero = (code != 0) ? 1 : 0;
> +    uint32_t isCodeNonZero = !!code;
>  
>      encodeBinEP(isCodeNonZero);
>      if (isCodeNonZero)
>      {
> +        uint32_t isCodeLast = (maxSymbol > code);
>          uint32_t mask = (1 << (code - 1)) - 1;
>          uint32_t len = code - 1 + isCodeLast;
>          mask <<= isCodeLast;
> @@ -1772,14 +1738,6 @@
>      }
>  }
>  
> -/** Code SAO type index */
> -void Entropy::codeSaoTypeIdx(uint32_t code)
> -{
> -    encodeBin((code == 0) ? 0 : 1, m_contextState[OFF_SAO_TYPE_IDX_CTX]);
> -    if (code)
> -        encodeBinEP(code <= 4 ? 1 : 0);
> -}
> -
>  /* estimate bit cost for CBP, significant map and significant coefficients */
>  void Entropy::estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma)
>  {
> diff -r b6d49505b179 -r 64ea900398eb source/encoder/entropy.h
> --- a/source/encoder/entropy.h	Thu Oct 02 16:47:55 2014 -0500
> +++ b/source/encoder/entropy.h	Sun Oct 05 18:19:16 2014 +0900
> @@ -39,7 +39,6 @@
>  class TComDataCU;
>  class ScalingList;
>  
> -
>  enum SplitType
>  {
>      DONT_SPLIT            = 0,
> @@ -149,8 +148,7 @@
>      void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
>  
>      void encodeCTU(TComDataCU* cu);
> -    void codeSaoOffset(SaoCtuParam* saoLcuParam, uint32_t compIdx);
> -    void codeSaoUnitInterleaving(int compIdx, bool saoFlag, int rx, int ry, SaoCtuParam* saoLcuParam, int cuAddrInSlice, int cuAddrUpInSlice, int allowMergeLeft, int allowMergeUp);
> +    void codeSaoOffset(const SaoCtuParam* saoLcuParam, int plane);
>      void codeSaoMerge(uint32_t code)   { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
>  
>      void codeCUTransquantBypassFlag(uint32_t symbol);
> @@ -215,9 +213,6 @@
>      void codeRefFrmIdx(TComDataCU* cu, uint32_t absPartIdx, int list);
>  
>      void codeSaoMaxUvlc(uint32_t code, uint32_t maxSymbol);
> -    void codeSaoTypeIdx(uint32_t code);
> -    void codeSaoUflc(uint32_t length, uint32_t code) { encodeBinsEP(code, length); }
> -    void codeSAOSign(uint32_t code)                  { encodeBinEP(code); }
>  
>      void codeDeltaQP(TComDataCU* cu, uint32_t absPartIdx);
>      void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx);
> @@ -230,7 +225,7 @@
>          uint32_t bakAbsPartIdxCU;
>      };
>  
> -    void encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma, 
> +    void encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma,
>                           uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t uiTrIdx, bool& bCodeDQP, uint32_t* depthRange);
>  
>      void copyFrom(Entropy& src);
> diff -r b6d49505b179 -r 64ea900398eb source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp	Thu Oct 02 16:47:55 2014 -0500
> +++ b/source/encoder/sao.cpp	Sun Oct 05 18:19:16 2014 +0900
> @@ -27,22 +27,9 @@
>  
>  namespace {
>  
> -#if HIGH_BIT_DEPTH
> -inline double roundIDBI2(double x)
> +inline int32_t roundIBDI(int32_t num, int32_t den)
>  {
> -    return ((x) > 0) ? (int)(((int)(x) + (1 << (X265_DEPTH - 8 - 1))) / (1 << (X265_DEPTH - 8))) :
> -                       ((int)(((int)(x) - (1 << (X265_DEPTH - 8 - 1))) / (1 << (X265_DEPTH - 8))));
> -}
> -#endif
> -
> -/* rounding with IBDI */
> -inline double roundIDBI(double x)
> -{
> -#if HIGH_BIT_DEPTH
> -    return X265_DEPTH > 8 ? roundIDBI2(x) : ((x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5)));
> -#else
> -    return (x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5));
> -#endif
> +    return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
>  }
>  
>  /* get the sign of input variable (TODO: this is a dup, make common) */
> @@ -51,6 +38,11 @@
>      return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
>  }
>  
> +inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
> +{
> +    return (count * offset - offsetOrg * 2) * offset;
> +}
> +
>  } // end anonymous namespace
>  
>  
> @@ -172,7 +164,6 @@
>  void SAO::allocSaoParam(SAOParam *saoParam) const
>  {
>      saoParam->numCuInWidth  = m_numCuInWidth;
> -    saoParam->numCuInHeight = m_numCuInHeight;
>  
>      saoParam->ctuParam[0] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
>      saoParam->ctuParam[1] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
> @@ -184,9 +175,11 @@
>  {
>      saoParam->bSaoFlag[0] = false;
>      saoParam->bSaoFlag[1] = false;
> +#if 0
>      resetCtuPart(saoParam->ctuParam[0]);
>      resetCtuPart(saoParam->ctuParam[1]);
>      resetCtuPart(saoParam->ctuParam[2]);
> +#endif
>  }
>  
>  void SAO::startSlice(Frame *pic, Entropy& initState, int qp)
> @@ -238,64 +231,45 @@
>  }
>  
>  // CTU-based SAO process without slice granularity
> -void SAO::processSaoCu(int addr, int saoType, int plane)
> +void SAO::processSaoCu(int addr, int typeIdx, int plane)
>  {
>      int x, y;
> -    TComDataCU *tmpCu = m_pic->getCU(addr);
> -    pixel* rec;
> -    int stride;
> -    int ctuWidth;
> -    int ctuHeight;
> -    int rpelx;
> -    int bpely;
> -    int picWidthTmp;
> -    int picHeightTmp;
> +    TComDataCU *cu = m_pic->getCU(addr);
> +    pixel* rec = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +    int stride = plane ? m_pic->getCStride() : m_pic->getStride();
> +    uint32_t picWidth  = m_param->sourceWidth;
> +    uint32_t picHeight = m_param->sourceHeight;
> +    int ctuWidth  = g_maxCUSize;
> +    int ctuHeight = g_maxCUSize;
> +    uint32_t lpelx = cu->getCUPelX();
> +    uint32_t tpely = cu->getCUPelY();
> +    if (plane)
> +    {
> +        picWidth  >>= m_hChromaShift;
> +        picHeight >>= m_vChromaShift;
> +        ctuWidth  >>= m_hChromaShift;
> +        ctuHeight >>= m_vChromaShift;
> +        lpelx     >>= m_hChromaShift;
> +        tpely     >>= m_vChromaShift;
> +    }
> +    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
> +    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
> +    ctuWidth  = rpelx - lpelx;
> +    ctuHeight = bpely - tpely;
> +
>      int startX;
>      int startY;
>      int endX;
>      int endY;
>      pixel* tmpL;
>      pixel* tmpU;
> -    uint32_t lpelx = tmpCu->getCUPelX();
> -    uint32_t tpely = tmpCu->getCUPelY();
> -    bool isLuma = !plane;
> -
> -    picWidthTmp  = isLuma ? m_param->sourceWidth  : m_param->sourceWidth  >> m_hChromaShift;
> -    picHeightTmp = isLuma ? m_param->sourceHeight : m_param->sourceHeight >> m_vChromaShift;
> -    ctuWidth     = isLuma ? g_maxCUSize : g_maxCUSize >> m_hChromaShift;
> -    ctuHeight    = isLuma ? g_maxCUSize : g_maxCUSize >> m_vChromaShift;
> -    lpelx        = isLuma ? lpelx       : lpelx       >> m_hChromaShift;
> -    tpely        = isLuma ? tpely       : tpely       >> m_vChromaShift;
> -
> -    rpelx        = lpelx + ctuWidth;
> -    bpely        = tpely + ctuHeight;
> -    rpelx        = rpelx > picWidthTmp  ? picWidthTmp  : rpelx;
> -    bpely        = bpely > picHeightTmp ? picHeightTmp : bpely;
> -    ctuWidth     = rpelx - lpelx;
> -    ctuHeight    = bpely - tpely;
> -
> -    if (!tmpCu->m_pic)
> -        return;
> -
> -    if (plane)
> -    {
> -        rec    = m_pic->getPicYuvRec()->getChromaAddr(plane, addr);
> -        stride = m_pic->getCStride();
> -    }
> -    else
> -    {
> -        rec    = m_pic->getPicYuvRec()->getLumaAddr(addr);
> -        stride = m_pic->getStride();
> -    }
>  
>      int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
>      int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
>  
> -//   if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1)
>      {
> -        int cuHeightTmp = isLuma ? g_maxCUSize : (g_maxCUSize  >> m_vChromaShift);
> -        pixel* recR = &rec[isLuma ? (g_maxCUSize - 1) : ((g_maxCUSize >> m_hChromaShift) - 1)];
> -        for (int i = 0; i < cuHeightTmp + 1; i++)
> +        const pixel* recR = &rec[ctuWidth - 1];
> +        for (int i = 0; i < ctuHeight + 1; i++)
>          {
>              m_tmpL2[i] = *recR;
>              recR += stride;
> @@ -305,13 +279,13 @@
>          tmpU = &(m_tmpU1[plane][lpelx]);
>      }
>  
> -    switch (saoType)
> +    switch (typeIdx)
>      {
>      case SAO_EO_0: // dir: -
>      {
>          pixel firstPxl = 0, lastPxl = 0;
>          startX = !lpelx;
> -        endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
> +        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
>          if (ctuWidth & 15)
>          {
>              for (y = 0; y < ctuHeight; y++)
> @@ -338,7 +312,7 @@
>                  if (!lpelx)
>                      firstPxl = rec[0];
>  
> -                if (rpelx == picWidthTmp)
> +                if (rpelx == picWidth)
>                      lastPxl = rec[ctuWidth - 1];
>  
>                  primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, (int8_t)signLeft);
> @@ -346,7 +320,7 @@
>                  if (!lpelx)
>                      rec[0] = firstPxl;
>  
> -                if (rpelx == picWidthTmp)
> +                if (rpelx == picWidth)
>                      rec[ctuWidth - 1] = lastPxl;
>  
>                  rec += stride;
> @@ -357,7 +331,7 @@
>      case SAO_EO_1: // dir: |
>      {
>          startY = !tpely;
> -        endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
> +        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
>          if (!tpely)
>              rec += stride;
>  
> @@ -383,10 +357,10 @@
>      case SAO_EO_2: // dir: 135
>      {
>          startX = !lpelx;
> -        endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
> +        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
>  
>          startY = !tpely;
> -        endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
> +        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
>  
>          if (!tpely)
>              rec += stride;
> @@ -396,17 +370,15 @@
>  
>          for (y = startY; y < endY; y++)
>          {
> -            int signDown2 = signOf(rec[stride + startX] - tmpL[y]);
> +            upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
>              for (x = startX; x < endX; x++)
>              {
> -                int signDown1 = signOf(rec[x] - rec[x + stride + 1]);
> -                int edgeType  = signDown1 + upBuff1[x] + 2;
> -                upBufft[x + 1] = -signDown1;
> +                int signDown = signOf(rec[x] - rec[x + stride + 1]);
> +                int edgeType = signDown + upBuff1[x] + 2;
> +                upBufft[x + 1] = -signDown;
>                  rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
>              }
>  
> -            upBufft[startX] = signDown2;
> -
>              std::swap(upBuff1, upBufft);
>  
>              rec += stride;
> @@ -416,13 +388,13 @@
>      }
>      case SAO_EO_3: // dir: 45
>      {
> -        startX = (lpelx == 0) ? 1 : 0;
> -        endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
> +        startX = !lpelx;
> +        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
>  
> -        startY = (tpely == 0) ? 1 : 0;
> -        endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
> +        startY = !tpely;
> +        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
>  
> -        if (startY == 1)
> +        if (!tpely)
>              rec += stride;
>  
>          for (x = startX - 1; x < endX; x++)
> @@ -431,15 +403,15 @@
>          for (y = startY; y < endY; y++)
>          {
>              x = startX;
> -            int signDown1 = signOf(rec[x] - tmpL[y + 1]);
> -            int edgeType  = signDown1 + upBuff1[x] + 2;
> -            upBuff1[x - 1] = -signDown1;
> +            int signDown = signOf(rec[x] - tmpL[y + 1]);
> +            int edgeType = signDown + upBuff1[x] + 2;
> +            upBuff1[x - 1] = -signDown;
>              rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
>              for (x = startX + 1; x < endX; x++)
>              {
> -                signDown1 = signOf(rec[x] - rec[x + stride - 1]);
> -                edgeType  = signDown1 + upBuff1[x] + 2;
> -                upBuff1[x - 1] = -signDown1;
> +                signDown = signOf(rec[x] - rec[x + stride - 1]);
> +                edgeType = signDown + upBuff1[x] + 2;
> +                upBuff1[x - 1] = -signDown;
>                  rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
>              }
>  
> @@ -474,44 +446,27 @@
>  /* Process SAO all units */
>  void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
>  {
> -    pixel *rec;
> -    int picWidthTmp;
> -
> +    int stride = plane ? m_pic->getCStride() : m_pic->getStride();
> +    uint32_t picWidth  = m_param->sourceWidth;
> +    int ctuWidth  = g_maxCUSize;
> +    int ctuHeight = g_maxCUSize;
>      if (plane)
>      {
> -        rec         = m_pic->getPicYuvRec()->getChromaAddr(plane);
> -        picWidthTmp = m_param->sourceWidth >> m_hChromaShift;
> -    }
> -    else
> -    {
> -        rec         = m_pic->getPicYuvRec()->getLumaAddr();
> -        picWidthTmp = m_param->sourceWidth;
> +        picWidth  >>= m_hChromaShift;
> +        ctuWidth  >>= m_hChromaShift;
> +        ctuHeight >>= m_vChromaShift;
>      }
>  
>      if (!idxY)
> -        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidthTmp);
> +    {
> +        pixel *rec = plane ? m_pic->getPicYuvRec()->getChromaAddr(plane) : m_pic->getPicYuvRec()->getLumaAddr();
> +        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
> +    }
>  
> -    int frameWidthInCU = m_pic->getFrameWidthInCU();
> -    int stride;
> -    bool isChroma = !!plane;
> +    int addr = idxY * m_numCuInWidth;
> +    pixel *rec = plane ? m_pic->getPicYuvRec()->getChromaAddr(plane, addr) : m_pic->getPicYuvRec()->getLumaAddr(addr);
>  
> -    const int boShift = X265_DEPTH - SAO_BO_BITS;
> -
> -    int addr = idxY * frameWidthInCU;
> -    if (isChroma)
> -    {
> -        rec = m_pic->getPicYuvRec()->getChromaAddr(plane, addr);
> -        stride = m_pic->getCStride();
> -        picWidthTmp = m_param->sourceWidth >> m_hChromaShift;
> -    }
> -    else
> -    {
> -        rec = m_pic->getPicYuvRec()->getLumaAddr(addr);
> -        stride = m_pic->getStride();
> -        picWidthTmp = m_param->sourceWidth;
> -    }
> -    int maxCUHeight = isChroma ? (g_maxCUSize >> m_vChromaShift) : g_maxCUSize;
> -    for (int i = 0; i < maxCUHeight + 1; i++)
> +    for (int i = 0; i < ctuHeight + 1; i++)
>      {
>          m_tmpL1[i] = rec[0];
>          rec += stride;
> @@ -519,11 +474,13 @@
>  
>      rec -= (stride << 1);
>  
> -    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidthTmp);
> +    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
>  
> -    for (int idxX = 0; idxX < frameWidthInCU; idxX++)
> +    const int boShift = X265_DEPTH - SAO_BO_BITS;
> +
> +    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
>      {
> -        addr = idxY * frameWidthInCU + idxX;
> +        addr = idxY * m_numCuInWidth + idxX;
>  
>          int typeIdx = ctuParam[addr].typeIdx;
>          bool mergeLeftFlag = ctuParam[addr].mergeLeftFlag;
> @@ -539,7 +496,7 @@
>                      memset(offset, 0, sizeof(offset));
>  
>                      for (int i = 0; i < SAO_NUM_OFFSET; i++)
> -                        offset[((ctuParam[addr].subTypeIdx + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
> +                        offset[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
>  
>                      for (int i = 0; i < (1 << X265_DEPTH); i++)
>                          offsetBo[i] = m_clipTable[i + offset[i >> boShift]];
> @@ -557,27 +514,14 @@
>              }
>              processSaoCu(addr, typeIdx, plane);
>          }
> -        else
> +        else if (idxX != (m_numCuInWidth - 1))
>          {
> -            if (idxX != (frameWidthInCU - 1))
> +            rec = plane ? m_pic->getPicYuvRec()->getChromaAddr(plane, addr) : m_pic->getPicYuvRec()->getLumaAddr(addr);
> +
> +            for (int i = 0; i < ctuHeight + 1; i++)
>              {
> -                if (isChroma)
> -                {
> -                    rec = m_pic->getPicYuvRec()->getChromaAddr(plane, addr);
> -                    stride = m_pic->getCStride();
> -                }
> -                else
> -                {
> -                    rec = m_pic->getPicYuvRec()->getLumaAddr(addr);
> -                    stride = m_pic->getStride();
> -                }
> -
> -                int widthShift = isChroma ? (g_maxCUSize >> m_hChromaShift) : g_maxCUSize;
> -                for (int i = 0; i < maxCUHeight + 1; i++)
> -                {
> -                    m_tmpL1[i] = rec[widthShift - 1];
> -                    rec += stride;
> -                }
> +                m_tmpL1[i] = rec[ctuWidth - 1];
> +                rec += stride;
>              }
>          }
>      }
> @@ -591,9 +535,8 @@
>      {
>          ctuParam[i].mergeUpFlag   =  1;
>          ctuParam[i].mergeLeftFlag =  0;
> -        ctuParam[i].partIdx       =  0;
>          ctuParam[i].typeIdx       = -1;
> -        ctuParam[i].subTypeIdx    =  0;
> +        ctuParam[i].bandPos       =  0;
>          for (int j = 0; j < SAO_NUM_OFFSET; j++)
>              ctuParam[i].offset[j] = 0;
>      }
> @@ -603,10 +546,8 @@
>  {
>      saoUnit->mergeUpFlag   = 0;
>      saoUnit->mergeLeftFlag = 0;
> -    saoUnit->partIdx       = 0;
> -    saoUnit->partIdxTmp    = 0;
>      saoUnit->typeIdx       = -1;
> -    saoUnit->subTypeIdx    = 0;
> +    saoUnit->bandPos       = 0;
>  
>      for (int i = 0; i < SAO_NUM_OFFSET; i++)
>          saoUnit->offset[i] = 0;
> @@ -617,8 +558,8 @@
>      saoUnitDst->mergeLeftFlag = saoUnitSrc->mergeLeftFlag;
>      saoUnitDst->mergeUpFlag   = saoUnitSrc->mergeUpFlag;
>      saoUnitDst->typeIdx       = saoUnitSrc->typeIdx;
> +    saoUnitDst->bandPos       = saoUnitSrc->bandPos;
>  
> -    saoUnitDst->subTypeIdx  = saoUnitSrc->subTypeIdx;
>      for (int i = 0; i < SAO_NUM_OFFSET; i++)
>          saoUnitDst->offset[i] = saoUnitSrc->offset[i];
>  }
> @@ -628,242 +569,240 @@
>  {
>      int x, y;
>      TComDataCU *cu = m_pic->getCU(addr);
> +    const pixel* fenc0 = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> +    const pixel* rec0  = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +    const pixel* fenc;
> +    const pixel* rec;
> +    int stride = plane ? m_pic->getCStride() : m_pic->getStride();
> +    uint32_t picWidth  = m_param->sourceWidth;
> +    uint32_t picHeight = m_param->sourceHeight;
> +    int ctuWidth  = g_maxCUSize;
> +    int ctuHeight = g_maxCUSize;
> +    uint32_t lpelx = cu->getCUPelX();
> +    uint32_t tpely = cu->getCUPelY();
> +    if (plane)
> +    {
> +        picWidth  >>= m_hChromaShift;
> +        picHeight >>= m_vChromaShift;
> +        ctuWidth  >>= m_hChromaShift;
> +        ctuHeight >>= m_vChromaShift;
> +        lpelx     >>= m_hChromaShift;
> +        tpely     >>= m_vChromaShift;
> +    }
> +    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
> +    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
> +    ctuWidth  = rpelx - lpelx;
> +    ctuHeight = bpely - tpely;
>  
> -    pixel* fenc;
> -    pixel* recon;
> -    int stride;
> -    int ctuHeight;
> -    int ctuWidth;
> -    uint32_t rpelx;
> -    uint32_t bpely;
> -    uint32_t picWidthTmp;
> -    uint32_t picHeightTmp;
> -    int64_t* stats;
> -    int64_t* counts;
>      int startX;
>      int startY;
>      int endX;
>      int endY;
> -    uint32_t lpelx = cu->getCUPelX();
> -    uint32_t tpely = cu->getCUPelY();
> +    int32_t* stats;
> +    int32_t* count;
>  
> -    int isLuma = !plane;
> -    int isChroma = !!plane;
> -    int numSkipLine = isChroma ? 4 - (2 * m_vChromaShift) : 4;
> -    int numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
> +    int skipB = plane ? 2 : 4;
> +    int skipR = plane ? 3 : 5;
>  
> -    picWidthTmp  = isLuma ? m_param->sourceWidth  : m_param->sourceWidth  >> m_hChromaShift;
> -    picHeightTmp = isLuma ? m_param->sourceHeight : m_param->sourceHeight >> m_vChromaShift;
> -    ctuWidth     = isLuma ? g_maxCUSize : g_maxCUSize >> m_hChromaShift;
> -    ctuHeight    = isLuma ? g_maxCUSize : g_maxCUSize >> m_vChromaShift;
> -    lpelx        = isLuma ? lpelx       : lpelx       >> m_hChromaShift;
> -    tpely        = isLuma ? tpely       : tpely       >> m_vChromaShift;
> +    int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
> +    int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
>  
> -    rpelx     = lpelx + ctuWidth;
> -    bpely     = tpely + ctuHeight;
> -    rpelx     = rpelx > picWidthTmp  ? picWidthTmp  : rpelx;
> -    bpely     = bpely > picHeightTmp ? picHeightTmp : bpely;
> -    ctuWidth  = rpelx - lpelx;
> -    ctuHeight = bpely - tpely;
> -    stride    =  (plane == 0) ? m_pic->getStride() : m_pic->getCStride();
> -
> -    //if(iSaoType == BO_0 || iSaoType == BO_1)
> +    // SAO_BO:
>      {
>          const int boShift = X265_DEPTH - SAO_BO_BITS;
>  
>          if (m_param->bSaoNonDeblocked)
>          {
> -            numSkipLine      = isChroma ? 3 - (2 * m_vChromaShift) : 3;
> -            numSkipLineRight = isChroma ? 4 - (2 * m_hChromaShift) : 4;
> +            skipB = plane ? 1 : 3;
> +            skipR = plane ? 2 : 4;
>          }
>          stats = m_offsetOrg[plane][SAO_BO];
> -        counts = m_count[plane][SAO_BO];
> +        count = m_count[plane][SAO_BO];
>  
> -        fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -        recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +        fenc = fenc0;
> +        rec  = rec0;
>  
> -        endX = (rpelx == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
> -        endY = (bpely == picHeightTmp) ? ctuHeight : ctuHeight - numSkipLine;
> +        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
> +        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
> +
>          for (y = 0; y < endY; y++)
>          {
>              for (x = 0; x < endX; x++)
>              {
> -                int classIdx = 1 + (recon[x] >> boShift);
> -                stats[classIdx] += (fenc[x] - recon[x]);
> -                counts[classIdx]++;
> +                int classIdx = 1 + (rec[x] >> boShift);
> +                stats[classIdx] += (fenc[x] - rec[x]);
> +                count[classIdx]++;
>              }
>  
>              fenc += stride;
> -            recon += stride;
> +            rec += stride;
>          }
>      }
>  
> -    int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
> -    int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
> -
> -    //if (iSaoType == EO_0  || iSaoType == EO_1 || iSaoType == EO_2 || iSaoType == EO_3)
>      {
> -        //if (iSaoType == EO_0)
> +        // SAO_EO_0: // dir: -
>          {
>              if (m_param->bSaoNonDeblocked)
>              {
> -                numSkipLine      = isChroma ? 3 - (2 * m_vChromaShift) : 3;
> -                numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
> +                skipB = plane ? 1 : 3;
> +                skipR = plane ? 3 : 5;
>              }
>              stats = m_offsetOrg[plane][SAO_EO_0];
> -            counts = m_count[plane][SAO_EO_0];
> +            count = m_count[plane][SAO_EO_0];
>  
> -            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            fenc = fenc0;
> +            rec  = rec0;
>  
> -            startX = (lpelx == 0) ? 1 : 0;
> -            endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
> -            for (y = 0; y < ctuHeight - numSkipLine; y++)
> +            startX = !lpelx;
> +            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
> +            for (y = 0; y < ctuHeight - skipB; y++)
>              {
> -                int signLeft = signOf(recon[startX] - recon[startX - 1]);
> +                int signLeft = signOf(rec[startX] - rec[startX - 1]);
>                  for (x = startX; x < endX; x++)
>                  {
> -                    int signRight = signOf(recon[x] - recon[x + 1]);
> +                    int signRight = signOf(rec[x] - rec[x + 1]);
>                      int edgeType = signRight + signLeft + 2;
>                      signLeft = -signRight;
>  
> -                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                    counts[s_eoTable[edgeType]]++;
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
>                  fenc += stride;
> -                recon += stride;
> +                rec += stride;
>              }
>          }
>  
> -        //if (iSaoType == EO_1)
> +        // SAO_EO_1: // dir: |
>          {
>              if (m_param->bSaoNonDeblocked)
>              {
> -                numSkipLine      = isChroma ? 4 - (2 * m_vChromaShift) : 4;
> -                numSkipLineRight = isChroma ? 4 - (2 * m_hChromaShift) : 4;
> +                skipB = plane ? 2 : 4;
> +                skipR = plane ? 2 : 4;
>              }
>              stats = m_offsetOrg[plane][SAO_EO_1];
> -            counts = m_count[plane][SAO_EO_1];
> +            count = m_count[plane][SAO_EO_1];
>  
> -            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            fenc = fenc0;
> +            rec  = rec0;
>  
> -            startY = (tpely == 0) ? 1 : 0;
> -            endX   = (rpelx == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
> -            endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
> -            if (tpely == 0)
> +            startY = !tpely;
> +            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
> +            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
> +            if (!tpely)
>              {
>                  fenc += stride;
> -                recon += stride;
> +                rec += stride;
>              }
>  
>              for (x = 0; x < ctuWidth; x++)
> -                upBuff1[x] = signOf(recon[x] - recon[x - stride]);
> +                upBuff1[x] = signOf(rec[x] - rec[x - stride]);
>  
>              for (y = startY; y < endY; y++)
>              {
>                  for (x = 0; x < endX; x++)
>                  {
> -                    int signDown = signOf(recon[x] - recon[x + stride]);
> +                    int signDown = signOf(rec[x] - rec[x + stride]);
>                      int edgeType = signDown + upBuff1[x] + 2;
>                      upBuff1[x] = -signDown;
>  
> -                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                    counts[s_eoTable[edgeType]]++;
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
>                  fenc += stride;
> -                recon += stride;
> +                rec += stride;
>              }
>          }
> -        //if (iSaoType == EO_2)
> +
> +        // SAO_EO_2: // dir: 135
>          {
>              if (m_param->bSaoNonDeblocked)
>              {
> -                numSkipLine      = isChroma ? 4 - (2 * m_vChromaShift) : 4;
> -                numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
> +                skipB = plane ? 2 : 4;
> +                skipR = plane ? 3 : 5;
>              }
>              stats = m_offsetOrg[plane][SAO_EO_2];
> -            counts = m_count[plane][SAO_EO_2];
> +            count = m_count[plane][SAO_EO_2];
>  
> -            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            fenc = fenc0;
> +            rec  = rec0;
>  
> -            startX = (lpelx == 0) ? 1 : 0;
> -            endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
> +            startX = !lpelx;
> +            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
>  
> -            startY = (tpely == 0) ? 1 : 0;
> -            endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
> -            if (tpely == 0)
> +            startY = !tpely;
> +            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
> +            if (!tpely)
>              {
>                  fenc += stride;
> -                recon += stride;
> +                rec += stride;
>              }
>  
>              for (x = startX; x < endX; x++)
> -                upBuff1[x] = signOf(recon[x] - recon[x - stride - 1]);
> +                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
>  
>              for (y = startY; y < endY; y++)
>              {
> -                int signDown2 = signOf(recon[stride + startX] - recon[startX - 1]);
> +                upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
>                  for (x = startX; x < endX; x++)
>                  {
> -                    int signDown1 = signOf(recon[x] - recon[x + stride + 1]);
> -                    int edgeType  = signDown1 + upBuff1[x] + 2;
> -                    upBufft[x + 1] = -signDown1;
> -                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                    counts[s_eoTable[edgeType]]++;
> +                    int signDown = signOf(rec[x] - rec[x + stride + 1]);
> +                    int edgeType = signDown + upBuff1[x] + 2;
> +                    upBufft[x + 1] = -signDown;
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
> -                upBufft[startX] = signDown2;
>                  std::swap(upBuff1, upBufft);
>  
> -                recon += stride;
> +                rec += stride;
>                  fenc += stride;
>              }
>          }
> -        //if (iSaoType == EO_3)
> +
> +        // SAO_EO_3: // dir: 45
>          {
>              if (m_param->bSaoNonDeblocked)
>              {
> -                numSkipLine      = isChroma ? 4 - (2 * m_vChromaShift) : 4;
> -                numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
> +                skipB = plane ? 2 : 4;
> +                skipR = plane ? 3 : 5;
>              }
>              stats = m_offsetOrg[plane][SAO_EO_3];
> -            counts = m_count[plane][SAO_EO_3];
> +            count = m_count[plane][SAO_EO_3];
>  
> -            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            fenc = fenc0;
> +            rec  = rec0;
>  
> -            startX = (lpelx == 0) ? 1 : 0;
> -            endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
> +            startX = !lpelx;
> +            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
>  
> -            startY = (tpely == 0) ? 1 : 0;
> -            endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
> -            if (startY == 1)
> +            startY = !tpely;
> +            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
> +
> +            if (!tpely)
>              {
>                  fenc += stride;
> -                recon += stride;
> +                rec += stride;
>              }
>  
>              for (x = startX - 1; x < endX; x++)
> -                upBuff1[x] = signOf(recon[x] - recon[x - stride + 1]);
> +                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
>  
>              for (y = startY; y < endY; y++)
>              {
>                  for (x = startX; x < endX; x++)
>                  {
> -                    int signDown1 = signOf(recon[x] - recon[x + stride - 1]);
> -                    int edgeType  = signDown1 + upBuff1[x] + 2;
> -                    upBuff1[x - 1] = -signDown1;
> -                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                    counts[s_eoTable[edgeType]]++;
> +                    int signDown = signOf(rec[x] - rec[x + stride - 1]);
> +                    int edgeType = signDown + upBuff1[x] + 2;
> +                    upBuff1[x - 1] = -signDown;
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
> -                upBuff1[endX - 1] = signOf(recon[endX - 1 + stride] - recon[endX]);
> +                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
>  
> -                recon += stride;
> +                rec += stride;
>                  fenc += stride;
>              }
>          }
> @@ -872,277 +811,266 @@
>  
>  void SAO::calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY)
>  {
> +    int addr    = idxX + m_numCuInWidth * idxY;
> +
>      int x, y;
> +    TComDataCU *cu = pic->getCU(addr);
> +    const pixel* fenc;
> +    const pixel* rec;
> +    int stride = m_pic->getStride();
> +    uint32_t picWidth  = m_param->sourceWidth;
> +    uint32_t picHeight = m_param->sourceHeight;
> +    int ctuWidth  = g_maxCUSize;
> +    int ctuHeight = g_maxCUSize;
> +    uint32_t lpelx = cu->getCUPelX();
> +    uint32_t tpely = cu->getCUPelY();
> +    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
> +    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
> +    ctuWidth  = rpelx - lpelx;
> +    ctuHeight = bpely - tpely;
>  
> -    pixel* fenc;
> -    pixel* recon;
> -    int stride;
> -    uint32_t rPelX;
> -    uint32_t bPelY;
> -    int64_t* stats;
> -    int64_t* count;
>      int startX;
>      int startY;
>      int endX;
>      int endY;
>      int firstX, firstY;
> +    int32_t* stats;
> +    int32_t* count;
>  
> -    int frameWidthInCU  = m_numCuInWidth;
> +    int skipB, skipR;
>  
> -    int isChroma;
> -    int numSkipLine, numSkipLineRight;
> -
> -    uint32_t lPelX, tPelY;
> -    TComDataCU *cu;
>      int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
>      int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
>  
>      const int boShift = X265_DEPTH - SAO_BO_BITS;
>  
> -    // NOTE: Row
> +    memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
> +    memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
> +
> +    for (int plane = 0; plane < NUM_PLANE; plane++)
>      {
> -        // NOTE: Col
> +        if (plane == 1)
>          {
> -            int addr    = idxX + frameWidthInCU * idxY;
> -            cu      = pic->getCU(addr);
> +            stride = pic->getCStride();
> +            picWidth  >>= m_hChromaShift;
> +            picHeight >>= m_vChromaShift;
> +            ctuWidth  >>= m_hChromaShift;
> +            ctuHeight >>= m_vChromaShift;
> +            lpelx     >>= m_hChromaShift;
> +            tpely     >>= m_vChromaShift;
> +            rpelx     >>= m_hChromaShift;
> +            bpely     >>= m_vChromaShift;
> +        }
>  
> -            uint32_t picWidthTmp  = m_param->sourceWidth;
> -            uint32_t picHeightTmp = m_param->sourceHeight;
> -            int ctuWidth  = g_maxCUSize;
> -            int ctuHeight = g_maxCUSize;
> -            lPelX   = cu->getCUPelX();
> -            tPelY   = cu->getCUPelY();
> -            rPelX     = lPelX + ctuWidth;
> -            bPelY     = tPelY + ctuHeight;
> -            rPelX     = rPelX > picWidthTmp  ? picWidthTmp  : rPelX;
> -            bPelY     = bPelY > picHeightTmp ? picHeightTmp : bPelY;
> -            ctuWidth  = rPelX - lPelX;
> -            ctuHeight = bPelY - tPelY;
> +        // SAO_BO:
>  
> -            memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
> -            memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
> +        skipB = plane ? 1 : 3;
> +        skipR = plane ? 2 : 4;
>  
> -            for (int plane = 0; plane < 3; plane++)
> +        stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
> +        count = m_countPreDblk[addr][plane][SAO_BO];
> +
> +        const pixel* fenc0 = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> +        const pixel* rec0  = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +        fenc = fenc0;
> +        rec  = rec0;
> +
> +        startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
> +        startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
> +
> +        for (y = 0; y < ctuHeight; y++)
> +        {
> +            for (x = (y < startY ? startX : 0); x < ctuWidth; x++)
>              {
> -                isChroma = !!plane;
> -                if (plane == 1)
> +                int classIdx = 1 + (rec[x] >> boShift);
> +                stats[classIdx] += (fenc[x] - rec[x]);
> +                count[classIdx]++;
> +            }
> +
> +            fenc += stride;
> +            rec += stride;
> +        }
> +
> +        // SAO_EO_0: // dir: -
> +        {
> +            skipB = plane ? 1 : 3;
> +            skipR = plane ? 3 : 5;
> +
> +            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
> +            count = m_countPreDblk[addr][plane][SAO_EO_0];
> +
> +            fenc = fenc0;
> +            rec  = rec0;
> +
> +            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
> +            startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
> +            firstX = !lpelx;
> +            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
> +            endX   = ctuWidth - 1;  // not refer right CTU
> +
> +            for (y = 0; y < ctuHeight; y++)
> +            {
> +                x = (y < startY ? startX : firstX);
> +                int signLeft = signOf(rec[x] - rec[x - 1]);
> +                for (; x < endX; x++)
>                  {
> -                    picWidthTmp  >>= m_hChromaShift;
> -                    picHeightTmp >>= m_vChromaShift;
> -                    ctuWidth     >>= m_hChromaShift;
> -                    ctuHeight    >>= m_vChromaShift;
> -                    lPelX        >>= m_hChromaShift;
> -                    tPelY        >>= m_vChromaShift;
> -                    rPelX     = lPelX + ctuWidth;
> -                    bPelY     = tPelY + ctuHeight;
> +                    int signRight = signOf(rec[x] - rec[x + 1]);
> +                    int edgeType = signRight + signLeft + 2;
> +                    signLeft = -signRight;
> +
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
> -                stride   = (plane == 0) ? pic->getStride() : pic->getCStride();
> +                fenc += stride;
> +                rec += stride;
> +            }
> +        }
>  
> -                //if(iSaoType == BO)
> +        // SAO_EO_1: // dir: |
> +        {
> +            skipB = plane ? 2 : 4;
> +            skipR = plane ? 2 : 4;
>  
> -                numSkipLine = isChroma ? 1 : 3;
> -                numSkipLineRight = isChroma ? 2 : 4;
> +            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
> +            count = m_countPreDblk[addr][plane][SAO_EO_1];
>  
> -                stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
> -                count = m_countPreDblk[addr][plane][SAO_BO];
> +            fenc = fenc0;
> +            rec  = rec0;
>  
> -                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
> +            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
> +            firstY = !tpely;
> +            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
> +            endY   = ctuHeight - 1; // not refer below CTU
> +            if (!tpely)
> +            {
> +                fenc += stride;
> +                rec += stride;
> +            }
>  
> -                startX = (rPelX == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
> -                startY = (bPelY == picHeightTmp) ? ctuHeight : ctuHeight - numSkipLine;
> +            for (x = startX; x < ctuWidth; x++)
> +                upBuff1[x] = signOf(rec[x] - rec[x - stride]);
>  
> -                for (y = 0; y < ctuHeight; y++)
> +            for (y = firstY; y < endY; y++)
> +            {
> +                for (x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++)
>                  {
> -                    for (x = 0; x < ctuWidth; x++)
> -                    {
> -                        if (x < startX && y < startY)
> -                            continue;
> +                    int signDown = signOf(rec[x] - rec[x + stride]);
> +                    int edgeType = signDown + upBuff1[x] + 2;
> +                    upBuff1[x] = -signDown;
>  
> -                        int classIdx = 1 + (recon[x] >> boShift);
> -                        stats[classIdx] += (fenc[x] - recon[x]);
> -                        count[classIdx]++;
> -                    }
> +                    if (x < startX && y < startY)
> +                        continue;
>  
> -                    fenc += stride;
> -                    recon += stride;
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
> -                //if (iSaoType == EO_0)
> +                fenc += stride;
> +                rec += stride;
> +            }
> +        }
>  
> -                numSkipLine = isChroma ? 1 : 3;
> -                numSkipLineRight = isChroma ? 3 : 5;
> +        // SAO_EO_2: // dir: 135
> +        {
> +            skipB = plane ? 2 : 4;
> +            skipR = plane ? 3 : 5;
>  
> -                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
> -                count = m_countPreDblk[addr][plane][SAO_EO_0];
> +            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
> +            count = m_countPreDblk[addr][plane][SAO_EO_2];
>  
> -                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            fenc = fenc0;
> +            rec  = rec0;
>  
> -                startX = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
> -                startY = (bPelY == picHeightTmp) ? ctuHeight : ctuHeight - numSkipLine;
> -                firstX = (lPelX == 0) ? 1 : 0;
> -                endX   = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
> +            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
> +            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
> +            firstX = !lpelx;
> +            firstY = !tpely;
> +            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
> +            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
> +            endX   = ctuWidth - 1;  // not refer right CTU
> +            endY   = ctuHeight - 1; // not refer below CTU
> +            if (!tpely)
> +            {
> +                fenc += stride;
> +                rec += stride;
> +            }
>  
> -                for (y = 0; y < ctuHeight; y++)
> +            for (x = startX; x < endX; x++)
> +                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
> +
> +            for (y = firstY; y < endY; y++)
> +            {
> +                x = (y < startY - 1 ? startX : firstX);
> +                upBufft[x] = signOf(rec[x + stride] - rec[x - 1]);
> +                for (; x < endX; x++)
>                  {
> -                    int signLeft = signOf(recon[firstX] - recon[firstX - 1]);
> -                    for (x = firstX; x < endX; x++)
> -                    {
> -                        int signRight =  signOf(recon[x] - recon[x + 1]);
> -                        int edgeType =  signRight + signLeft + 2;
> -                        signLeft  = -signRight;
> +                    int signDown = signOf(rec[x] - rec[x + stride + 1]);
> +                    int edgeType = signDown + upBuff1[x] + 2;
> +                    upBufft[x + 1] = -signDown;
>  
> -                        if (x < startX && y < startY)
> -                            continue;
> +                    if (x < startX && y < startY)
> +                        continue;
>  
> -                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                        count[s_eoTable[edgeType]]++;
> -                    }
> -
> -                    fenc += stride;
> -                    recon += stride;
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
> -                //if (iSaoType == EO_1)
> +                std::swap(upBuff1, upBufft);
>  
> -                numSkipLine = isChroma ? 2 : 4;
> -                numSkipLineRight = isChroma ? 2 : 4;
> +                rec += stride;
> +                fenc += stride;
> +            }
> +        }
>  
> -                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
> -                count = m_countPreDblk[addr][plane][SAO_EO_1];
> +        // SAO_EO_3: // dir: 45
> +        {
> +            skipB = plane ? 2 : 4;
> +            skipR = plane ? 3 : 5;
>  
> -                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> +            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
> +            count = m_countPreDblk[addr][plane][SAO_EO_3];
>  
> -                startX = (rPelX == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
> -                startY = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
> -                firstY = (tPelY == 0) ? 1 : 0;
> -                endY   = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
> -                if (firstY == 1)
> +            fenc = fenc0;
> +            rec  = rec0;
> +
> +            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
> +            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
> +            firstX = !lpelx;
> +            firstY = !tpely;
> +            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
> +            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
> +            endX   = ctuWidth - 1;  // not refer right CTU
> +            endY   = ctuHeight - 1; // not refer below CTU
> +            if (!tpely)
> +            {
> +                fenc += stride;
> +                rec += stride;
> +            }
> +
> +            for (x = startX - 1; x < endX; x++)
> +                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
> +
> +            for (y = firstY; y < endY; y++)
> +            {
> +                for (x = (y < startY - 1 ? startX : firstX); x < endX; x++)
>                  {
> -                    fenc += stride;
> -                    recon += stride;
> +                    int signDown = signOf(rec[x] - rec[x + stride - 1]);
> +                    int edgeType = signDown + upBuff1[x] + 2;
> +                    upBuff1[x - 1] = -signDown;
> +
> +                    if (x < startX && y < startY)
> +                        continue;
> +
> +                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
> +                    count[s_eoTable[edgeType]]++;
>                  }
>  
> -                for (x = 0; x < ctuWidth; x++)
> -                    upBuff1[x] = signOf(recon[x] - recon[x - stride]);
> +                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
>  
> -                for (y = firstY; y < endY; y++)
> -                {
> -                    for (x = 0; x < ctuWidth; x++)
> -                    {
> -                        int signDown = signOf(recon[x] - recon[x + stride]);
> -                        int edgeType = signDown + upBuff1[x] + 2;
> -                        upBuff1[x] = -signDown;
> -
> -                        if (x < startX && y < startY)
> -                            continue;
> -
> -                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                        count[s_eoTable[edgeType]]++;
> -                    }
> -
> -                    fenc += stride;
> -                    recon += stride;
> -                }
> -
> -                //if (iSaoType == EO_2)
> -
> -                numSkipLine = isChroma ? 2 : 4;
> -                numSkipLineRight = isChroma ? 3 : 5;
> -
> -                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
> -                count = m_countPreDblk[addr][plane][SAO_EO_2];
> -
> -                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> -
> -                startX = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
> -                startY = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
> -                firstX = (lPelX == 0) ? 1 : 0;
> -                firstY = (tPelY == 0) ? 1 : 0;
> -                endX   = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
> -                endY   = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
> -                if (firstY == 1)
> -                {
> -                    fenc += stride;
> -                    recon += stride;
> -                }
> -
> -                for (x = firstX; x < endX; x++)
> -                    upBuff1[x] = signOf(recon[x] - recon[x - stride - 1]);
> -
> -                for (y = firstY; y < endY; y++)
> -                {
> -                    int signDown2 = signOf(recon[stride + startX] - recon[startX - 1]);
> -                    for (x = firstX; x < endX; x++)
> -                    {
> -                        int signDown1 = signOf(recon[x] - recon[x + stride + 1]);
> -                        int edgeType = signDown1 + upBuff1[x] + 2;
> -                        upBufft[x + 1] = -signDown1;
> -
> -                        if (x < startX && y < startY)
> -                            continue;
> -
> -                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                        count[s_eoTable[edgeType]]++;
> -                    }
> -
> -                    upBufft[firstX] = signDown2;
> -                    std::swap(upBuff1, upBufft);
> -
> -                    recon += stride;
> -                    fenc += stride;
> -                }
> -
> -                //if (iSaoType == EO_3)
> -
> -                numSkipLine = isChroma ? 2 : 4;
> -                numSkipLineRight = isChroma ? 3 : 5;
> -
> -                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
> -                count = m_countPreDblk[addr][plane][SAO_EO_3];
> -
> -                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
> -                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
> -
> -                startX = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
> -                startY = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
> -                firstX = (lPelX == 0) ? 1 : 0;
> -                firstY = (tPelY == 0) ? 1 : 0;
> -                endX   = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
> -                endY   = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
> -                if (firstY == 1)
> -                {
> -                    fenc += stride;
> -                    recon += stride;
> -                }
> -
> -                for (x = firstX - 1; x < endX; x++)
> -                    upBuff1[x] = signOf(recon[x] - recon[x - stride + 1]);
> -
> -                for (y = firstY; y < endY; y++)
> -                {
> -                    for (x = firstX; x < endX; x++)
> -                    {
> -                        int signDown1 = signOf(recon[x] - recon[x + stride - 1]);
> -                        int edgeType  = signDown1 + upBuff1[x] + 2;
> -                        upBuff1[x - 1] = -signDown1;
> -
> -                        if (x < startX && y < startY)
> -                            continue;
> -
> -                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
> -                        count[s_eoTable[edgeType]]++;
> -                    }
> -
> -                    upBuff1[endX - 1] = signOf(recon[endX - 1 + stride] - recon[endX]);
> -
> -                    recon += stride;
> -                    fenc += stride;
> -                }
> +                rec += stride;
> +                fenc += stride;
>              }
>          }
>      }
> @@ -1151,69 +1079,9 @@
>  /* reset offset statistics */
>  void SAO::resetStats()
>  {
> -    for (int i = 0; i < NUM_PLANE; i++)
> -    {
> -        for (int j = 0; j < MAX_NUM_SAO_TYPE; j++)
> -        {
> -            for (int k = 0; k < MAX_NUM_SAO_CLASS; k++)
> -            {
> -                m_count[i][j][k] = 0;
> -                m_offset[i][j][k] = 0;
> -                m_offsetOrg[i][j][k] = 0;
> -            }
> -        }
> -    }
> -}
> -
> -/* Check merge SAO unit */
> -void SAO::checkMerge(SaoCtuParam * saoUnitCurr, SaoCtuParam * saoUnitCheck, int dir)
> -{
> -    int countDiff = 0;
> -
> -    if (saoUnitCurr->partIdx != saoUnitCheck->partIdx)
> -    {
> -        if (saoUnitCurr->typeIdx >= 0)
> -        {
> -            if (saoUnitCurr->typeIdx == saoUnitCheck->typeIdx)
> -            {
> -                for (int i = 0; i < SAO_NUM_OFFSET; i++)
> -                    countDiff += (saoUnitCurr->offset[i] != saoUnitCheck->offset[i]);
> -
> -                countDiff += (saoUnitCurr->subTypeIdx != saoUnitCheck->subTypeIdx);
> -                if (countDiff == 0)
> -                {
> -                    saoUnitCurr->partIdx = saoUnitCheck->partIdx;
> -                    if (dir == 1)
> -                    {
> -                        saoUnitCurr->mergeUpFlag = 1;
> -                        saoUnitCurr->mergeLeftFlag = 0;
> -                    }
> -                    else
> -                    {
> -                        saoUnitCurr->mergeUpFlag = 0;
> -                        saoUnitCurr->mergeLeftFlag = 1;
> -                    }
> -                }
> -            }
> -        }
> -        else
> -        {
> -            if (saoUnitCurr->typeIdx == saoUnitCheck->typeIdx)
> -            {
> -                saoUnitCurr->partIdx = saoUnitCheck->partIdx;
> -                if (dir == 1)
> -                {
> -                    saoUnitCurr->mergeUpFlag = 1;
> -                    saoUnitCurr->mergeLeftFlag = 0;
> -                }
> -                else
> -                {
> -                    saoUnitCurr->mergeUpFlag = 0;
> -                    saoUnitCurr->mergeLeftFlag = 1;
> -                }
> -            }
> -        }
> -    }
> +    memset(m_count, 0, sizeof(PerClass) * NUM_PLANE);
> +    memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE);
> +    memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE);
>  }
>  
>  void SAO::rdoSaoUnitRowInit(SAOParam *saoParam)
> @@ -1244,25 +1112,17 @@
>  
>  void SAO::rdoSaoUnitRow(SAOParam *saoParam, int idxY)
>  {
> -    int frameWidthInCU  = saoParam->numCuInWidth;
>      int j, k;
> -    int compIdx = 0;
>      SaoCtuParam mergeSaoParam[3][2];
>      double compDistortion[3];
> +    int allowMergeUp   = (idxY > 0);
>  
> -    for (int idxX = 0; idxX < frameWidthInCU; idxX++)
> +    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
>      {
> -        int addr     = idxX + idxY * frameWidthInCU;
> -        int addrUp   = idxY == 0 ? -1 : addr - frameWidthInCU;
> +        int addr     = idxX + idxY * m_numCuInWidth;
> +        int addrUp   = idxY == 0 ? -1 : addr - m_numCuInWidth;
>          int addrLeft = idxX == 0 ? -1 : addr - 1;
> -        int allowMergeLeft = 1;
> -        int allowMergeUp   = 1;
> -        uint32_t rate;
> -        double bestCost, mergeCost;
> -        if (idxX == 0)
> -            allowMergeLeft = 0;
> -        if (idxY == 0)
> -            allowMergeUp = 0;
> +        int allowMergeLeft = (idxX > 0);
>  
>          compDistortion[0] = 0;
>          compDistortion[1] = 0;
> @@ -1274,32 +1134,32 @@
>              m_entropyCoder.codeSaoMerge(0);
>          m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
>          // reset stats Y, Cb, Cr
> -        for (compIdx = 0; compIdx < 3; compIdx++)
> +        for (int plane = 0; plane < 3; plane++)
>          {
>              for (j = 0; j < MAX_NUM_SAO_TYPE; j++)
>              {
>                  for (k = 0; k < MAX_NUM_SAO_CLASS; k++)
>                  {
> -                    m_offset[compIdx][j][k] = 0;
> +                    m_offset[plane][j][k] = 0;
>                      if (m_param->bSaoNonDeblocked)
>                      {
> -                        m_count[compIdx][j][k] = m_countPreDblk[addr][compIdx][j][k];
> -                        m_offsetOrg[compIdx][j][k] = m_offsetOrgPreDblk[addr][compIdx][j][k];
> +                        m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k];
> +                        m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k];
>                      }
>                      else
>                      {
> -                        m_count[compIdx][j][k] = 0;
> -                        m_offsetOrg[compIdx][j][k] = 0;
> +                        m_count[plane][j][k] = 0;
> +                        m_offsetOrg[plane][j][k] = 0;
>                      }
>                  }
>              }
>  
> -            saoParam->ctuParam[compIdx][addr].typeIdx       = -1;
> -            saoParam->ctuParam[compIdx][addr].mergeUpFlag   = 0;
> -            saoParam->ctuParam[compIdx][addr].mergeLeftFlag = 0;
> -            saoParam->ctuParam[compIdx][addr].subTypeIdx    = 0;
> -            if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
> -                calcSaoStatsCu(addr, compIdx);
> +            saoParam->ctuParam[plane][addr].typeIdx       = -1;
> +            saoParam->ctuParam[plane][addr].mergeUpFlag   = 0;
> +            saoParam->ctuParam[plane][addr].mergeLeftFlag = 0;
> +            saoParam->ctuParam[plane][addr].bandPos    = 0;
> +            if ((plane == 0 && saoParam->bSaoFlag[0]) || (plane > 0 && saoParam->bSaoFlag[1]))
> +                calcSaoStatsCu(addr, plane);
>          }
>  
>          saoComponentParamDist(allowMergeLeft, allowMergeUp, saoParam, addr, addrUp, addrLeft,
> @@ -1317,14 +1177,14 @@
>                  m_entropyCoder.codeSaoMerge(0);
>              if (allowMergeUp)
>                  m_entropyCoder.codeSaoMerge(0);
> -            for (compIdx = 0; compIdx < 3; compIdx++)
> +            for (int plane = 0; plane < 3; plane++)
>              {
> -                if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
> -                    m_entropyCoder.codeSaoOffset(&saoParam->ctuParam[compIdx][addr], compIdx);
> +                if ((plane == 0 && saoParam->bSaoFlag[0]) || (plane > 0 && saoParam->bSaoFlag[1]))
> +                    m_entropyCoder.codeSaoOffset(&saoParam->ctuParam[plane][addr], plane);
>              }
>  
> -            rate = m_entropyCoder.getNumberOfWrittenBits();
> -            bestCost = compDistortion[0] + (double)rate;
> +            uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
> +            double bestCost = compDistortion[0] + (double)rate;
>              m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
>  
>              // Cost of Merge
> @@ -1340,17 +1200,17 @@
>                          m_entropyCoder.codeSaoMerge(1);
>  
>                      rate = m_entropyCoder.getNumberOfWrittenBits();
> -                    mergeCost = compDistortion[mergeUp + 1] + (double)rate;
> +                    double mergeCost = compDistortion[mergeUp + 1] + (double)rate;
>                      if (mergeCost < bestCost)
>                      {
>                          bestCost = mergeCost;
>                          m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
> -                        for (compIdx = 0; compIdx < 3; compIdx++)
> +                        for (int plane = 0; plane < 3; plane++)
>                          {
> -                            mergeSaoParam[compIdx][mergeUp].mergeLeftFlag = !mergeUp;
> -                            mergeSaoParam[compIdx][mergeUp].mergeUpFlag = !!mergeUp;
> -                            if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
> -                                copySaoUnit(&saoParam->ctuParam[compIdx][addr], &mergeSaoParam[compIdx][mergeUp]);
> +                            mergeSaoParam[plane][mergeUp].mergeLeftFlag = !mergeUp;
> +                            mergeSaoParam[plane][mergeUp].mergeUpFlag = !!mergeUp;
> +                            if ((plane == 0 && saoParam->bSaoFlag[0]) || (plane > 0 && saoParam->bSaoFlag[1]))
> +                                copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeUp]);
>                          }
>                      }
>                  }
> @@ -1367,92 +1227,85 @@
>  }
>  
>  /** rate distortion optimization of SAO unit */
> -inline int64_t SAO::estSaoTypeDist(int compIdx, int typeIdx, int shift, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo)
> +inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo)
>  {
>      int64_t estDist = 0;
>  
>      for (int classIdx = 1; classIdx < ((typeIdx < SAO_BO) ?  SAO_EO_LEN + 1 : SAO_NUM_BO_CLASSES + 1); classIdx++)
>      {
> +        int32_t  count = m_count[plane][typeIdx][classIdx];
> +        int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
> +        int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
> +
>          if (typeIdx == SAO_BO)
>          {
>              currentDistortionTableBo[classIdx - 1] = 0;
>              currentRdCostTableBo[classIdx - 1] = lambda;
>          }
> -        if (m_count[compIdx][typeIdx][classIdx])
> +        if (count)
>          {
> -            m_offset[compIdx][typeIdx][classIdx] = (int64_t)roundIDBI((double)(m_offsetOrg[compIdx][typeIdx][classIdx] << (X265_DEPTH - 8)) / (double)(m_count[compIdx][typeIdx][classIdx] << SAO_BIT_INC));
> -            m_offset[compIdx][typeIdx][classIdx] = Clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, (int)m_offset[compIdx][typeIdx][classIdx]);
> +            int offset = roundIBDI(offsetOrg, count << SAO_BIT_INC);
> +            offset = Clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
>              if (typeIdx < SAO_BO)
>              {
> -                if (m_offset[compIdx][typeIdx][classIdx] < 0 && classIdx < 3)
> -                    m_offset[compIdx][typeIdx][classIdx] = 0;
> -                if (m_offset[compIdx][typeIdx][classIdx] > 0 && classIdx >= 3)
> -                    m_offset[compIdx][typeIdx][classIdx] = 0;
> +                if (classIdx < 3)
> +                    offset = X265_MAX(offset, 0);
> +                else
> +                    offset = X265_MIN(offset, 0);
>              }
> -            m_offset[compIdx][typeIdx][classIdx] = estIterOffset(typeIdx, classIdx, lambda, m_offset[compIdx][typeIdx][classIdx], m_count[compIdx][typeIdx][classIdx], m_offsetOrg[compIdx][typeIdx][classIdx], shift, SAO_BIT_INC, currentDistortionTableBo, currentRdCostTableBo, OFFSET_THRESH);
> +            offsetOut = estIterOffset(typeIdx, classIdx, lambda, offset, count, offsetOrg, currentDistortionTableBo, currentRdCostTableBo);
>          }
>          else
>          {
> -            m_offsetOrg[compIdx][typeIdx][classIdx] = 0;
> -            m_offset[compIdx][typeIdx][classIdx] = 0;
> +            offsetOrg = 0;
> +            offsetOut = 0;
>          }
>          if (typeIdx != SAO_BO)
> -            estDist += estSaoDist(m_count[compIdx][typeIdx][classIdx], m_offset[compIdx][typeIdx][classIdx] << SAO_BIT_INC, m_offsetOrg[compIdx][typeIdx][classIdx], shift);
> +            estDist += estSaoDist(count, (int)offsetOut << SAO_BIT_INC, offsetOrg);
>      }
>  
>      return estDist;
>  }
>  
> -inline int64_t SAO::estSaoDist(int64_t count, int64_t offset, int64_t offsetOrg, int shift)
> +inline int SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, int32_t *currentDistortionTableBo, double *currentRdCostTableBo)
>  {
> -    return (count * offset * offset - offsetOrg * offset * 2) >> shift;
> -}
> +    int offsetOut = 0;
>  
> -inline int64_t SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int64_t offsetInput, int64_t count, int64_t offsetOrg, int shift, int bitIncrease, int32_t *currentDistortionTableBo, double *currentRdCostTableBo, int offsetTh)
> -{
> -    //Clean up, best_q_offset.
> -    int64_t iterOffset, tempOffset;
> -    int64_t tempDist, tempRate;
> -    int64_t offsetOutput = 0;
> -
> -    iterOffset = offsetInput;
>      // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here.
>      double tempMinCost = lambda;
> -    while (iterOffset != 0)
> +    while (offset != 0)
>      {
>          // Calculate the bits required for signalling the offset
> -        tempRate = (typeIdx == SAO_BO) ? (abs((int)iterOffset) + 2) : (abs((int)iterOffset) + 1);
> -        if (abs((int)iterOffset) == offsetTh - 1)
> +        int tempRate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
> +        if (abs(offset) == OFFSET_THRESH - 1)
>              tempRate--;
>  
>          // Do the dequntization before distorion calculation
> -        tempOffset = iterOffset << bitIncrease;
> -        tempDist   = estSaoDist(count, tempOffset, offsetOrg, shift);
> +        int tempOffset = offset << SAO_BIT_INC;
> +        int64_t tempDist  = estSaoDist(count, tempOffset, offsetOrg);
>          double tempCost   = ((double)tempDist + lambda * (double)tempRate);
>          if (tempCost < tempMinCost)
>          {
>              tempMinCost = tempCost;
> -            offsetOutput = iterOffset;
> +            offsetOut = offset;
>              if (typeIdx == SAO_BO)
>              {
>                  currentDistortionTableBo[classIdx - 1] = (int)tempDist;
>                  currentRdCostTableBo[classIdx - 1] = tempCost;
>              }
>          }
> -        iterOffset = (iterOffset > 0) ? (iterOffset - 1) : (iterOffset + 1);
> +        offset = (offset > 0) ? (offset - 1) : (offset + 1);
>      }
>  
> -    return offsetOutput;
> +    return offsetOut;
>  }
>  
>  void SAO::saoComponentParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
>                                  SaoCtuParam *compSaoParam, double *compDistortion)
>  {
> -    int64_t estDist;
> -    int64_t bestDist;
> +    int64_t bestDist = 0;
>  
>      SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
> -    SaoCtuParam* ctuParamNeighbor = NULL;
>      SaoCtuParam  ctuParamRdo;
>  
>      resetSaoUnit(&ctuParamRdo);
> @@ -1460,7 +1313,6 @@
>      resetSaoUnit(&compSaoParam[1]);
>      resetSaoUnit(lclCtuParam);
>  
> -    double dCostPartBest = MAX_DOUBLE;
>      double bestRDCostTableBo = MAX_DOUBLE;
>      int    bestClassTableBo  = 0;
>      int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
> @@ -1469,13 +1321,12 @@
>      m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
>      m_entropyCoder.resetBits();
>      m_entropyCoder.codeSaoOffset(&ctuParamRdo, 0);
> -    dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
> +    double dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
>      copySaoUnit(lclCtuParam, &ctuParamRdo);
> -    bestDist = 0;
>  
>      for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
>      {
> -        estDist = estSaoTypeDist(0, typeIdx, 0, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);
> +        int64_t estDist = estSaoTypeDist(0, typeIdx, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);
>  
>          if (typeIdx == SAO_BO)
>          {
> @@ -1503,16 +1354,16 @@
>          ctuParamRdo.typeIdx = typeIdx;
>          ctuParamRdo.mergeLeftFlag = 0;
>          ctuParamRdo.mergeUpFlag   = 0;
> -        ctuParamRdo.subTypeIdx = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
> +        ctuParamRdo.bandPos = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
>          for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
> -            ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.subTypeIdx + 1];
> +            ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.bandPos + 1];
>  
>          m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
>          m_entropyCoder.resetBits();
>          m_entropyCoder.codeSaoOffset(&ctuParamRdo, 0);
>  
>          uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
> -        double cost = (double)((double)estDist + m_lumaLambda * (double)estRate);
> +        double cost = (double)estDist + m_lumaLambda * (double)estRate;
>  
>          if (cost < dCostPartBest)
>          {
> @@ -1531,27 +1382,24 @@
>  
>      for (int idxNeighbor = 0; idxNeighbor < 2; idxNeighbor++)
>      {
> -        ctuParamNeighbor = NULL;
> +        SaoCtuParam* ctuParamNeighbor = NULL;
>          if (allowMergeLeft && addrLeft >= 0 && idxNeighbor == 0)
>              ctuParamNeighbor = &(saoParam->ctuParam[0][addrLeft]);
>          else if (allowMergeUp && addrUp >= 0 && idxNeighbor == 1)
>              ctuParamNeighbor = &(saoParam->ctuParam[0][addrUp]);
>          if (ctuParamNeighbor != NULL)
>          {
> -            estDist = 0;
> +            int64_t estDist = 0;
>              int typeIdx = ctuParamNeighbor->typeIdx;
>              if (typeIdx >= 0)
>              {
> -                int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor->subTypeIdx : 0;
> -                int mergeOffset;
> +                int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor->bandPos : 0;
>                  for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
>                  {
> -                    mergeOffset = ctuParamNeighbor->offset[classIdx];
> -                    estDist += estSaoDist(m_count[0][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + mergeBandPosition + 1],  0);
> +                    int mergeOffset = ctuParamNeighbor->offset[classIdx];
> +                    estDist += estSaoDist(m_count[0][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + mergeBandPosition + 1]);
>                  }
>              }
> -            else
> -                estDist = 0;
>  
>              copySaoUnit(&compSaoParam[idxNeighbor], ctuParamNeighbor);
>              compSaoParam[idxNeighbor].mergeUpFlag   = !!idxNeighbor;
> @@ -1565,11 +1413,9 @@
>  void SAO::sao2ChromaParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
>                                SaoCtuParam *crSaoParam, SaoCtuParam *cbSaoParam, double *distortion)
>  {
> -    int64_t estDist[2];
>      int64_t bestDist = 0;
>  
>      SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };
> -    SaoCtuParam* ctuParamNeighbor[2] = { NULL, NULL };
>      SaoCtuParam* saoMergeParam[2][2];
>      SaoCtuParam  ctuParamRdo[2];
>  
> @@ -1587,8 +1433,6 @@
>      resetSaoUnit(&ctuParamRdo[0]);
>      resetSaoUnit(&ctuParamRdo[1]);
>  
> -    double costPartBest = MAX_DOUBLE;
> -    double bestRDCostTableBo;
>      double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
>      int    bestClassTableBo[2] = { 0, 0 };
>      int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
> @@ -1598,19 +1442,20 @@
>      m_entropyCoder.codeSaoOffset(&ctuParamRdo[0], 1);
>      m_entropyCoder.codeSaoOffset(&ctuParamRdo[1], 2);
>  
> -    costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;
> +    double costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;
>      copySaoUnit(lclCtuParam[0], &ctuParamRdo[0]);
>      copySaoUnit(lclCtuParam[1], &ctuParamRdo[1]);
>  
>      for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
>      {
> +        int64_t estDist[2];
>          if (typeIdx == SAO_BO)
>          {
>              // Estimate Best Position
>              for (int compIdx = 0; compIdx < 2; compIdx++)
>              {
> -                bestRDCostTableBo = MAX_DOUBLE;
> -                estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
> +                double bestRDCostTableBo = MAX_DOUBLE;
> +                estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
>                  for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
>                  {
>                      double currentRDCost = 0.0;
> @@ -1633,8 +1478,8 @@
>          }
>          else
>          {
> -            estDist[0] = estSaoTypeDist(1, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
> -            estDist[1] = estSaoTypeDist(2, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
> +            estDist[0] = estSaoTypeDist(1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
> +            estDist[1] = estSaoTypeDist(2, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
>          }
>  
>          m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
> @@ -1646,15 +1491,15 @@
>              ctuParamRdo[compIdx].typeIdx = typeIdx;
>              ctuParamRdo[compIdx].mergeLeftFlag = 0;
>              ctuParamRdo[compIdx].mergeUpFlag   = 0;
> -            ctuParamRdo[compIdx].subTypeIdx = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
> +            ctuParamRdo[compIdx].bandPos = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
>              for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
> -                ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].subTypeIdx + 1];
> +                ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].bandPos + 1];
>  
>              m_entropyCoder.codeSaoOffset(&ctuParamRdo[compIdx], compIdx + 1);
>          }
>  
>          uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
> -        double cost = (double)((double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate);
> +        double cost = (double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate;
>  
>          if (cost < costPartBest)
>          {
> @@ -1677,31 +1522,30 @@
>      {
>          for (int compIdx = 0; compIdx < 2; compIdx++)
>          {
> -            ctuParamNeighbor[compIdx] = NULL;
> +            int plane = compIdx + 1;
> +            SaoCtuParam* ctuParamNeighbor = NULL;
>              if (allowMergeLeft && addrLeft >= 0 && idxNeighbor == 0)
> -                ctuParamNeighbor[compIdx] = &(saoParam->ctuParam[compIdx + 1][addrLeft]);
> +                ctuParamNeighbor = &(saoParam->ctuParam[plane][addrLeft]);
>              else if (allowMergeUp && addrUp >= 0 && idxNeighbor == 1)
> -                ctuParamNeighbor[compIdx] = &(saoParam->ctuParam[compIdx + 1][addrUp]);
> -            if (ctuParamNeighbor[compIdx] != NULL)
> +                ctuParamNeighbor = &(saoParam->ctuParam[plane][addrUp]);
> +            if (ctuParamNeighbor != NULL)
>              {
> -                estDist[compIdx] = 0;
> -                int typeIdx = ctuParamNeighbor[compIdx]->typeIdx;
> +                int64_t estDist = 0;
> +                int typeIdx = ctuParamNeighbor->typeIdx;
>                  if (typeIdx >= 0)
>                  {
> -                    int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor[compIdx]->subTypeIdx : 0;
> +                    int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor->bandPos : 0;
>                      for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
>                      {
> -                        int mergeOffset = ctuParamNeighbor[compIdx]->offset[classIdx];
> -                        estDist[compIdx] += estSaoDist(m_count[compIdx + 1][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[compIdx + 1][typeIdx][classIdx + mergeBandPosition + 1],  0);
> +                        int mergeOffset = ctuParamNeighbor->offset[classIdx];
> +                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + mergeBandPosition + 1]);
>                      }
>                  }
> -                else
> -                    estDist[compIdx] = 0;
>  
> -                copySaoUnit(saoMergeParam[compIdx][idxNeighbor], ctuParamNeighbor[compIdx]);
> +                copySaoUnit(saoMergeParam[compIdx][idxNeighbor], ctuParamNeighbor);
>                  saoMergeParam[compIdx][idxNeighbor]->mergeUpFlag   = !!idxNeighbor;
>                  saoMergeParam[compIdx][idxNeighbor]->mergeLeftFlag = !idxNeighbor;
> -                distortion[idxNeighbor + 1] += ((double)estDist[compIdx] / m_chromaLambda);
> +                distortion[idxNeighbor + 1] += ((double)estDist / m_chromaLambda);
>              }
>          }
>      }
> diff -r b6d49505b179 -r 64ea900398eb source/encoder/sao.h
> --- a/source/encoder/sao.h	Thu Oct 02 16:47:55 2014 -0500
> +++ b/source/encoder/sao.h	Sun Oct 05 18:19:16 2014 +0900
> @@ -63,9 +63,8 @@
>  
>      static const uint32_t s_eoTable[NUM_EDGETYPE];
>  
> -    typedef int64_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
> -    typedef int64_t (PerType[MAX_NUM_SAO_TYPE]);
> -    typedef int64_t (PerPlane[3][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
> +    typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
> +    typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
>  
>      /* allocated per part */
>      PerClass*   m_count;
> @@ -102,7 +101,7 @@
>      x265_param* m_param;
>      int         m_refDepth;
>      int         m_numNoSao[2];
> -    
> +
>      double      m_lumaLambda;
>      double      m_chromaLambda;
>      /* TODO: No doubles for distortion */
> @@ -120,7 +119,7 @@
>      void resetSaoUnit(SaoCtuParam* saoUnit);
>  
>      // CTU-based SAO process without slice granularity
> -    void processSaoCu(int addr, int partIdx, int plane);
> +    void processSaoCu(int addr, int typeIdx, int plane);
>  
>      void resetCtuPart(SaoCtuParam* ctuParam);
>      void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
> @@ -129,17 +128,15 @@
>  
>      void calcSaoStatsCu(int addr, int plane);
>      void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
> -    void checkMerge(SaoCtuParam* paramCurr, SaoCtuParam* paramCheck, int dir);
>  
>      void saoComponentParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
>                                 SaoCtuParam *compSaoParam, double *distortion);
>      void sao2ChromaParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
>                               SaoCtuParam *crSaoParam, SaoCtuParam *cbSaoParam, double *distortion);
>  
> -    inline int64_t estSaoDist(int64_t count, int64_t offset, int64_t offsetOrg, int shift);
> -    inline int64_t estIterOffset(int typeIdx, int classIdx, double lambda, int64_t offsetInput, int64_t count, int64_t offsetOrg, int shift,
> -                                 int bitIncrease, int32_t *currentDistortionTableBo, double *currentRdCostTableBo, int offsetTh);
> -    inline int64_t estSaoTypeDist(int compIdx, int typeIdx, int shift, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo);
> +    inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
> +                             int32_t *currentDistortionTableBo, double *currentRdCostTableBo);
> +    inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo);
>  
>      void rdoSaoUnitRowInit(SAOParam *saoParam);
>      void rdoSaoUnitRowEnd(SAOParam *saoParam, int numctus);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list