[x265] sao: refine, fix sao-non-deblock [CHANGES OUTPUT (RExt, sao-non-deblock)]

Satoshi Nakagawa nakagawa424 at oki.com
Sun Oct 5 11:23:46 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1412500756 -32400
#      Sun Oct 05 18:19:16 2014 +0900
# Node ID 64ea900398eb29ddd1c12df8126fa9866a280c81
# Parent  b6d49505b179cb509aa76f3a065192f0b4926579
sao: refine, fix sao-non-deblock [CHANGES OUTPUT (RExt, sao-non-deblock)]

diff -r b6d49505b179 -r 64ea900398eb source/common/common.h
--- a/source/common/common.h	Thu Oct 02 16:47:55 2014 -0500
+++ b/source/common/common.h	Sun Oct 05 18:19:16 2014 +0900
@@ -132,6 +132,12 @@
     return std::min<T>(std::max<T>(minVal, a), maxVal);
 }
 
+template<typename T>
+inline T x265_min(T a, T b) { return a < b ? a : b; }
+
+template<typename T>
+inline T x265_max(T a, T b) { return a > b ? a : b; }
+
 typedef int16_t  coeff_t;      // transform coefficient
 
 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -224,17 +230,15 @@
     bool mergeUpFlag;
     bool mergeLeftFlag;
     int  typeIdx;
-    int  subTypeIdx;    // indicates EO class or BO band position
+    uint32_t bandPos;    // BO band position
     int  offset[SAO_NUM_OFFSET];
-    int  partIdx;
-    int  partIdxTmp;
 
     void reset()
     {
         mergeUpFlag = false;
         mergeLeftFlag = false;
         typeIdx = -1;
-        subTypeIdx = 0;
+        bandPos = 0;
         offset[0] = 0;
         offset[1] = 0;
         offset[2] = 0;
@@ -246,7 +250,6 @@
 {
     SaoCtuParam* ctuParam[3];
     bool         bSaoFlag[2];
-    int          numCuInHeight;
     int          numCuInWidth;
 
     SAOParam()
@@ -254,6 +257,7 @@
         for (int i = 0; i < 3; i++)
             ctuParam[i] = NULL;
     }
+
     ~SAOParam()
     {
         delete[] ctuParam[0];
diff -r b6d49505b179 -r 64ea900398eb source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Thu Oct 02 16:47:55 2014 -0500
+++ b/source/encoder/entropy.cpp	Sun Oct 05 18:19:16 2014 +0900
@@ -511,7 +511,7 @@
     }
 
     // We need to split, so don't try these modes.
-    if (cuSplitFlag) 
+    if (cuSplitFlag)
         codeSplitFlag(ctu, absPartIdx, depth);
 
     if (depth < ctu->getDepth(absPartIdx) && depth < g_maxCUDepth)
@@ -863,74 +863,40 @@
     encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange);
 }
 
-void Entropy::codeSaoOffset(SaoCtuParam* saoLcuParam, uint32_t compIdx)
+void Entropy::codeSaoOffset(const SaoCtuParam* saoLcuParam, int plane)
 {
-    uint32_t symbol;
-    int i;
+    int typeIdx = saoLcuParam->typeIdx;
 
-    symbol = saoLcuParam->typeIdx + 1;
-    if (compIdx != 2)
-        codeSaoTypeIdx(symbol);
+    if (plane != 2)
+    {
+        encodeBin(typeIdx >= 0, m_contextState[OFF_SAO_TYPE_IDX_CTX]);
+        if (typeIdx >= 0)
+            encodeBinEP(typeIdx < SAO_BO ? 1 : 0);
+    }
 
-    if (symbol)
+    if (typeIdx >= 0)
     {
-        if (saoLcuParam->typeIdx < SAO_BO && compIdx != 2)
-            saoLcuParam->subTypeIdx = saoLcuParam->typeIdx;
+        enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
 
-        int offsetTh = 1 << X265_MIN(X265_DEPTH - 5, 5);
-        if (saoLcuParam->typeIdx == SAO_BO)
+        if (typeIdx == SAO_BO)
         {
-            for (i = 0; i < SAO_BO_LEN; i++)
-            {
-                uint32_t absOffset = ((saoLcuParam->offset[i] < 0) ? -saoLcuParam->offset[i] : saoLcuParam->offset[i]);
-                codeSaoMaxUvlc(absOffset, offsetTh - 1);
-            }
+            for (int i = 0; i < SAO_BO_LEN; i++)
+                codeSaoMaxUvlc(abs(saoLcuParam->offset[i]), OFFSET_THRESH - 1);
 
-            for (i = 0; i < SAO_BO_LEN; i++)
-            {
+            for (int i = 0; i < SAO_BO_LEN; i++)
                 if (saoLcuParam->offset[i] != 0)
-                {
-                    uint32_t sign = (saoLcuParam->offset[i] < 0) ? 1 : 0;
-                    codeSAOSign(sign);
-                }
-            }
+                    encodeBinEP(saoLcuParam->offset[i] < 0);
 
-            symbol = (uint32_t)(saoLcuParam->subTypeIdx);
-            codeSaoUflc(5, symbol);
+            encodeBinsEP(saoLcuParam->bandPos, 5);
         }
-        else // if (saoLcuParam->typeIdx < SAO_BO)
+        else // if (typeIdx < SAO_BO)
         {
-            codeSaoMaxUvlc(saoLcuParam->offset[0], offsetTh - 1);
-            codeSaoMaxUvlc(saoLcuParam->offset[1], offsetTh - 1);
-            codeSaoMaxUvlc(-saoLcuParam->offset[2], offsetTh - 1);
-            codeSaoMaxUvlc(-saoLcuParam->offset[3], offsetTh - 1);
-            if (compIdx != 2)
-            {
-                symbol = (uint32_t)(saoLcuParam->subTypeIdx);
-                codeSaoUflc(2, symbol);
-            }
-        }
-    }
-}
-
-void Entropy::codeSaoUnitInterleaving(int compIdx, bool saoFlag, int rx, int ry, SaoCtuParam* saoLcuParam, int cuAddrInSlice, int cuAddrUpInSlice, int allowMergeLeft, int allowMergeUp)
-{
-    if (saoFlag)
-    {
-        if (rx > 0 && cuAddrInSlice != 0 && allowMergeLeft)
-            codeSaoMerge(saoLcuParam->mergeLeftFlag);
-        else
-            saoLcuParam->mergeLeftFlag = 0;
-
-        if (!saoLcuParam->mergeLeftFlag)
-        {
-            if ((ry > 0) && (cuAddrUpInSlice >= 0) && allowMergeUp)
-                codeSaoMerge(saoLcuParam->mergeUpFlag);
-            else
-                saoLcuParam->mergeUpFlag = 0;
-
-            if (!saoLcuParam->mergeUpFlag)
-                codeSaoOffset(saoLcuParam, compIdx);
+            codeSaoMaxUvlc(saoLcuParam->offset[0], OFFSET_THRESH - 1);
+            codeSaoMaxUvlc(saoLcuParam->offset[1], OFFSET_THRESH - 1);
+            codeSaoMaxUvlc(-saoLcuParam->offset[2], OFFSET_THRESH - 1);
+            codeSaoMaxUvlc(-saoLcuParam->offset[3], OFFSET_THRESH - 1);
+            if (plane != 2)
+                encodeBinsEP((uint32_t)(typeIdx), 2);
         }
     }
 }
@@ -1584,7 +1550,7 @@
 
     if (cu->m_slice->m_pps->bTransformSkipEnabled)
         codeTransformSkipFlags(cu, absPartIdx, trSize, ttype);
-    
+
     bool bIsLuma = ttype == TEXT_LUMA;
 
     // select scans
@@ -1758,12 +1724,12 @@
 {
     X265_CHECK(maxSymbol > 0, "maxSymbol too small\n");
 
-    uint32_t isCodeLast = (maxSymbol > code) ? 1 : 0;
-    uint32_t isCodeNonZero = (code != 0) ? 1 : 0;
+    uint32_t isCodeNonZero = !!code;
 
     encodeBinEP(isCodeNonZero);
     if (isCodeNonZero)
     {
+        uint32_t isCodeLast = (maxSymbol > code);
         uint32_t mask = (1 << (code - 1)) - 1;
         uint32_t len = code - 1 + isCodeLast;
         mask <<= isCodeLast;
@@ -1772,14 +1738,6 @@
     }
 }
 
-/** Code SAO type index */
-void Entropy::codeSaoTypeIdx(uint32_t code)
-{
-    encodeBin((code == 0) ? 0 : 1, m_contextState[OFF_SAO_TYPE_IDX_CTX]);
-    if (code)
-        encodeBinEP(code <= 4 ? 1 : 0);
-}
-
 /* estimate bit cost for CBP, significant map and significant coefficients */
 void Entropy::estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma)
 {
diff -r b6d49505b179 -r 64ea900398eb source/encoder/entropy.h
--- a/source/encoder/entropy.h	Thu Oct 02 16:47:55 2014 -0500
+++ b/source/encoder/entropy.h	Sun Oct 05 18:19:16 2014 +0900
@@ -39,7 +39,6 @@
 class TComDataCU;
 class ScalingList;
 
-
 enum SplitType
 {
     DONT_SPLIT            = 0,
@@ -149,8 +148,7 @@
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
 
     void encodeCTU(TComDataCU* cu);
-    void codeSaoOffset(SaoCtuParam* saoLcuParam, uint32_t compIdx);
-    void codeSaoUnitInterleaving(int compIdx, bool saoFlag, int rx, int ry, SaoCtuParam* saoLcuParam, int cuAddrInSlice, int cuAddrUpInSlice, int allowMergeLeft, int allowMergeUp);
+    void codeSaoOffset(const SaoCtuParam* saoLcuParam, int plane);
     void codeSaoMerge(uint32_t code)   { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
 
     void codeCUTransquantBypassFlag(uint32_t symbol);
@@ -215,9 +213,6 @@
     void codeRefFrmIdx(TComDataCU* cu, uint32_t absPartIdx, int list);
 
     void codeSaoMaxUvlc(uint32_t code, uint32_t maxSymbol);
-    void codeSaoTypeIdx(uint32_t code);
-    void codeSaoUflc(uint32_t length, uint32_t code) { encodeBinsEP(code, length); }
-    void codeSAOSign(uint32_t code)                  { encodeBinEP(code); }
 
     void codeDeltaQP(TComDataCU* cu, uint32_t absPartIdx);
     void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx);
@@ -230,7 +225,7 @@
         uint32_t bakAbsPartIdxCU;
     };
 
-    void encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma, 
+    void encodeTransform(TComDataCU* cu, CoeffCodeState& state, uint32_t offsetLumaOffset, uint32_t offsetChroma,
                          uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t uiTrIdx, bool& bCodeDQP, uint32_t* depthRange);
 
     void copyFrom(Entropy& src);
diff -r b6d49505b179 -r 64ea900398eb source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Thu Oct 02 16:47:55 2014 -0500
+++ b/source/encoder/sao.cpp	Sun Oct 05 18:19:16 2014 +0900
@@ -27,22 +27,9 @@
 
 namespace {
 
-#if HIGH_BIT_DEPTH
-inline double roundIDBI2(double x)
+inline int32_t roundIBDI(int32_t num, int32_t den)
 {
-    return ((x) > 0) ? (int)(((int)(x) + (1 << (X265_DEPTH - 8 - 1))) / (1 << (X265_DEPTH - 8))) :
-                       ((int)(((int)(x) - (1 << (X265_DEPTH - 8 - 1))) / (1 << (X265_DEPTH - 8))));
-}
-#endif
-
-/* rounding with IBDI */
-inline double roundIDBI(double x)
-{
-#if HIGH_BIT_DEPTH
-    return X265_DEPTH > 8 ? roundIDBI2(x) : ((x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5)));
-#else
-    return (x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5));
-#endif
+    return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
 }
 
 /* get the sign of input variable (TODO: this is a dup, make common) */
@@ -51,6 +38,11 @@
     return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
 }
 
+inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
+{
+    return (count * offset - offsetOrg * 2) * offset;
+}
+
 } // end anonymous namespace
 
 
@@ -172,7 +164,6 @@
 void SAO::allocSaoParam(SAOParam *saoParam) const
 {
     saoParam->numCuInWidth  = m_numCuInWidth;
-    saoParam->numCuInHeight = m_numCuInHeight;
 
     saoParam->ctuParam[0] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
     saoParam->ctuParam[1] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
@@ -184,9 +175,11 @@
 {
     saoParam->bSaoFlag[0] = false;
     saoParam->bSaoFlag[1] = false;
+#if 0
     resetCtuPart(saoParam->ctuParam[0]);
     resetCtuPart(saoParam->ctuParam[1]);
     resetCtuPart(saoParam->ctuParam[2]);
+#endif
 }
 
 void SAO::startSlice(Frame *pic, Entropy& initState, int qp)
@@ -238,64 +231,45 @@
 }
 
 // CTU-based SAO process without slice granularity
-void SAO::processSaoCu(int addr, int saoType, int plane)
+void SAO::processSaoCu(int addr, int typeIdx, int plane)
 {
     int x, y;
-    TComDataCU *tmpCu = m_pic->getCU(addr);
-    pixel* rec;
-    int stride;
-    int ctuWidth;
-    int ctuHeight;
-    int rpelx;
-    int bpely;
-    int picWidthTmp;
-    int picHeightTmp;
+    TComDataCU *cu = m_pic->getCU(addr);
+    pixel* rec = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+    int stride = plane ? m_pic->getCStride() : m_pic->getStride();
+    uint32_t picWidth  = m_param->sourceWidth;
+    uint32_t picHeight = m_param->sourceHeight;
+    int ctuWidth  = g_maxCUSize;
+    int ctuHeight = g_maxCUSize;
+    uint32_t lpelx = cu->getCUPelX();
+    uint32_t tpely = cu->getCUPelY();
+    if (plane)
+    {
+        picWidth  >>= m_hChromaShift;
+        picHeight >>= m_vChromaShift;
+        ctuWidth  >>= m_hChromaShift;
+        ctuHeight >>= m_vChromaShift;
+        lpelx     >>= m_hChromaShift;
+        tpely     >>= m_vChromaShift;
+    }
+    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
+    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
+    ctuWidth  = rpelx - lpelx;
+    ctuHeight = bpely - tpely;
+
     int startX;
     int startY;
     int endX;
     int endY;
     pixel* tmpL;
     pixel* tmpU;
-    uint32_t lpelx = tmpCu->getCUPelX();
-    uint32_t tpely = tmpCu->getCUPelY();
-    bool isLuma = !plane;
-
-    picWidthTmp  = isLuma ? m_param->sourceWidth  : m_param->sourceWidth  >> m_hChromaShift;
-    picHeightTmp = isLuma ? m_param->sourceHeight : m_param->sourceHeight >> m_vChromaShift;
-    ctuWidth     = isLuma ? g_maxCUSize : g_maxCUSize >> m_hChromaShift;
-    ctuHeight    = isLuma ? g_maxCUSize : g_maxCUSize >> m_vChromaShift;
-    lpelx        = isLuma ? lpelx       : lpelx       >> m_hChromaShift;
-    tpely        = isLuma ? tpely       : tpely       >> m_vChromaShift;
-
-    rpelx        = lpelx + ctuWidth;
-    bpely        = tpely + ctuHeight;
-    rpelx        = rpelx > picWidthTmp  ? picWidthTmp  : rpelx;
-    bpely        = bpely > picHeightTmp ? picHeightTmp : bpely;
-    ctuWidth     = rpelx - lpelx;
-    ctuHeight    = bpely - tpely;
-
-    if (!tmpCu->m_pic)
-        return;
-
-    if (plane)
-    {
-        rec    = m_pic->getPicYuvRec()->getChromaAddr(plane, addr);
-        stride = m_pic->getCStride();
-    }
-    else
-    {
-        rec    = m_pic->getPicYuvRec()->getLumaAddr(addr);
-        stride = m_pic->getStride();
-    }
 
     int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
     int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
-//   if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1)
     {
-        int cuHeightTmp = isLuma ? g_maxCUSize : (g_maxCUSize  >> m_vChromaShift);
-        pixel* recR = &rec[isLuma ? (g_maxCUSize - 1) : ((g_maxCUSize >> m_hChromaShift) - 1)];
-        for (int i = 0; i < cuHeightTmp + 1; i++)
+        const pixel* recR = &rec[ctuWidth - 1];
+        for (int i = 0; i < ctuHeight + 1; i++)
         {
             m_tmpL2[i] = *recR;
             recR += stride;
@@ -305,13 +279,13 @@
         tmpU = &(m_tmpU1[plane][lpelx]);
     }
 
-    switch (saoType)
+    switch (typeIdx)
     {
     case SAO_EO_0: // dir: -
     {
         pixel firstPxl = 0, lastPxl = 0;
         startX = !lpelx;
-        endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
+        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
         if (ctuWidth & 15)
         {
             for (y = 0; y < ctuHeight; y++)
@@ -338,7 +312,7 @@
                 if (!lpelx)
                     firstPxl = rec[0];
 
-                if (rpelx == picWidthTmp)
+                if (rpelx == picWidth)
                     lastPxl = rec[ctuWidth - 1];
 
                 primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, (int8_t)signLeft);
@@ -346,7 +320,7 @@
                 if (!lpelx)
                     rec[0] = firstPxl;
 
-                if (rpelx == picWidthTmp)
+                if (rpelx == picWidth)
                     rec[ctuWidth - 1] = lastPxl;
 
                 rec += stride;
@@ -357,7 +331,7 @@
     case SAO_EO_1: // dir: |
     {
         startY = !tpely;
-        endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
+        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
         if (!tpely)
             rec += stride;
 
@@ -383,10 +357,10 @@
     case SAO_EO_2: // dir: 135
     {
         startX = !lpelx;
-        endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
+        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
 
         startY = !tpely;
-        endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
+        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
 
         if (!tpely)
             rec += stride;
@@ -396,17 +370,15 @@
 
         for (y = startY; y < endY; y++)
         {
-            int signDown2 = signOf(rec[stride + startX] - tmpL[y]);
+            upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
             for (x = startX; x < endX; x++)
             {
-                int signDown1 = signOf(rec[x] - rec[x + stride + 1]);
-                int edgeType  = signDown1 + upBuff1[x] + 2;
-                upBufft[x + 1] = -signDown1;
+                int signDown = signOf(rec[x] - rec[x + stride + 1]);
+                int edgeType = signDown + upBuff1[x] + 2;
+                upBufft[x + 1] = -signDown;
                 rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
             }
 
-            upBufft[startX] = signDown2;
-
             std::swap(upBuff1, upBufft);
 
             rec += stride;
@@ -416,13 +388,13 @@
     }
     case SAO_EO_3: // dir: 45
     {
-        startX = (lpelx == 0) ? 1 : 0;
-        endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
+        startX = !lpelx;
+        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
 
-        startY = (tpely == 0) ? 1 : 0;
-        endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
+        startY = !tpely;
+        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
 
-        if (startY == 1)
+        if (!tpely)
             rec += stride;
 
         for (x = startX - 1; x < endX; x++)
@@ -431,15 +403,15 @@
         for (y = startY; y < endY; y++)
         {
             x = startX;
-            int signDown1 = signOf(rec[x] - tmpL[y + 1]);
-            int edgeType  = signDown1 + upBuff1[x] + 2;
-            upBuff1[x - 1] = -signDown1;
+            int signDown = signOf(rec[x] - tmpL[y + 1]);
+            int edgeType = signDown + upBuff1[x] + 2;
+            upBuff1[x - 1] = -signDown;
             rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
             for (x = startX + 1; x < endX; x++)
             {
-                signDown1 = signOf(rec[x] - rec[x + stride - 1]);
-                edgeType  = signDown1 + upBuff1[x] + 2;
-                upBuff1[x - 1] = -signDown1;
+                signDown = signOf(rec[x] - rec[x + stride - 1]);
+                edgeType = signDown + upBuff1[x] + 2;
+                upBuff1[x - 1] = -signDown;
                 rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
             }
 
@@ -474,44 +446,27 @@
 /* Process SAO all units */
 void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
 {
-    pixel *rec;
-    int picWidthTmp;
-
+    int stride = plane ? m_pic->getCStride() : m_pic->getStride();
+    uint32_t picWidth  = m_param->sourceWidth;
+    int ctuWidth  = g_maxCUSize;
+    int ctuHeight = g_maxCUSize;
     if (plane)
     {
-        rec         = m_pic->getPicYuvRec()->getChromaAddr(plane);
-        picWidthTmp = m_param->sourceWidth >> m_hChromaShift;
-    }
-    else
-    {
-        rec         = m_pic->getPicYuvRec()->getLumaAddr();
-        picWidthTmp = m_param->sourceWidth;
+        picWidth  >>= m_hChromaShift;
+        ctuWidth  >>= m_hChromaShift;
+        ctuHeight >>= m_vChromaShift;
     }
 
     if (!idxY)
-        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidthTmp);
+    {
+        pixel *rec = plane ? m_pic->getPicYuvRec()->getChromaAddr(plane) : m_pic->getPicYuvRec()->getLumaAddr();
+        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
+    }
 
-    int frameWidthInCU = m_pic->getFrameWidthInCU();
-    int stride;
-    bool isChroma = !!plane;
+    int addr = idxY * m_numCuInWidth;
+    pixel *rec = plane ? m_pic->getPicYuvRec()->getChromaAddr(plane, addr) : m_pic->getPicYuvRec()->getLumaAddr(addr);
 
-    const int boShift = X265_DEPTH - SAO_BO_BITS;
-
-    int addr = idxY * frameWidthInCU;
-    if (isChroma)
-    {
-        rec = m_pic->getPicYuvRec()->getChromaAddr(plane, addr);
-        stride = m_pic->getCStride();
-        picWidthTmp = m_param->sourceWidth >> m_hChromaShift;
-    }
-    else
-    {
-        rec = m_pic->getPicYuvRec()->getLumaAddr(addr);
-        stride = m_pic->getStride();
-        picWidthTmp = m_param->sourceWidth;
-    }
-    int maxCUHeight = isChroma ? (g_maxCUSize >> m_vChromaShift) : g_maxCUSize;
-    for (int i = 0; i < maxCUHeight + 1; i++)
+    for (int i = 0; i < ctuHeight + 1; i++)
     {
         m_tmpL1[i] = rec[0];
         rec += stride;
@@ -519,11 +474,13 @@
 
     rec -= (stride << 1);
 
-    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidthTmp);
+    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
 
-    for (int idxX = 0; idxX < frameWidthInCU; idxX++)
+    const int boShift = X265_DEPTH - SAO_BO_BITS;
+
+    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
     {
-        addr = idxY * frameWidthInCU + idxX;
+        addr = idxY * m_numCuInWidth + idxX;
 
         int typeIdx = ctuParam[addr].typeIdx;
         bool mergeLeftFlag = ctuParam[addr].mergeLeftFlag;
@@ -539,7 +496,7 @@
                     memset(offset, 0, sizeof(offset));
 
                     for (int i = 0; i < SAO_NUM_OFFSET; i++)
-                        offset[((ctuParam[addr].subTypeIdx + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
+                        offset[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
 
                     for (int i = 0; i < (1 << X265_DEPTH); i++)
                         offsetBo[i] = m_clipTable[i + offset[i >> boShift]];
@@ -557,27 +514,14 @@
             }
             processSaoCu(addr, typeIdx, plane);
         }
-        else
+        else if (idxX != (m_numCuInWidth - 1))
         {
-            if (idxX != (frameWidthInCU - 1))
+            rec = plane ? m_pic->getPicYuvRec()->getChromaAddr(plane, addr) : m_pic->getPicYuvRec()->getLumaAddr(addr);
+
+            for (int i = 0; i < ctuHeight + 1; i++)
             {
-                if (isChroma)
-                {
-                    rec = m_pic->getPicYuvRec()->getChromaAddr(plane, addr);
-                    stride = m_pic->getCStride();
-                }
-                else
-                {
-                    rec = m_pic->getPicYuvRec()->getLumaAddr(addr);
-                    stride = m_pic->getStride();
-                }
-
-                int widthShift = isChroma ? (g_maxCUSize >> m_hChromaShift) : g_maxCUSize;
-                for (int i = 0; i < maxCUHeight + 1; i++)
-                {
-                    m_tmpL1[i] = rec[widthShift - 1];
-                    rec += stride;
-                }
+                m_tmpL1[i] = rec[ctuWidth - 1];
+                rec += stride;
             }
         }
     }
@@ -591,9 +535,8 @@
     {
         ctuParam[i].mergeUpFlag   =  1;
         ctuParam[i].mergeLeftFlag =  0;
-        ctuParam[i].partIdx       =  0;
         ctuParam[i].typeIdx       = -1;
-        ctuParam[i].subTypeIdx    =  0;
+        ctuParam[i].bandPos       =  0;
         for (int j = 0; j < SAO_NUM_OFFSET; j++)
             ctuParam[i].offset[j] = 0;
     }
@@ -603,10 +546,8 @@
 {
     saoUnit->mergeUpFlag   = 0;
     saoUnit->mergeLeftFlag = 0;
-    saoUnit->partIdx       = 0;
-    saoUnit->partIdxTmp    = 0;
     saoUnit->typeIdx       = -1;
-    saoUnit->subTypeIdx    = 0;
+    saoUnit->bandPos       = 0;
 
     for (int i = 0; i < SAO_NUM_OFFSET; i++)
         saoUnit->offset[i] = 0;
@@ -617,8 +558,8 @@
     saoUnitDst->mergeLeftFlag = saoUnitSrc->mergeLeftFlag;
     saoUnitDst->mergeUpFlag   = saoUnitSrc->mergeUpFlag;
     saoUnitDst->typeIdx       = saoUnitSrc->typeIdx;
+    saoUnitDst->bandPos       = saoUnitSrc->bandPos;
 
-    saoUnitDst->subTypeIdx  = saoUnitSrc->subTypeIdx;
     for (int i = 0; i < SAO_NUM_OFFSET; i++)
         saoUnitDst->offset[i] = saoUnitSrc->offset[i];
 }
@@ -628,242 +569,240 @@
 {
     int x, y;
     TComDataCU *cu = m_pic->getCU(addr);
+    const pixel* fenc0 = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
+    const pixel* rec0  = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+    const pixel* fenc;
+    const pixel* rec;
+    int stride = plane ? m_pic->getCStride() : m_pic->getStride();
+    uint32_t picWidth  = m_param->sourceWidth;
+    uint32_t picHeight = m_param->sourceHeight;
+    int ctuWidth  = g_maxCUSize;
+    int ctuHeight = g_maxCUSize;
+    uint32_t lpelx = cu->getCUPelX();
+    uint32_t tpely = cu->getCUPelY();
+    if (plane)
+    {
+        picWidth  >>= m_hChromaShift;
+        picHeight >>= m_vChromaShift;
+        ctuWidth  >>= m_hChromaShift;
+        ctuHeight >>= m_vChromaShift;
+        lpelx     >>= m_hChromaShift;
+        tpely     >>= m_vChromaShift;
+    }
+    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
+    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
+    ctuWidth  = rpelx - lpelx;
+    ctuHeight = bpely - tpely;
 
-    pixel* fenc;
-    pixel* recon;
-    int stride;
-    int ctuHeight;
-    int ctuWidth;
-    uint32_t rpelx;
-    uint32_t bpely;
-    uint32_t picWidthTmp;
-    uint32_t picHeightTmp;
-    int64_t* stats;
-    int64_t* counts;
     int startX;
     int startY;
     int endX;
     int endY;
-    uint32_t lpelx = cu->getCUPelX();
-    uint32_t tpely = cu->getCUPelY();
+    int32_t* stats;
+    int32_t* count;
 
-    int isLuma = !plane;
-    int isChroma = !!plane;
-    int numSkipLine = isChroma ? 4 - (2 * m_vChromaShift) : 4;
-    int numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
+    int skipB = plane ? 2 : 4;
+    int skipR = plane ? 3 : 5;
 
-    picWidthTmp  = isLuma ? m_param->sourceWidth  : m_param->sourceWidth  >> m_hChromaShift;
-    picHeightTmp = isLuma ? m_param->sourceHeight : m_param->sourceHeight >> m_vChromaShift;
-    ctuWidth     = isLuma ? g_maxCUSize : g_maxCUSize >> m_hChromaShift;
-    ctuHeight    = isLuma ? g_maxCUSize : g_maxCUSize >> m_vChromaShift;
-    lpelx        = isLuma ? lpelx       : lpelx       >> m_hChromaShift;
-    tpely        = isLuma ? tpely       : tpely       >> m_vChromaShift;
+    int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
+    int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
-    rpelx     = lpelx + ctuWidth;
-    bpely     = tpely + ctuHeight;
-    rpelx     = rpelx > picWidthTmp  ? picWidthTmp  : rpelx;
-    bpely     = bpely > picHeightTmp ? picHeightTmp : bpely;
-    ctuWidth  = rpelx - lpelx;
-    ctuHeight = bpely - tpely;
-    stride    =  (plane == 0) ? m_pic->getStride() : m_pic->getCStride();
-
-    //if(iSaoType == BO_0 || iSaoType == BO_1)
+    // SAO_BO:
     {
         const int boShift = X265_DEPTH - SAO_BO_BITS;
 
         if (m_param->bSaoNonDeblocked)
         {
-            numSkipLine      = isChroma ? 3 - (2 * m_vChromaShift) : 3;
-            numSkipLineRight = isChroma ? 4 - (2 * m_hChromaShift) : 4;
+            skipB = plane ? 1 : 3;
+            skipR = plane ? 2 : 4;
         }
         stats = m_offsetOrg[plane][SAO_BO];
-        counts = m_count[plane][SAO_BO];
+        count = m_count[plane][SAO_BO];
 
-        fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-        recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+        fenc = fenc0;
+        rec  = rec0;
 
-        endX = (rpelx == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
-        endY = (bpely == picHeightTmp) ? ctuHeight : ctuHeight - numSkipLine;
+        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
+        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
+
         for (y = 0; y < endY; y++)
         {
             for (x = 0; x < endX; x++)
             {
-                int classIdx = 1 + (recon[x] >> boShift);
-                stats[classIdx] += (fenc[x] - recon[x]);
-                counts[classIdx]++;
+                int classIdx = 1 + (rec[x] >> boShift);
+                stats[classIdx] += (fenc[x] - rec[x]);
+                count[classIdx]++;
             }
 
             fenc += stride;
-            recon += stride;
+            rec += stride;
         }
     }
 
-    int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
-    int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
-
-    //if (iSaoType == EO_0  || iSaoType == EO_1 || iSaoType == EO_2 || iSaoType == EO_3)
     {
-        //if (iSaoType == EO_0)
+        // SAO_EO_0: // dir: -
         {
             if (m_param->bSaoNonDeblocked)
             {
-                numSkipLine      = isChroma ? 3 - (2 * m_vChromaShift) : 3;
-                numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
+                skipB = plane ? 1 : 3;
+                skipR = plane ? 3 : 5;
             }
             stats = m_offsetOrg[plane][SAO_EO_0];
-            counts = m_count[plane][SAO_EO_0];
+            count = m_count[plane][SAO_EO_0];
 
-            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            fenc = fenc0;
+            rec  = rec0;
 
-            startX = (lpelx == 0) ? 1 : 0;
-            endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
-            for (y = 0; y < ctuHeight - numSkipLine; y++)
+            startX = !lpelx;
+            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
+            for (y = 0; y < ctuHeight - skipB; y++)
             {
-                int signLeft = signOf(recon[startX] - recon[startX - 1]);
+                int signLeft = signOf(rec[startX] - rec[startX - 1]);
                 for (x = startX; x < endX; x++)
                 {
-                    int signRight = signOf(recon[x] - recon[x + 1]);
+                    int signRight = signOf(rec[x] - rec[x + 1]);
                     int edgeType = signRight + signLeft + 2;
                     signLeft = -signRight;
 
-                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                    counts[s_eoTable[edgeType]]++;
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
                 fenc += stride;
-                recon += stride;
+                rec += stride;
             }
         }
 
-        //if (iSaoType == EO_1)
+        // SAO_EO_1: // dir: |
         {
             if (m_param->bSaoNonDeblocked)
             {
-                numSkipLine      = isChroma ? 4 - (2 * m_vChromaShift) : 4;
-                numSkipLineRight = isChroma ? 4 - (2 * m_hChromaShift) : 4;
+                skipB = plane ? 2 : 4;
+                skipR = plane ? 2 : 4;
             }
             stats = m_offsetOrg[plane][SAO_EO_1];
-            counts = m_count[plane][SAO_EO_1];
+            count = m_count[plane][SAO_EO_1];
 
-            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            fenc = fenc0;
+            rec  = rec0;
 
-            startY = (tpely == 0) ? 1 : 0;
-            endX   = (rpelx == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
-            endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
-            if (tpely == 0)
+            startY = !tpely;
+            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
+            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
+            if (!tpely)
             {
                 fenc += stride;
-                recon += stride;
+                rec += stride;
             }
 
             for (x = 0; x < ctuWidth; x++)
-                upBuff1[x] = signOf(recon[x] - recon[x - stride]);
+                upBuff1[x] = signOf(rec[x] - rec[x - stride]);
 
             for (y = startY; y < endY; y++)
             {
                 for (x = 0; x < endX; x++)
                 {
-                    int signDown = signOf(recon[x] - recon[x + stride]);
+                    int signDown = signOf(rec[x] - rec[x + stride]);
                     int edgeType = signDown + upBuff1[x] + 2;
                     upBuff1[x] = -signDown;
 
-                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                    counts[s_eoTable[edgeType]]++;
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
                 fenc += stride;
-                recon += stride;
+                rec += stride;
             }
         }
-        //if (iSaoType == EO_2)
+
+        // SAO_EO_2: // dir: 135
         {
             if (m_param->bSaoNonDeblocked)
             {
-                numSkipLine      = isChroma ? 4 - (2 * m_vChromaShift) : 4;
-                numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
+                skipB = plane ? 2 : 4;
+                skipR = plane ? 3 : 5;
             }
             stats = m_offsetOrg[plane][SAO_EO_2];
-            counts = m_count[plane][SAO_EO_2];
+            count = m_count[plane][SAO_EO_2];
 
-            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            fenc = fenc0;
+            rec  = rec0;
 
-            startX = (lpelx == 0) ? 1 : 0;
-            endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
+            startX = !lpelx;
+            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
 
-            startY = (tpely == 0) ? 1 : 0;
-            endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
-            if (tpely == 0)
+            startY = !tpely;
+            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
+            if (!tpely)
             {
                 fenc += stride;
-                recon += stride;
+                rec += stride;
             }
 
             for (x = startX; x < endX; x++)
-                upBuff1[x] = signOf(recon[x] - recon[x - stride - 1]);
+                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
 
             for (y = startY; y < endY; y++)
             {
-                int signDown2 = signOf(recon[stride + startX] - recon[startX - 1]);
+                upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
                 for (x = startX; x < endX; x++)
                 {
-                    int signDown1 = signOf(recon[x] - recon[x + stride + 1]);
-                    int edgeType  = signDown1 + upBuff1[x] + 2;
-                    upBufft[x + 1] = -signDown1;
-                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                    counts[s_eoTable[edgeType]]++;
+                    int signDown = signOf(rec[x] - rec[x + stride + 1]);
+                    int edgeType = signDown + upBuff1[x] + 2;
+                    upBufft[x + 1] = -signDown;
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
-                upBufft[startX] = signDown2;
                 std::swap(upBuff1, upBufft);
 
-                recon += stride;
+                rec += stride;
                 fenc += stride;
             }
         }
-        //if (iSaoType == EO_3)
+
+        // SAO_EO_3: // dir: 45
         {
             if (m_param->bSaoNonDeblocked)
             {
-                numSkipLine      = isChroma ? 4 - (2 * m_vChromaShift) : 4;
-                numSkipLineRight = isChroma ? 5 - (2 * m_hChromaShift) : 5;
+                skipB = plane ? 2 : 4;
+                skipR = plane ? 3 : 5;
             }
             stats = m_offsetOrg[plane][SAO_EO_3];
-            counts = m_count[plane][SAO_EO_3];
+            count = m_count[plane][SAO_EO_3];
 
-            fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-            recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            fenc = fenc0;
+            rec  = rec0;
 
-            startX = (lpelx == 0) ? 1 : 0;
-            endX   = (rpelx == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
+            startX = !lpelx;
+            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
 
-            startY = (tpely == 0) ? 1 : 0;
-            endY   = (bpely == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
-            if (startY == 1)
+            startY = !tpely;
+            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
+
+            if (!tpely)
             {
                 fenc += stride;
-                recon += stride;
+                rec += stride;
             }
 
             for (x = startX - 1; x < endX; x++)
-                upBuff1[x] = signOf(recon[x] - recon[x - stride + 1]);
+                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
 
             for (y = startY; y < endY; y++)
             {
                 for (x = startX; x < endX; x++)
                 {
-                    int signDown1 = signOf(recon[x] - recon[x + stride - 1]);
-                    int edgeType  = signDown1 + upBuff1[x] + 2;
-                    upBuff1[x - 1] = -signDown1;
-                    stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                    counts[s_eoTable[edgeType]]++;
+                    int signDown = signOf(rec[x] - rec[x + stride - 1]);
+                    int edgeType = signDown + upBuff1[x] + 2;
+                    upBuff1[x - 1] = -signDown;
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
-                upBuff1[endX - 1] = signOf(recon[endX - 1 + stride] - recon[endX]);
+                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
 
-                recon += stride;
+                rec += stride;
                 fenc += stride;
             }
         }
@@ -872,277 +811,266 @@
 
 void SAO::calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY)
 {
+    int addr    = idxX + m_numCuInWidth * idxY;
+
     int x, y;
+    TComDataCU *cu = pic->getCU(addr);
+    const pixel* fenc;
+    const pixel* rec;
+    int stride = m_pic->getStride();
+    uint32_t picWidth  = m_param->sourceWidth;
+    uint32_t picHeight = m_param->sourceHeight;
+    int ctuWidth  = g_maxCUSize;
+    int ctuHeight = g_maxCUSize;
+    uint32_t lpelx = cu->getCUPelX();
+    uint32_t tpely = cu->getCUPelY();
+    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
+    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
+    ctuWidth  = rpelx - lpelx;
+    ctuHeight = bpely - tpely;
 
-    pixel* fenc;
-    pixel* recon;
-    int stride;
-    uint32_t rPelX;
-    uint32_t bPelY;
-    int64_t* stats;
-    int64_t* count;
     int startX;
     int startY;
     int endX;
     int endY;
     int firstX, firstY;
+    int32_t* stats;
+    int32_t* count;
 
-    int frameWidthInCU  = m_numCuInWidth;
+    int skipB, skipR;
 
-    int isChroma;
-    int numSkipLine, numSkipLineRight;
-
-    uint32_t lPelX, tPelY;
-    TComDataCU *cu;
     int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
     int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
     const int boShift = X265_DEPTH - SAO_BO_BITS;
 
-    // NOTE: Row
+    memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
+    memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
+
+    for (int plane = 0; plane < NUM_PLANE; plane++)
     {
-        // NOTE: Col
+        if (plane == 1)
         {
-            int addr    = idxX + frameWidthInCU * idxY;
-            cu      = pic->getCU(addr);
+            stride = pic->getCStride();
+            picWidth  >>= m_hChromaShift;
+            picHeight >>= m_vChromaShift;
+            ctuWidth  >>= m_hChromaShift;
+            ctuHeight >>= m_vChromaShift;
+            lpelx     >>= m_hChromaShift;
+            tpely     >>= m_vChromaShift;
+            rpelx     >>= m_hChromaShift;
+            bpely     >>= m_vChromaShift;
+        }
 
-            uint32_t picWidthTmp  = m_param->sourceWidth;
-            uint32_t picHeightTmp = m_param->sourceHeight;
-            int ctuWidth  = g_maxCUSize;
-            int ctuHeight = g_maxCUSize;
-            lPelX   = cu->getCUPelX();
-            tPelY   = cu->getCUPelY();
-            rPelX     = lPelX + ctuWidth;
-            bPelY     = tPelY + ctuHeight;
-            rPelX     = rPelX > picWidthTmp  ? picWidthTmp  : rPelX;
-            bPelY     = bPelY > picHeightTmp ? picHeightTmp : bPelY;
-            ctuWidth  = rPelX - lPelX;
-            ctuHeight = bPelY - tPelY;
+        // SAO_BO:
 
-            memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
-            memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
+        skipB = plane ? 1 : 3;
+        skipR = plane ? 2 : 4;
 
-            for (int plane = 0; plane < 3; plane++)
+        stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
+        count = m_countPreDblk[addr][plane][SAO_BO];
+
+        const pixel* fenc0 = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
+        const pixel* rec0  = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+        fenc = fenc0;
+        rec  = rec0;
+
+        startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
+        startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
+
+        for (y = 0; y < ctuHeight; y++)
+        {
+            for (x = (y < startY ? startX : 0); x < ctuWidth; x++)
             {
-                isChroma = !!plane;
-                if (plane == 1)
+                int classIdx = 1 + (rec[x] >> boShift);
+                stats[classIdx] += (fenc[x] - rec[x]);
+                count[classIdx]++;
+            }
+
+            fenc += stride;
+            rec += stride;
+        }
+
+        // SAO_EO_0: // dir: -
+        {
+            skipB = plane ? 1 : 3;
+            skipR = plane ? 3 : 5;
+
+            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
+            count = m_countPreDblk[addr][plane][SAO_EO_0];
+
+            fenc = fenc0;
+            rec  = rec0;
+
+            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
+            startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
+            firstX = !lpelx;
+            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
+            endX   = ctuWidth - 1;  // not refer right CTU
+
+            for (y = 0; y < ctuHeight; y++)
+            {
+                x = (y < startY ? startX : firstX);
+                int signLeft = signOf(rec[x] - rec[x - 1]);
+                for (; x < endX; x++)
                 {
-                    picWidthTmp  >>= m_hChromaShift;
-                    picHeightTmp >>= m_vChromaShift;
-                    ctuWidth     >>= m_hChromaShift;
-                    ctuHeight    >>= m_vChromaShift;
-                    lPelX        >>= m_hChromaShift;
-                    tPelY        >>= m_vChromaShift;
-                    rPelX     = lPelX + ctuWidth;
-                    bPelY     = tPelY + ctuHeight;
+                    int signRight = signOf(rec[x] - rec[x + 1]);
+                    int edgeType = signRight + signLeft + 2;
+                    signLeft = -signRight;
+
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
-                stride   = (plane == 0) ? pic->getStride() : pic->getCStride();
+                fenc += stride;
+                rec += stride;
+            }
+        }
 
-                //if(iSaoType == BO)
+        // SAO_EO_1: // dir: |
+        {
+            skipB = plane ? 2 : 4;
+            skipR = plane ? 2 : 4;
 
-                numSkipLine = isChroma ? 1 : 3;
-                numSkipLineRight = isChroma ? 2 : 4;
+            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
+            count = m_countPreDblk[addr][plane][SAO_EO_1];
 
-                stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
-                count = m_countPreDblk[addr][plane][SAO_BO];
+            fenc = fenc0;
+            rec  = rec0;
 
-                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
+            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
+            firstY = !tpely;
+            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
+            endY   = ctuHeight - 1; // not refer below CTU
+            if (!tpely)
+            {
+                fenc += stride;
+                rec += stride;
+            }
 
-                startX = (rPelX == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
-                startY = (bPelY == picHeightTmp) ? ctuHeight : ctuHeight - numSkipLine;
+            for (x = startX; x < ctuWidth; x++)
+                upBuff1[x] = signOf(rec[x] - rec[x - stride]);
 
-                for (y = 0; y < ctuHeight; y++)
+            for (y = firstY; y < endY; y++)
+            {
+                for (x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++)
                 {
-                    for (x = 0; x < ctuWidth; x++)
-                    {
-                        if (x < startX && y < startY)
-                            continue;
+                    int signDown = signOf(rec[x] - rec[x + stride]);
+                    int edgeType = signDown + upBuff1[x] + 2;
+                    upBuff1[x] = -signDown;
 
-                        int classIdx = 1 + (recon[x] >> boShift);
-                        stats[classIdx] += (fenc[x] - recon[x]);
-                        count[classIdx]++;
-                    }
+                    if (x < startX && y < startY)
+                        continue;
 
-                    fenc += stride;
-                    recon += stride;
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
-                //if (iSaoType == EO_0)
+                fenc += stride;
+                rec += stride;
+            }
+        }
 
-                numSkipLine = isChroma ? 1 : 3;
-                numSkipLineRight = isChroma ? 3 : 5;
+        // SAO_EO_2: // dir: 135
+        {
+            skipB = plane ? 2 : 4;
+            skipR = plane ? 3 : 5;
 
-                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
-                count = m_countPreDblk[addr][plane][SAO_EO_0];
+            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
+            count = m_countPreDblk[addr][plane][SAO_EO_2];
 
-                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            fenc = fenc0;
+            rec  = rec0;
 
-                startX = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
-                startY = (bPelY == picHeightTmp) ? ctuHeight : ctuHeight - numSkipLine;
-                firstX = (lPelX == 0) ? 1 : 0;
-                endX   = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
+            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
+            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
+            firstX = !lpelx;
+            firstY = !tpely;
+            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
+            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
+            endX   = ctuWidth - 1;  // not refer right CTU
+            endY   = ctuHeight - 1; // not refer below CTU
+            if (!tpely)
+            {
+                fenc += stride;
+                rec += stride;
+            }
 
-                for (y = 0; y < ctuHeight; y++)
+            for (x = startX; x < endX; x++)
+                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
+
+            for (y = firstY; y < endY; y++)
+            {
+                x = (y < startY - 1 ? startX : firstX);
+                upBufft[x] = signOf(rec[x + stride] - rec[x - 1]);
+                for (; x < endX; x++)
                 {
-                    int signLeft = signOf(recon[firstX] - recon[firstX - 1]);
-                    for (x = firstX; x < endX; x++)
-                    {
-                        int signRight =  signOf(recon[x] - recon[x + 1]);
-                        int edgeType =  signRight + signLeft + 2;
-                        signLeft  = -signRight;
+                    int signDown = signOf(rec[x] - rec[x + stride + 1]);
+                    int edgeType = signDown + upBuff1[x] + 2;
+                    upBufft[x + 1] = -signDown;
 
-                        if (x < startX && y < startY)
-                            continue;
+                    if (x < startX && y < startY)
+                        continue;
 
-                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                        count[s_eoTable[edgeType]]++;
-                    }
-
-                    fenc += stride;
-                    recon += stride;
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
-                //if (iSaoType == EO_1)
+                std::swap(upBuff1, upBufft);
 
-                numSkipLine = isChroma ? 2 : 4;
-                numSkipLineRight = isChroma ? 2 : 4;
+                rec += stride;
+                fenc += stride;
+            }
+        }
 
-                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
-                count = m_countPreDblk[addr][plane][SAO_EO_1];
+        // SAO_EO_3: // dir: 45
+        {
+            skipB = plane ? 2 : 4;
+            skipR = plane ? 3 : 5;
 
-                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
+            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
+            count = m_countPreDblk[addr][plane][SAO_EO_3];
 
-                startX = (rPelX == picWidthTmp) ? ctuWidth : ctuWidth - numSkipLineRight;
-                startY = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
-                firstY = (tPelY == 0) ? 1 : 0;
-                endY   = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
-                if (firstY == 1)
+            fenc = fenc0;
+            rec  = rec0;
+
+            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
+            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
+            firstX = !lpelx;
+            firstY = !tpely;
+            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
+            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
+            endX   = ctuWidth - 1;  // not refer right CTU
+            endY   = ctuHeight - 1; // not refer below CTU
+            if (!tpely)
+            {
+                fenc += stride;
+                rec += stride;
+            }
+
+            for (x = startX - 1; x < endX; x++)
+                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
+
+            for (y = firstY; y < endY; y++)
+            {
+                for (x = (y < startY - 1 ? startX : firstX); x < endX; x++)
                 {
-                    fenc += stride;
-                    recon += stride;
+                    int signDown = signOf(rec[x] - rec[x + stride - 1]);
+                    int edgeType = signDown + upBuff1[x] + 2;
+                    upBuff1[x - 1] = -signDown;
+
+                    if (x < startX && y < startY)
+                        continue;
+
+                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
+                    count[s_eoTable[edgeType]]++;
                 }
 
-                for (x = 0; x < ctuWidth; x++)
-                    upBuff1[x] = signOf(recon[x] - recon[x - stride]);
+                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
 
-                for (y = firstY; y < endY; y++)
-                {
-                    for (x = 0; x < ctuWidth; x++)
-                    {
-                        int signDown = signOf(recon[x] - recon[x + stride]);
-                        int edgeType = signDown + upBuff1[x] + 2;
-                        upBuff1[x] = -signDown;
-
-                        if (x < startX && y < startY)
-                            continue;
-
-                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                        count[s_eoTable[edgeType]]++;
-                    }
-
-                    fenc += stride;
-                    recon += stride;
-                }
-
-                //if (iSaoType == EO_2)
-
-                numSkipLine = isChroma ? 2 : 4;
-                numSkipLineRight = isChroma ? 3 : 5;
-
-                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
-                count = m_countPreDblk[addr][plane][SAO_EO_2];
-
-                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
-
-                startX = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
-                startY = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
-                firstX = (lPelX == 0) ? 1 : 0;
-                firstY = (tPelY == 0) ? 1 : 0;
-                endX   = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
-                endY   = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
-                if (firstY == 1)
-                {
-                    fenc += stride;
-                    recon += stride;
-                }
-
-                for (x = firstX; x < endX; x++)
-                    upBuff1[x] = signOf(recon[x] - recon[x - stride - 1]);
-
-                for (y = firstY; y < endY; y++)
-                {
-                    int signDown2 = signOf(recon[stride + startX] - recon[startX - 1]);
-                    for (x = firstX; x < endX; x++)
-                    {
-                        int signDown1 = signOf(recon[x] - recon[x + stride + 1]);
-                        int edgeType = signDown1 + upBuff1[x] + 2;
-                        upBufft[x + 1] = -signDown1;
-
-                        if (x < startX && y < startY)
-                            continue;
-
-                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                        count[s_eoTable[edgeType]]++;
-                    }
-
-                    upBufft[firstX] = signDown2;
-                    std::swap(upBuff1, upBufft);
-
-                    recon += stride;
-                    fenc += stride;
-                }
-
-                //if (iSaoType == EO_3)
-
-                numSkipLine = isChroma ? 2 : 4;
-                numSkipLineRight = isChroma ? 3 : 5;
-
-                stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
-                count = m_countPreDblk[addr][plane][SAO_EO_3];
-
-                fenc = m_pic->getPicYuvOrg()->getPlaneAddr(plane, addr);
-                recon = m_pic->getPicYuvRec()->getPlaneAddr(plane, addr);
-
-                startX = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth - numSkipLineRight;
-                startY = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight - numSkipLine;
-                firstX = (lPelX == 0) ? 1 : 0;
-                firstY = (tPelY == 0) ? 1 : 0;
-                endX   = (rPelX == picWidthTmp) ? ctuWidth - 1 : ctuWidth;
-                endY   = (bPelY == picHeightTmp) ? ctuHeight - 1 : ctuHeight;
-                if (firstY == 1)
-                {
-                    fenc += stride;
-                    recon += stride;
-                }
-
-                for (x = firstX - 1; x < endX; x++)
-                    upBuff1[x] = signOf(recon[x] - recon[x - stride + 1]);
-
-                for (y = firstY; y < endY; y++)
-                {
-                    for (x = firstX; x < endX; x++)
-                    {
-                        int signDown1 = signOf(recon[x] - recon[x + stride - 1]);
-                        int edgeType  = signDown1 + upBuff1[x] + 2;
-                        upBuff1[x - 1] = -signDown1;
-
-                        if (x < startX && y < startY)
-                            continue;
-
-                        stats[s_eoTable[edgeType]] += (fenc[x] - recon[x]);
-                        count[s_eoTable[edgeType]]++;
-                    }
-
-                    upBuff1[endX - 1] = signOf(recon[endX - 1 + stride] - recon[endX]);
-
-                    recon += stride;
-                    fenc += stride;
-                }
+                rec += stride;
+                fenc += stride;
             }
         }
     }
@@ -1151,69 +1079,9 @@
 /* reset offset statistics */
 void SAO::resetStats()
 {
-    for (int i = 0; i < NUM_PLANE; i++)
-    {
-        for (int j = 0; j < MAX_NUM_SAO_TYPE; j++)
-        {
-            for (int k = 0; k < MAX_NUM_SAO_CLASS; k++)
-            {
-                m_count[i][j][k] = 0;
-                m_offset[i][j][k] = 0;
-                m_offsetOrg[i][j][k] = 0;
-            }
-        }
-    }
-}
-
-/* Check merge SAO unit */
-void SAO::checkMerge(SaoCtuParam * saoUnitCurr, SaoCtuParam * saoUnitCheck, int dir)
-{
-    int countDiff = 0;
-
-    if (saoUnitCurr->partIdx != saoUnitCheck->partIdx)
-    {
-        if (saoUnitCurr->typeIdx >= 0)
-        {
-            if (saoUnitCurr->typeIdx == saoUnitCheck->typeIdx)
-            {
-                for (int i = 0; i < SAO_NUM_OFFSET; i++)
-                    countDiff += (saoUnitCurr->offset[i] != saoUnitCheck->offset[i]);
-
-                countDiff += (saoUnitCurr->subTypeIdx != saoUnitCheck->subTypeIdx);
-                if (countDiff == 0)
-                {
-                    saoUnitCurr->partIdx = saoUnitCheck->partIdx;
-                    if (dir == 1)
-                    {
-                        saoUnitCurr->mergeUpFlag = 1;
-                        saoUnitCurr->mergeLeftFlag = 0;
-                    }
-                    else
-                    {
-                        saoUnitCurr->mergeUpFlag = 0;
-                        saoUnitCurr->mergeLeftFlag = 1;
-                    }
-                }
-            }
-        }
-        else
-        {
-            if (saoUnitCurr->typeIdx == saoUnitCheck->typeIdx)
-            {
-                saoUnitCurr->partIdx = saoUnitCheck->partIdx;
-                if (dir == 1)
-                {
-                    saoUnitCurr->mergeUpFlag = 1;
-                    saoUnitCurr->mergeLeftFlag = 0;
-                }
-                else
-                {
-                    saoUnitCurr->mergeUpFlag = 0;
-                    saoUnitCurr->mergeLeftFlag = 1;
-                }
-            }
-        }
-    }
+    memset(m_count, 0, sizeof(PerClass) * NUM_PLANE);
+    memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE);
+    memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE);
 }
 
 void SAO::rdoSaoUnitRowInit(SAOParam *saoParam)
@@ -1244,25 +1112,17 @@
 
 void SAO::rdoSaoUnitRow(SAOParam *saoParam, int idxY)
 {
-    int frameWidthInCU  = saoParam->numCuInWidth;
     int j, k;
-    int compIdx = 0;
     SaoCtuParam mergeSaoParam[3][2];
     double compDistortion[3];
+    int allowMergeUp   = (idxY > 0);
 
-    for (int idxX = 0; idxX < frameWidthInCU; idxX++)
+    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
     {
-        int addr     = idxX + idxY * frameWidthInCU;
-        int addrUp   = idxY == 0 ? -1 : addr - frameWidthInCU;
+        int addr     = idxX + idxY * m_numCuInWidth;
+        int addrUp   = idxY == 0 ? -1 : addr - m_numCuInWidth;
         int addrLeft = idxX == 0 ? -1 : addr - 1;
-        int allowMergeLeft = 1;
-        int allowMergeUp   = 1;
-        uint32_t rate;
-        double bestCost, mergeCost;
-        if (idxX == 0)
-            allowMergeLeft = 0;
-        if (idxY == 0)
-            allowMergeUp = 0;
+        int allowMergeLeft = (idxX > 0);
 
         compDistortion[0] = 0;
         compDistortion[1] = 0;
@@ -1274,32 +1134,32 @@
             m_entropyCoder.codeSaoMerge(0);
         m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
         // reset stats Y, Cb, Cr
-        for (compIdx = 0; compIdx < 3; compIdx++)
+        for (int plane = 0; plane < 3; plane++)
         {
             for (j = 0; j < MAX_NUM_SAO_TYPE; j++)
             {
                 for (k = 0; k < MAX_NUM_SAO_CLASS; k++)
                 {
-                    m_offset[compIdx][j][k] = 0;
+                    m_offset[plane][j][k] = 0;
                     if (m_param->bSaoNonDeblocked)
                     {
-                        m_count[compIdx][j][k] = m_countPreDblk[addr][compIdx][j][k];
-                        m_offsetOrg[compIdx][j][k] = m_offsetOrgPreDblk[addr][compIdx][j][k];
+                        m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k];
+                        m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k];
                     }
                     else
                     {
-                        m_count[compIdx][j][k] = 0;
-                        m_offsetOrg[compIdx][j][k] = 0;
+                        m_count[plane][j][k] = 0;
+                        m_offsetOrg[plane][j][k] = 0;
                     }
                 }
             }
 
-            saoParam->ctuParam[compIdx][addr].typeIdx       = -1;
-            saoParam->ctuParam[compIdx][addr].mergeUpFlag   = 0;
-            saoParam->ctuParam[compIdx][addr].mergeLeftFlag = 0;
-            saoParam->ctuParam[compIdx][addr].subTypeIdx    = 0;
-            if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
-                calcSaoStatsCu(addr, compIdx);
+            saoParam->ctuParam[plane][addr].typeIdx       = -1;
+            saoParam->ctuParam[plane][addr].mergeUpFlag   = 0;
+            saoParam->ctuParam[plane][addr].mergeLeftFlag = 0;
+            saoParam->ctuParam[plane][addr].bandPos    = 0;
+            if ((plane == 0 && saoParam->bSaoFlag[0]) || (plane > 0 && saoParam->bSaoFlag[1]))
+                calcSaoStatsCu(addr, plane);
         }
 
         saoComponentParamDist(allowMergeLeft, allowMergeUp, saoParam, addr, addrUp, addrLeft,
@@ -1317,14 +1177,14 @@
                 m_entropyCoder.codeSaoMerge(0);
             if (allowMergeUp)
                 m_entropyCoder.codeSaoMerge(0);
-            for (compIdx = 0; compIdx < 3; compIdx++)
+            for (int plane = 0; plane < 3; plane++)
             {
-                if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
-                    m_entropyCoder.codeSaoOffset(&saoParam->ctuParam[compIdx][addr], compIdx);
+                if ((plane == 0 && saoParam->bSaoFlag[0]) || (plane > 0 && saoParam->bSaoFlag[1]))
+                    m_entropyCoder.codeSaoOffset(&saoParam->ctuParam[plane][addr], plane);
             }
 
-            rate = m_entropyCoder.getNumberOfWrittenBits();
-            bestCost = compDistortion[0] + (double)rate;
+            uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
+            double bestCost = compDistortion[0] + (double)rate;
             m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
 
             // Cost of Merge
@@ -1340,17 +1200,17 @@
                         m_entropyCoder.codeSaoMerge(1);
 
                     rate = m_entropyCoder.getNumberOfWrittenBits();
-                    mergeCost = compDistortion[mergeUp + 1] + (double)rate;
+                    double mergeCost = compDistortion[mergeUp + 1] + (double)rate;
                     if (mergeCost < bestCost)
                     {
                         bestCost = mergeCost;
                         m_entropyCoder.store(m_rdEntropyCoders[0][CI_TEMP_BEST]);
-                        for (compIdx = 0; compIdx < 3; compIdx++)
+                        for (int plane = 0; plane < 3; plane++)
                         {
-                            mergeSaoParam[compIdx][mergeUp].mergeLeftFlag = !mergeUp;
-                            mergeSaoParam[compIdx][mergeUp].mergeUpFlag = !!mergeUp;
-                            if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
-                                copySaoUnit(&saoParam->ctuParam[compIdx][addr], &mergeSaoParam[compIdx][mergeUp]);
+                            mergeSaoParam[plane][mergeUp].mergeLeftFlag = !mergeUp;
+                            mergeSaoParam[plane][mergeUp].mergeUpFlag = !!mergeUp;
+                            if ((plane == 0 && saoParam->bSaoFlag[0]) || (plane > 0 && saoParam->bSaoFlag[1]))
+                                copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeUp]);
                         }
                     }
                 }
@@ -1367,92 +1227,85 @@
 }
 
 /** rate distortion optimization of SAO unit */
-inline int64_t SAO::estSaoTypeDist(int compIdx, int typeIdx, int shift, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo)
+inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo)
 {
     int64_t estDist = 0;
 
     for (int classIdx = 1; classIdx < ((typeIdx < SAO_BO) ?  SAO_EO_LEN + 1 : SAO_NUM_BO_CLASSES + 1); classIdx++)
     {
+        int32_t  count = m_count[plane][typeIdx][classIdx];
+        int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
+        int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
+
         if (typeIdx == SAO_BO)
         {
             currentDistortionTableBo[classIdx - 1] = 0;
             currentRdCostTableBo[classIdx - 1] = lambda;
         }
-        if (m_count[compIdx][typeIdx][classIdx])
+        if (count)
         {
-            m_offset[compIdx][typeIdx][classIdx] = (int64_t)roundIDBI((double)(m_offsetOrg[compIdx][typeIdx][classIdx] << (X265_DEPTH - 8)) / (double)(m_count[compIdx][typeIdx][classIdx] << SAO_BIT_INC));
-            m_offset[compIdx][typeIdx][classIdx] = Clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, (int)m_offset[compIdx][typeIdx][classIdx]);
+            int offset = roundIBDI(offsetOrg, count << SAO_BIT_INC);
+            offset = Clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
             if (typeIdx < SAO_BO)
             {
-                if (m_offset[compIdx][typeIdx][classIdx] < 0 && classIdx < 3)
-                    m_offset[compIdx][typeIdx][classIdx] = 0;
-                if (m_offset[compIdx][typeIdx][classIdx] > 0 && classIdx >= 3)
-                    m_offset[compIdx][typeIdx][classIdx] = 0;
+                if (classIdx < 3)
+                    offset = X265_MAX(offset, 0);
+                else
+                    offset = X265_MIN(offset, 0);
             }
-            m_offset[compIdx][typeIdx][classIdx] = estIterOffset(typeIdx, classIdx, lambda, m_offset[compIdx][typeIdx][classIdx], m_count[compIdx][typeIdx][classIdx], m_offsetOrg[compIdx][typeIdx][classIdx], shift, SAO_BIT_INC, currentDistortionTableBo, currentRdCostTableBo, OFFSET_THRESH);
+            offsetOut = estIterOffset(typeIdx, classIdx, lambda, offset, count, offsetOrg, currentDistortionTableBo, currentRdCostTableBo);
         }
         else
         {
-            m_offsetOrg[compIdx][typeIdx][classIdx] = 0;
-            m_offset[compIdx][typeIdx][classIdx] = 0;
+            offsetOrg = 0;
+            offsetOut = 0;
         }
         if (typeIdx != SAO_BO)
-            estDist += estSaoDist(m_count[compIdx][typeIdx][classIdx], m_offset[compIdx][typeIdx][classIdx] << SAO_BIT_INC, m_offsetOrg[compIdx][typeIdx][classIdx], shift);
+            estDist += estSaoDist(count, (int)offsetOut << SAO_BIT_INC, offsetOrg);
     }
 
     return estDist;
 }
 
-inline int64_t SAO::estSaoDist(int64_t count, int64_t offset, int64_t offsetOrg, int shift)
+inline int SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, int32_t *currentDistortionTableBo, double *currentRdCostTableBo)
 {
-    return (count * offset * offset - offsetOrg * offset * 2) >> shift;
-}
+    int offsetOut = 0;
 
-inline int64_t SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int64_t offsetInput, int64_t count, int64_t offsetOrg, int shift, int bitIncrease, int32_t *currentDistortionTableBo, double *currentRdCostTableBo, int offsetTh)
-{
-    //Clean up, best_q_offset.
-    int64_t iterOffset, tempOffset;
-    int64_t tempDist, tempRate;
-    int64_t offsetOutput = 0;
-
-    iterOffset = offsetInput;
     // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here.
     double tempMinCost = lambda;
-    while (iterOffset != 0)
+    while (offset != 0)
     {
         // Calculate the bits required for signalling the offset
-        tempRate = (typeIdx == SAO_BO) ? (abs((int)iterOffset) + 2) : (abs((int)iterOffset) + 1);
-        if (abs((int)iterOffset) == offsetTh - 1)
+        int tempRate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
+        if (abs(offset) == OFFSET_THRESH - 1)
             tempRate--;
 
         // Do the dequntization before distorion calculation
-        tempOffset = iterOffset << bitIncrease;
-        tempDist   = estSaoDist(count, tempOffset, offsetOrg, shift);
+        int tempOffset = offset << SAO_BIT_INC;
+        int64_t tempDist  = estSaoDist(count, tempOffset, offsetOrg);
         double tempCost   = ((double)tempDist + lambda * (double)tempRate);
         if (tempCost < tempMinCost)
         {
             tempMinCost = tempCost;
-            offsetOutput = iterOffset;
+            offsetOut = offset;
             if (typeIdx == SAO_BO)
             {
                 currentDistortionTableBo[classIdx - 1] = (int)tempDist;
                 currentRdCostTableBo[classIdx - 1] = tempCost;
             }
         }
-        iterOffset = (iterOffset > 0) ? (iterOffset - 1) : (iterOffset + 1);
+        offset = (offset > 0) ? (offset - 1) : (offset + 1);
     }
 
-    return offsetOutput;
+    return offsetOut;
 }
 
 void SAO::saoComponentParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
                                 SaoCtuParam *compSaoParam, double *compDistortion)
 {
-    int64_t estDist;
-    int64_t bestDist;
+    int64_t bestDist = 0;
 
     SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
-    SaoCtuParam* ctuParamNeighbor = NULL;
     SaoCtuParam  ctuParamRdo;
 
     resetSaoUnit(&ctuParamRdo);
@@ -1460,7 +1313,6 @@
     resetSaoUnit(&compSaoParam[1]);
     resetSaoUnit(lclCtuParam);
 
-    double dCostPartBest = MAX_DOUBLE;
     double bestRDCostTableBo = MAX_DOUBLE;
     int    bestClassTableBo  = 0;
     int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
@@ -1469,13 +1321,12 @@
     m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
     m_entropyCoder.resetBits();
     m_entropyCoder.codeSaoOffset(&ctuParamRdo, 0);
-    dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
+    double dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
     copySaoUnit(lclCtuParam, &ctuParamRdo);
-    bestDist = 0;
 
     for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
     {
-        estDist = estSaoTypeDist(0, typeIdx, 0, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);
+        int64_t estDist = estSaoTypeDist(0, typeIdx, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);
 
         if (typeIdx == SAO_BO)
         {
@@ -1503,16 +1354,16 @@
         ctuParamRdo.typeIdx = typeIdx;
         ctuParamRdo.mergeLeftFlag = 0;
         ctuParamRdo.mergeUpFlag   = 0;
-        ctuParamRdo.subTypeIdx = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
+        ctuParamRdo.bandPos = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
         for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
-            ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.subTypeIdx + 1];
+            ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.bandPos + 1];
 
         m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
         m_entropyCoder.resetBits();
         m_entropyCoder.codeSaoOffset(&ctuParamRdo, 0);
 
         uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
-        double cost = (double)((double)estDist + m_lumaLambda * (double)estRate);
+        double cost = (double)estDist + m_lumaLambda * (double)estRate;
 
         if (cost < dCostPartBest)
         {
@@ -1531,27 +1382,24 @@
 
     for (int idxNeighbor = 0; idxNeighbor < 2; idxNeighbor++)
     {
-        ctuParamNeighbor = NULL;
+        SaoCtuParam* ctuParamNeighbor = NULL;
         if (allowMergeLeft && addrLeft >= 0 && idxNeighbor == 0)
             ctuParamNeighbor = &(saoParam->ctuParam[0][addrLeft]);
         else if (allowMergeUp && addrUp >= 0 && idxNeighbor == 1)
             ctuParamNeighbor = &(saoParam->ctuParam[0][addrUp]);
         if (ctuParamNeighbor != NULL)
         {
-            estDist = 0;
+            int64_t estDist = 0;
             int typeIdx = ctuParamNeighbor->typeIdx;
             if (typeIdx >= 0)
             {
-                int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor->subTypeIdx : 0;
-                int mergeOffset;
+                int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor->bandPos : 0;
                 for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                 {
-                    mergeOffset = ctuParamNeighbor->offset[classIdx];
-                    estDist += estSaoDist(m_count[0][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + mergeBandPosition + 1],  0);
+                    int mergeOffset = ctuParamNeighbor->offset[classIdx];
+                    estDist += estSaoDist(m_count[0][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + mergeBandPosition + 1]);
                 }
             }
-            else
-                estDist = 0;
 
             copySaoUnit(&compSaoParam[idxNeighbor], ctuParamNeighbor);
             compSaoParam[idxNeighbor].mergeUpFlag   = !!idxNeighbor;
@@ -1565,11 +1413,9 @@
 void SAO::sao2ChromaParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
                               SaoCtuParam *crSaoParam, SaoCtuParam *cbSaoParam, double *distortion)
 {
-    int64_t estDist[2];
     int64_t bestDist = 0;
 
     SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };
-    SaoCtuParam* ctuParamNeighbor[2] = { NULL, NULL };
     SaoCtuParam* saoMergeParam[2][2];
     SaoCtuParam  ctuParamRdo[2];
 
@@ -1587,8 +1433,6 @@
     resetSaoUnit(&ctuParamRdo[0]);
     resetSaoUnit(&ctuParamRdo[1]);
 
-    double costPartBest = MAX_DOUBLE;
-    double bestRDCostTableBo;
     double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
     int    bestClassTableBo[2] = { 0, 0 };
     int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
@@ -1598,19 +1442,20 @@
     m_entropyCoder.codeSaoOffset(&ctuParamRdo[0], 1);
     m_entropyCoder.codeSaoOffset(&ctuParamRdo[1], 2);
 
-    costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;
+    double costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;
     copySaoUnit(lclCtuParam[0], &ctuParamRdo[0]);
     copySaoUnit(lclCtuParam[1], &ctuParamRdo[1]);
 
     for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
     {
+        int64_t estDist[2];
         if (typeIdx == SAO_BO)
         {
             // Estimate Best Position
             for (int compIdx = 0; compIdx < 2; compIdx++)
             {
-                bestRDCostTableBo = MAX_DOUBLE;
-                estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
+                double bestRDCostTableBo = MAX_DOUBLE;
+                estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
                 for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
                 {
                     double currentRDCost = 0.0;
@@ -1633,8 +1478,8 @@
         }
         else
         {
-            estDist[0] = estSaoTypeDist(1, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
-            estDist[1] = estSaoTypeDist(2, typeIdx, 0, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
+            estDist[0] = estSaoTypeDist(1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
+            estDist[1] = estSaoTypeDist(2, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
         }
 
         m_entropyCoder.load(m_rdEntropyCoders[0][CI_TEMP_BEST]);
@@ -1646,15 +1491,15 @@
             ctuParamRdo[compIdx].typeIdx = typeIdx;
             ctuParamRdo[compIdx].mergeLeftFlag = 0;
             ctuParamRdo[compIdx].mergeUpFlag   = 0;
-            ctuParamRdo[compIdx].subTypeIdx = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
+            ctuParamRdo[compIdx].bandPos = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
             for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
-                ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].subTypeIdx + 1];
+                ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].bandPos + 1];
 
             m_entropyCoder.codeSaoOffset(&ctuParamRdo[compIdx], compIdx + 1);
         }
 
         uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
-        double cost = (double)((double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate);
+        double cost = (double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate;
 
         if (cost < costPartBest)
         {
@@ -1677,31 +1522,30 @@
     {
         for (int compIdx = 0; compIdx < 2; compIdx++)
         {
-            ctuParamNeighbor[compIdx] = NULL;
+            int plane = compIdx + 1;
+            SaoCtuParam* ctuParamNeighbor = NULL;
             if (allowMergeLeft && addrLeft >= 0 && idxNeighbor == 0)
-                ctuParamNeighbor[compIdx] = &(saoParam->ctuParam[compIdx + 1][addrLeft]);
+                ctuParamNeighbor = &(saoParam->ctuParam[plane][addrLeft]);
             else if (allowMergeUp && addrUp >= 0 && idxNeighbor == 1)
-                ctuParamNeighbor[compIdx] = &(saoParam->ctuParam[compIdx + 1][addrUp]);
-            if (ctuParamNeighbor[compIdx] != NULL)
+                ctuParamNeighbor = &(saoParam->ctuParam[plane][addrUp]);
+            if (ctuParamNeighbor != NULL)
             {
-                estDist[compIdx] = 0;
-                int typeIdx = ctuParamNeighbor[compIdx]->typeIdx;
+                int64_t estDist = 0;
+                int typeIdx = ctuParamNeighbor->typeIdx;
                 if (typeIdx >= 0)
                 {
-                    int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor[compIdx]->subTypeIdx : 0;
+                    int mergeBandPosition = (typeIdx == SAO_BO) ? ctuParamNeighbor->bandPos : 0;
                     for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                     {
-                        int mergeOffset = ctuParamNeighbor[compIdx]->offset[classIdx];
-                        estDist[compIdx] += estSaoDist(m_count[compIdx + 1][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[compIdx + 1][typeIdx][classIdx + mergeBandPosition + 1],  0);
+                        int mergeOffset = ctuParamNeighbor->offset[classIdx];
+                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + mergeBandPosition + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + mergeBandPosition + 1]);
                     }
                 }
-                else
-                    estDist[compIdx] = 0;
 
-                copySaoUnit(saoMergeParam[compIdx][idxNeighbor], ctuParamNeighbor[compIdx]);
+                copySaoUnit(saoMergeParam[compIdx][idxNeighbor], ctuParamNeighbor);
                 saoMergeParam[compIdx][idxNeighbor]->mergeUpFlag   = !!idxNeighbor;
                 saoMergeParam[compIdx][idxNeighbor]->mergeLeftFlag = !idxNeighbor;
-                distortion[idxNeighbor + 1] += ((double)estDist[compIdx] / m_chromaLambda);
+                distortion[idxNeighbor + 1] += ((double)estDist / m_chromaLambda);
             }
         }
     }
diff -r b6d49505b179 -r 64ea900398eb source/encoder/sao.h
--- a/source/encoder/sao.h	Thu Oct 02 16:47:55 2014 -0500
+++ b/source/encoder/sao.h	Sun Oct 05 18:19:16 2014 +0900
@@ -63,9 +63,8 @@
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
 
-    typedef int64_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
-    typedef int64_t (PerType[MAX_NUM_SAO_TYPE]);
-    typedef int64_t (PerPlane[3][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
+    typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
+    typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
 
     /* allocated per part */
     PerClass*   m_count;
@@ -102,7 +101,7 @@
     x265_param* m_param;
     int         m_refDepth;
     int         m_numNoSao[2];
-    
+
     double      m_lumaLambda;
     double      m_chromaLambda;
     /* TODO: No doubles for distortion */
@@ -120,7 +119,7 @@
     void resetSaoUnit(SaoCtuParam* saoUnit);
 
     // CTU-based SAO process without slice granularity
-    void processSaoCu(int addr, int partIdx, int plane);
+    void processSaoCu(int addr, int typeIdx, int plane);
 
     void resetCtuPart(SaoCtuParam* ctuParam);
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
@@ -129,17 +128,15 @@
 
     void calcSaoStatsCu(int addr, int plane);
     void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
-    void checkMerge(SaoCtuParam* paramCurr, SaoCtuParam* paramCheck, int dir);
 
     void saoComponentParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
                                SaoCtuParam *compSaoParam, double *distortion);
     void sao2ChromaParamDist(int allowMergeLeft, int allowMergeUp, SAOParam *saoParam, int addr, int addrUp, int addrLeft,
                              SaoCtuParam *crSaoParam, SaoCtuParam *cbSaoParam, double *distortion);
 
-    inline int64_t estSaoDist(int64_t count, int64_t offset, int64_t offsetOrg, int shift);
-    inline int64_t estIterOffset(int typeIdx, int classIdx, double lambda, int64_t offsetInput, int64_t count, int64_t offsetOrg, int shift,
-                                 int bitIncrease, int32_t *currentDistortionTableBo, double *currentRdCostTableBo, int offsetTh);
-    inline int64_t estSaoTypeDist(int compIdx, int typeIdx, int shift, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo);
+    inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
+                             int32_t *currentDistortionTableBo, double *currentRdCostTableBo);
+    inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t *currentDistortionTableBo, double *currentRdCostTableBo);
 
     void rdoSaoUnitRowInit(SAOParam *saoParam);
     void rdoSaoUnitRowEnd(SAOParam *saoParam, int numctus);


More information about the x265-devel mailing list