[x265] [PATCH 3 of 4] framepp: Parallelism of SAO (saoLcuBasedOptimization mode only)

Min Chen chenm003 at 163.com
Thu Aug 22 09:18:32 CEST 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1377155849 -28800
# Node ID 08631e01e5c0ee7f05e81d2dff89bc59534cebed
# Parent  78f36991d73034630b123c6b98d00e6fffde468f
framepp: Parallelism of SAO (saoLcuBasedOptimization mode only)

diff -r 78f36991d730 -r 08631e01e5c0 source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Thu Aug 22 15:17:29 2013 +0800
@@ -73,8 +73,12 @@
     m_upBuff2 = NULL;
     m_upBufft = NULL;
 
-    m_tmpU1 = NULL;
-    m_tmpU2 = NULL;
+    m_tmpU1[0] = NULL; // m_tmpU1 / m_tmpU2 hold one line buffer per component, indexed by yCbCr
+    m_tmpU1[1] = NULL;
+    m_tmpU1[2] = NULL;
+    m_tmpU2[0] = NULL; // all three slots must be cleared: destroy() deletes [0], [1] and [2]
+    m_tmpU2[1] = NULL;
+    m_tmpU2[2] = NULL;
     m_tmpL1 = NULL;
     m_tmpL2 = NULL;
 }
@@ -251,8 +255,14 @@
 
     m_tmpL1 = new Pel[m_maxCUHeight + 1];
     m_tmpL2 = new Pel[m_maxCUHeight + 1];
-    m_tmpU1 = new Pel[m_picWidth];
-    m_tmpU2 = new Pel[m_picWidth];
+
+    m_tmpU1[0] = new Pel[m_picWidth];
+    m_tmpU1[1] = new Pel[m_picWidth];
+    m_tmpU1[2] = new Pel[m_picWidth];
+
+    m_tmpU2[0] = new Pel[m_picWidth];
+    m_tmpU2[1] = new Pel[m_picWidth];
+    m_tmpU2[2] = new Pel[m_picWidth];
 }
 
 /** destroy SampleAdaptiveOffset memory.
@@ -320,15 +330,23 @@
         delete [] m_tmpL2;
         m_tmpL2 = NULL;
     }
-    if (m_tmpU1)
+    if (m_tmpU1[0])
     {
-        delete [] m_tmpU1;
-        m_tmpU1 = NULL;
+        delete [] m_tmpU1[0];
+        delete [] m_tmpU1[1];
+        delete [] m_tmpU1[2];
+        m_tmpU1[0] = NULL;
+        m_tmpU1[1] = NULL;
+        m_tmpU1[2] = NULL;
     }
-    if (m_tmpU2)
+    if (m_tmpU2[0])
     {
-        delete [] m_tmpU2;
-        m_tmpU2 = NULL;
+        delete [] m_tmpU2[0];
+        delete [] m_tmpU2[1];
+        delete [] m_tmpU2[2];
+        m_tmpU2[0] = NULL;
+        m_tmpU2[1] = NULL;
+        m_tmpU2[2] = NULL;
     }
 }
 
@@ -540,20 +558,11 @@
 Void TComSampleAdaptiveOffset::destroyPicSaoInfo()
 {
 }
-
-/** sample adaptive offset process for one LCU
+/** sample adaptive offset process for one LCU crossing LCU boundary
  * \param   addr, iSaoType, yCbCr
  */
 Void TComSampleAdaptiveOffset::processSaoCu(Int addr, Int saoType, Int yCbCr)
 {
-    processSaoCuOrg(addr, saoType, yCbCr);
-}
-
-/** sample adaptive offset process for one LCU crossing LCU boundary
- * \param   addr, iSaoType, yCbCr
- */
-Void TComSampleAdaptiveOffset::processSaoCuOrg(Int addr, Int saoType, Int yCbCr)
-{
     Int x, y;
     TComDataCU *tmpCu = m_pic->getCU(addr);
     Pel* rec;
@@ -632,7 +641,7 @@
         rec -= (stride * (cuHeightTmp + 1));
 
         tmpL = m_tmpL1;
-        tmpU = &(m_tmpU1[lpelx]);
+        tmpU = &(m_tmpU1[yCbCr][lpelx]);
     }
 
     clipTbl = (yCbCr == 0) ? m_clipTable : m_chromaClipTable;
@@ -867,7 +876,7 @@
         picWidthTmp = m_picWidth >> 1;
     }
 
-    memcpy(m_tmpU1, rec, sizeof(Pel) * picWidthTmp);
+    memcpy(m_tmpU1[yCbCr], rec, sizeof(Pel) * picWidthTmp);
 
     Int  i;
     UInt edgeType;
@@ -922,7 +931,7 @@
 
         rec -= (stride << 1);
 
-        memcpy(m_tmpU2, rec, sizeof(Pel) * picWidthTmp);
+        memcpy(m_tmpU2[yCbCr], rec, sizeof(Pel) * picWidthTmp);
 
         for (idxX = 0; idxX < frameWidthInCU; idxX++)
         {
@@ -1006,15 +1015,174 @@
             }
         }
 
-        tmpUSwap = m_tmpU1;
-        m_tmpU1 = m_tmpU2;
-        m_tmpU2 = tmpUSwap;
+        tmpUSwap       = m_tmpU1[yCbCr];
+        m_tmpU1[yCbCr] = m_tmpU2[yCbCr];
+        m_tmpU2[yCbCr] = tmpUSwap;
     }
 }
 
-/** Reset SAO LCU part
- * \param saoLcuParam
+/** Process SAO for one row of LCUs
+ * \param saoLcuParam SAO LCU parameters
+ * \param idxY index of the LCU row to process
+ * \param yCbCr color component index
  */
+Void TComSampleAdaptiveOffset::processSaoUnitRow(SaoLcuParam* saoLcuParam, int idxY, Int yCbCr)
+{
+    Pel *rec;
+    Int picWidthTmp;
+
+    if (yCbCr == 0)
+    {
+        rec        = m_pic->getPicYuvRec()->getLumaAddr();
+        picWidthTmp = m_picWidth;
+    }
+    else if (yCbCr == 1)
+    {
+        rec        = m_pic->getPicYuvRec()->getCbAddr();
+        picWidthTmp = m_picWidth >> 1;
+    }
+    else
+    {
+        rec        = m_pic->getPicYuvRec()->getCrAddr();
+        picWidthTmp = m_picWidth >> 1;
+    }
+
+    if (idxY == 0)
+        memcpy(m_tmpU1[yCbCr], rec, sizeof(Pel) * picWidthTmp);
+
+    Int  i;
+    UInt edgeType;
+    Pel* lumaTable = NULL;
+    Pel* clipTable = NULL;
+    Int* offsetBo = NULL;
+    Int  typeIdx;
+
+    Int offset[LUMA_GROUP_NUM + 1];
+    Int idxX;
+    Int addr;
+    Int frameWidthInCU = m_pic->getFrameWidthInCU();
+    Int stride;
+    Pel *tmpUSwap;
+    Int sChroma = (yCbCr == 0) ? 0 : 1;
+    Bool mergeLeftFlag;
+    Int saoBitIncrease = (yCbCr == 0) ? m_saoBitIncreaseY : m_saoBitIncreaseC;
+
+    offsetBo = (yCbCr == 0) ? m_offsetBo : m_chromaOffsetBo;
+
+    offset[0] = 0;
+    {
+        addr = idxY * frameWidthInCU;
+        if (yCbCr == 0)
+        {
+            rec  = m_pic->getPicYuvRec()->getLumaAddr(addr);
+            stride = m_pic->getStride();
+            picWidthTmp = m_picWidth;
+        }
+        else if (yCbCr == 1)
+        {
+            rec  = m_pic->getPicYuvRec()->getCbAddr(addr);
+            stride = m_pic->getCStride();
+            picWidthTmp = m_picWidth >> 1;
+        }
+        else
+        {
+            rec  = m_pic->getPicYuvRec()->getCrAddr(addr);
+            stride = m_pic->getCStride();
+            picWidthTmp = m_picWidth >> 1;
+        }
+
+        //     pRec += stride*(m_uiMaxCUHeight-1);
+        for (i = 0; i < (m_maxCUHeight >> sChroma) + 1; i++)
+        {
+            m_tmpL1[i] = rec[0];
+            rec += stride;
+        }
+
+        rec -= (stride << 1);
+
+        memcpy(m_tmpU2[yCbCr], rec, sizeof(Pel) * picWidthTmp);
+
+        for (idxX = 0; idxX < frameWidthInCU; idxX++)
+        {
+            addr = idxY * frameWidthInCU + idxX;
+
+            typeIdx = saoLcuParam[addr].typeIdx;
+            mergeLeftFlag = saoLcuParam[addr].mergeLeftFlag;
+
+            if (typeIdx >= 0)
+            {
+                if (!mergeLeftFlag)
+                {
+                    if (typeIdx == SAO_BO)
+                    {
+                        for (i = 0; i < SAO_MAX_BO_CLASSES + 1; i++)
+                        {
+                            offset[i] = 0;
+                        }
+
+                        for (i = 0; i < saoLcuParam[addr].length; i++)
+                        {
+                            offset[(saoLcuParam[addr].subTypeIdx + i) % SAO_MAX_BO_CLASSES  + 1] = saoLcuParam[addr].offset[i] << saoBitIncrease;
+                        }
+
+                        lumaTable = (yCbCr == 0) ? m_lumaTableBo : m_chromaTableBo;
+                        clipTable = (yCbCr == 0) ? m_clipTable : m_chromaClipTable;
+
+                        for (i = 0; i < (1 << X265_DEPTH); i++)
+                        {
+                            offsetBo[i] = clipTable[i + offset[lumaTable[i]]];
+                        }
+                    }
+                    if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
+                    {
+                        for (i = 0; i < saoLcuParam[addr].length; i++)
+                        {
+                            offset[i + 1] = saoLcuParam[addr].offset[i] << saoBitIncrease;
+                        }
+
+                        for (edgeType = 0; edgeType < 6; edgeType++)
+                        {
+                            m_offsetEo[edgeType] = offset[m_eoTable[edgeType]];
+                        }
+                    }
+                }
+                processSaoCu(addr, typeIdx, yCbCr);
+            }
+            else
+            {
+                if (idxX != (frameWidthInCU - 1))
+                {
+                    if (yCbCr == 0)
+                    {
+                        rec  = m_pic->getPicYuvRec()->getLumaAddr(addr);
+                        stride = m_pic->getStride();
+                    }
+                    else if (yCbCr == 1)
+                    {
+                        rec  = m_pic->getPicYuvRec()->getCbAddr(addr);
+                        stride = m_pic->getCStride();
+                    }
+                    else
+                    {
+                        rec  = m_pic->getPicYuvRec()->getCrAddr(addr);
+                        stride = m_pic->getCStride();
+                    }
+                    Int widthShift = m_maxCUWidth >> sChroma;
+                    for (i = 0; i < (m_maxCUHeight >> sChroma) + 1; i++)
+                    {
+                        m_tmpL1[i] = rec[widthShift - 1];
+                        rec += stride;
+                    }
+                }
+            }
+        }
+
+        tmpUSwap       = m_tmpU1[yCbCr];
+        m_tmpU1[yCbCr] = m_tmpU2[yCbCr];
+        m_tmpU2[yCbCr] = tmpUSwap;
+    }
+}
+
 Void TComSampleAdaptiveOffset::resetLcuPart(SaoLcuParam* saoLcuParam)
 {
     Int i, j;
@@ -1133,7 +1301,6 @@
 }
 
 static Void xPCMRestoration(TComPic* pic);
-static Void xPCMCURestoration(TComDataCU* cu, UInt absZOrderIdx, UInt depth);
 static Void xPCMSampleRestoration(TComDataCU* cu, UInt absZOrderIdx, UInt depth, TextType ttText);
 
 /** PCM LF disable process.
@@ -1172,7 +1339,7 @@
  * \param depth CU depth
  * \returns Void
  */
-static Void xPCMCURestoration(TComDataCU* cu, UInt absZOrderIdx, UInt depth)
+Void xPCMCURestoration(TComDataCU* cu, UInt absZOrderIdx, UInt depth)
 {
     TComPic* pic     = cu->getPic();
     UInt curNumParts = pic->getNumPartInCU() >> (depth << 1);
diff -r 78f36991d730 -r 08631e01e5c0 source/Lib/TLibCommon/TComSampleAdaptiveOffset.h
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h	Thu Aug 22 15:17:29 2013 +0800
@@ -169,8 +169,8 @@
     Int   *m_upBufft;
     TComPicYuv* m_tmpYuv;  //!< temporary picture buffer pointer when non-across slice/tile boundary SAO is enabled
 
-    Pel* m_tmpU1;
-    Pel* m_tmpU2;
+    Pel* m_tmpU1[3];
+    Pel* m_tmpU2[3];
     Pel* m_tmpL1;
     Pel* m_tmpL2;
     Int     m_maxNumOffsetsPerPic;
@@ -193,10 +193,9 @@
     static Void freeSaoParam(SAOParam* saoParam);
 
     Void SAOProcess(SAOParam* saoParam);
-    Void processSaoCu(Int addr, Int saoType, Int yCbCr);
     Pel* getPicYuvAddr(TComPicYuv* picYuv, Int yCbCr, Int addr = 0);
 
-    Void processSaoCuOrg(Int addr, Int partIdx, Int yCbCr); //!< LCU-basd SAO process without slice granularity
+    Void processSaoCu(Int addr, Int partIdx, Int yCbCr); //!< LCU-basd SAO process without slice granularity
     Void createPicSaoInfo(TComPic* pic);
     Void destroyPicSaoInfo();
 
@@ -204,6 +203,7 @@
     Void convertQT2SaoUnit(SAOParam* saoParam, UInt partIdx, Int yCbCr);
     Void convertOnePart2SaoUnit(SAOParam *saoParam, UInt partIdx, Int yCbCr);
     Void processSaoUnitAll(SaoLcuParam* saoLcuParam, Bool oneUnitFlag, Int yCbCr);
+    Void processSaoUnitRow(SaoLcuParam* saoLcuParam, int idxY, Int yCbCr);
     Void setSaoLcuBoundary(int bVal)  { m_saoLcuBoundary = bVal != 0; }
 
     Bool getSaoLcuBoundary()           { return m_saoLcuBoundary; }
@@ -216,6 +216,8 @@
     Void copySaoUnit(SaoLcuParam* saoUnitDst, SaoLcuParam* saoUnitSrc);
 };
 Void PCMLFDisableProcess(TComPic* pic);
+Void xPCMCURestoration(TComDataCU* cu, UInt absZOrderIdx, UInt depth);
+
 
 //! \}
 #endif // ifndef __TCOMSAMPLEADAPTIVEOFFSET__
diff -r 78f36991d730 -r 08631e01e5c0 source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp
--- a/source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp	Thu Aug 22 15:17:29 2013 +0800
@@ -45,22 +45,25 @@
 //! \{
 
 TEncSampleAdaptiveOffset::TEncSampleAdaptiveOffset()
+    : m_entropyCoder(NULL)
+    , m_rdSbacCoders(NULL)
+    , m_rdGoOnSbacCoder(NULL)
+    , m_binCoderCABAC(NULL)
+    , m_count(NULL)
+    , m_offset(NULL)
+    , m_offsetOrg(NULL)
+    , m_countPreDblk(NULL)
+    , m_offsetOrgPreDblk(NULL)
+    , m_rate(NULL)
+    , m_dist(NULL)
+    , m_cost(NULL)
+    , m_costPartBest(NULL)
+    , m_distOrg(NULL)
+    , m_typePartBest(NULL)
+    , lumaLambda(0.)
+    , chromaLambd(0.)
+    , depth(0)
 {
-    m_entropyCoder = NULL;
-    m_rdSbacCoders = NULL;
-    m_rdGoOnSbacCoder = NULL;
-    m_binCoderCABAC = NULL;
-    m_count = NULL;
-    m_offset = NULL;
-    m_offsetOrg = NULL;
-    m_countPreDblk = NULL;
-    m_offsetOrgPreDblk = NULL;
-    m_rate = NULL;
-    m_dist = NULL;
-    m_cost = NULL;
-    m_costPartBest = NULL;
-    m_distOrg = NULL;
-    m_typePartBest = NULL;
     m_depthSaoRate[0][0] = 0;
     m_depthSaoRate[0][1] = 0;
     m_depthSaoRate[0][2] = 0;
@@ -69,6 +72,11 @@
     m_depthSaoRate[1][1] = 0;
     m_depthSaoRate[1][2] = 0;
     m_depthSaoRate[1][3] = 0;
+
+    m_saoBitIncreaseY = max(X265_DEPTH - 10, 0);
+    m_saoBitIncreaseC = max(X265_DEPTH - 10, 0);
+    m_offsetThY = 1 << min(X265_DEPTH - 5, 5);
+    m_offsetThC = 1 << min(X265_DEPTH - 5, 5);
 }
 
 TEncSampleAdaptiveOffset::~TEncSampleAdaptiveOffset()
@@ -850,19 +858,11 @@
     }
 }
 
-/** Calculate SAO statistics for current LCU
+/** Calculate SAO statistics for current LCU without non-crossing slice
  * \param  addr,  partIdx,  yCbCr
  */
 Void TEncSampleAdaptiveOffset::calcSaoStatsCu(Int addr, Int partIdx, Int yCbCr)
 {
-    calcSaoStatsCuOrg(addr, partIdx, yCbCr);
-}
-
-/** Calculate SAO statistics for current LCU without non-crossing slice
- * \param  addr,  partIdx,  yCbCr
- */
-Void TEncSampleAdaptiveOffset::calcSaoStatsCuOrg(Int addr, Int partIdx, Int yCbCr)
-{
     Int x, y;
     TComDataCU *pTmpCu = m_pic->getCU(addr);
     TComSPS *pTmpSPS =  m_pic->getSlice()->getSPS();
@@ -1536,31 +1536,23 @@
  * \param dLambdaLuma
  * \param lambdaChroma
  */
-Void TEncSampleAdaptiveOffset::SAOProcess(SAOParam *saoParam, Double lambdaLuma, Double lambdaChroma, Int depth)
+Void TEncSampleAdaptiveOffset::SAOProcess(SAOParam *saoParam)
 {
-    m_saoBitIncreaseY = max(X265_DEPTH - 10, 0);
-    m_saoBitIncreaseC = max(X265_DEPTH - 10, 0);
-    m_offsetThY = 1 << min(X265_DEPTH - 5, 5);
-    m_offsetThC = 1 << min(X265_DEPTH - 5, 5);
-    resetSAOParam(saoParam);
-    if (!m_saoLcuBasedOptimization || !m_saoLcuBoundary)
-    {
-        resetStats();
-    }
-    Double costFinal = 0;
     if (m_saoLcuBasedOptimization)
     {
-        rdoSaoUnitAll(saoParam, lambdaLuma, lambdaChroma, depth);
+        // LCU-based SAO now runs row by row from the frame filter; SAOProcess() must not be called in this mode
+        assert(0);
+
+        rdoSaoUnitAll(saoParam, lumaLambda, chromaLambd, depth);
     }
     else
     {
+        Double costFinal = 0;
         saoParam->bSaoFlag[0] = 1;
         saoParam->bSaoFlag[1] = 0;
         costFinal = 0;
-        Double lambdaRdo = lambdaLuma;
-        resetStats();
         getSaoStats(saoParam->saoPart[0], 0);
-        runQuadTreeDecision(saoParam->saoPart[0], 0, costFinal, m_maxSplitLevel, lambdaRdo, 0);
+        runQuadTreeDecision(saoParam->saoPart[0], 0, costFinal, m_maxSplitLevel, lumaLambda, 0);
         saoParam->bSaoFlag[0] = costFinal < 0 ? 1 : 0;
         if (saoParam->bSaoFlag[0])
         {
@@ -1897,6 +1889,199 @@
     }
 }
 
+Void TEncSampleAdaptiveOffset::rdoSaoUnitRowInit(SAOParam *saoParam)
+{
+    saoParam->bSaoFlag[0] = true; // luma SAO enabled unless the rate check below turns it off
+    saoParam->bSaoFlag[1] = true; // chroma SAO (Cb and Cr share this flag)
+    saoParam->oneUnitFlag[0] = false;
+    saoParam->oneUnitFlag[1] = false;
+    saoParam->oneUnitFlag[2] = false;
+    // restart the counters of LCUs left without SAO; rdoSaoUnitRow() accumulates them row by row
+    numNoSao[0] = 0; // Luma
+    numNoSao[1] = 0; // Chroma
+    if (depth > 0 && m_depthSaoRate[0][depth - 1] > SAO_ENCODING_RATE) // depth - 1 left too many luma LCUs without SAO
+    {
+        saoParam->bSaoFlag[0] = false;
+    }
+    if (depth > 0 && m_depthSaoRate[1][depth - 1] > SAO_ENCODING_RATE_CHROMA) // same check for chroma
+    {
+        saoParam->bSaoFlag[1] = false;
+    }
+}
+
+Void TEncSampleAdaptiveOffset::rdoSaoUnitRowEnd(SAOParam *saoParam, int numlcus)
+{
+    // Store, for the current depth, the share of LCUs left without SAO; rdoSaoUnitRowInit() reads it back as depth - 1.
+    if (!saoParam->bSaoFlag[0])
+    {
+        m_depthSaoRate[0][depth] = 1.0; // luma SAO was disabled, count every LCU as "no SAO"
+    }
+    else
+    {
+        m_depthSaoRate[0][depth] = numNoSao[0] / ((Double)numlcus);
+    }
+    if (!saoParam->bSaoFlag[1])
+    {
+        m_depthSaoRate[1][depth] = 1.0; // chroma SAO was disabled, count every LCU as "no SAO"
+    }
+    else
+    {
+        m_depthSaoRate[1][depth] = numNoSao[1] / ((Double)numlcus * 2); // numNoSao[1] grows by 2 per LCU (presumably Cb + Cr)
+    }
+}
+
+Void TEncSampleAdaptiveOffset::rdoSaoUnitRow(SAOParam *saoParam, Int idxY)
+{
+    Int idxX;
+    Int frameWidthInCU  = saoParam->numCuInWidth;
+    Int j, k;
+    Int addr = 0;
+    Int addrUp = -1;
+    Int addrLeft = -1;
+    Int compIdx = 0;
+    SaoLcuParam mergeSaoParam[3][2];
+    Double compDistortion[3];
+
+    {
+        for (idxX = 0; idxX < frameWidthInCU; idxX++)
+        {
+            addr     = idxX  + frameWidthInCU * idxY;
+            addrUp   = addr < frameWidthInCU ? -1 : idxX   + frameWidthInCU * (idxY - 1);
+            addrLeft = idxX == 0               ? -1 : idxX - 1 + frameWidthInCU * idxY;
+            Int allowMergeLeft = 1;
+            Int allowMergeUp   = 1;
+            UInt rate;
+            Double bestCost, mergeCost;
+            if (idxX == 0)
+            {
+                allowMergeLeft = 0;
+            }
+            if (idxY == 0)
+            {
+                allowMergeUp = 0;
+            }
+
+            compDistortion[0] = 0;
+            compDistortion[1] = 0;
+            compDistortion[2] = 0;
+            m_rdGoOnSbacCoder->load(m_rdSbacCoders[0][CI_CURR_BEST]);
+            if (allowMergeLeft)
+            {
+                m_entropyCoder->m_pcEntropyCoderIf->codeSaoMerge(0);
+            }
+            if (allowMergeUp)
+            {
+                m_entropyCoder->m_pcEntropyCoderIf->codeSaoMerge(0);
+            }
+            m_rdGoOnSbacCoder->store(m_rdSbacCoders[0][CI_TEMP_BEST]);
+            // reset stats Y, Cb, Cr
+            for (compIdx = 0; compIdx < 3; compIdx++)
+            {
+                for (j = 0; j < MAX_NUM_SAO_TYPE; j++)
+                {
+                    for (k = 0; k < MAX_NUM_SAO_CLASS; k++)
+                    {
+                        m_offset[compIdx][j][k] = 0;
+                        if (m_saoLcuBasedOptimization && m_saoLcuBoundary)
+                        {
+                            m_count[compIdx][j][k] = m_countPreDblk[addr][compIdx][j][k];
+                            m_offsetOrg[compIdx][j][k] = m_offsetOrgPreDblk[addr][compIdx][j][k];
+                        }
+                        else
+                        {
+                            m_count[compIdx][j][k] = 0;
+                            m_offsetOrg[compIdx][j][k] = 0;
+                        }
+                    }
+                }
+
+                saoParam->saoLcuParam[compIdx][addr].typeIdx       =  -1;
+                saoParam->saoLcuParam[compIdx][addr].mergeUpFlag   = 0;
+                saoParam->saoLcuParam[compIdx][addr].mergeLeftFlag = 0;
+                saoParam->saoLcuParam[compIdx][addr].subTypeIdx    = 0;
+                if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
+                {
+                    calcSaoStatsCu(addr, compIdx,  compIdx);
+                }
+            }
+
+            saoComponentParamDist(allowMergeLeft, allowMergeUp, saoParam, addr, addrUp, addrLeft, 0,  lumaLambda, &mergeSaoParam[0][0], &compDistortion[0]);
+            sao2ChromaParamDist(allowMergeLeft, allowMergeUp, saoParam, addr, addrUp, addrLeft, chromaLambd, &mergeSaoParam[1][0], &mergeSaoParam[2][0], &compDistortion[0]);
+            if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
+            {
+                // Cost of new SAO_params
+                m_rdGoOnSbacCoder->load(m_rdSbacCoders[0][CI_CURR_BEST]);
+                m_rdGoOnSbacCoder->resetBits();
+                if (allowMergeLeft)
+                {
+                    m_entropyCoder->m_pcEntropyCoderIf->codeSaoMerge(0);
+                }
+                if (allowMergeUp)
+                {
+                    m_entropyCoder->m_pcEntropyCoderIf->codeSaoMerge(0);
+                }
+                for (compIdx = 0; compIdx < 3; compIdx++)
+                {
+                    if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
+                    {
+                        m_entropyCoder->encodeSaoOffset(&saoParam->saoLcuParam[compIdx][addr], compIdx);
+                    }
+                }
+
+                rate = m_entropyCoder->getNumberOfWrittenBits();
+                bestCost = compDistortion[0] + (Double)rate;
+                m_rdGoOnSbacCoder->store(m_rdSbacCoders[0][CI_TEMP_BEST]);
+
+                // Cost of Merge
+                for (Int mergeUp = 0; mergeUp < 2; ++mergeUp)
+                {
+                    if ((allowMergeLeft && (mergeUp == 0)) || (allowMergeUp && (mergeUp == 1)))
+                    {
+                        m_rdGoOnSbacCoder->load(m_rdSbacCoders[0][CI_CURR_BEST]);
+                        m_rdGoOnSbacCoder->resetBits();
+                        if (allowMergeLeft)
+                        {
+                            m_entropyCoder->m_pcEntropyCoderIf->codeSaoMerge(1 - mergeUp);
+                        }
+                        if (allowMergeUp && (mergeUp == 1))
+                        {
+                            m_entropyCoder->m_pcEntropyCoderIf->codeSaoMerge(1);
+                        }
+
+                        rate = m_entropyCoder->getNumberOfWrittenBits();
+                        mergeCost = compDistortion[mergeUp + 1] + (Double)rate;
+                        if (mergeCost < bestCost)
+                        {
+                            bestCost = mergeCost;
+                            m_rdGoOnSbacCoder->store(m_rdSbacCoders[0][CI_TEMP_BEST]);
+                            for (compIdx = 0; compIdx < 3; compIdx++)
+                            {
+                                mergeSaoParam[compIdx][mergeUp].mergeLeftFlag = 1 - mergeUp;
+                                mergeSaoParam[compIdx][mergeUp].mergeUpFlag = mergeUp;
+                                if ((compIdx == 0 && saoParam->bSaoFlag[0]) || (compIdx > 0 && saoParam->bSaoFlag[1]))
+                                {
+                                    copySaoUnit(&saoParam->saoLcuParam[compIdx][addr], &mergeSaoParam[compIdx][mergeUp]);
+                                }
+                            }
+                        }
+                    }
+                }
+
+                if (saoParam->saoLcuParam[0][addr].typeIdx == -1)
+                {
+                    numNoSao[0]++;
+                }
+                if (saoParam->saoLcuParam[1][addr].typeIdx == -1)
+                {
+                    numNoSao[1] += 2;
+                }
+                m_rdGoOnSbacCoder->load(m_rdSbacCoders[0][CI_TEMP_BEST]);
+                m_rdGoOnSbacCoder->store(m_rdSbacCoders[0][CI_CURR_BEST]);
+            }
+        }
+    }
+}
+
 /** rate distortion optimization of SAO unit
  * \param saoParam SAO parameters
  * \param addr address
diff -r 78f36991d730 -r 08631e01e5c0 source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h
--- a/source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h	Thu Aug 22 15:17:29 2013 +0800
@@ -78,6 +78,10 @@
     Double  m_depthSaoRate[2][4];
 
 public:
+    double  lumaLambda;
+    double  chromaLambd;
+    int     depth;
+    Int     numNoSao[2];
 
     TEncSampleAdaptiveOffset();
     virtual ~TEncSampleAdaptiveOffset();
@@ -85,7 +89,7 @@
     Void startSaoEnc(TComPic* pic, TEncEntropy* entropyCoder, TEncSbac* rdGoOnSbacCoder);
     Void endSaoEnc();
     Void resetStats();
-    Void SAOProcess(SAOParam *saoParam, Double lambda, Double lambdaChroma, Int depth);
+    Void SAOProcess(SAOParam *saoParam);
 
     Void runQuadTreeDecision(SAOQTPart *psQTPart, Int partIdx, Double &costFinal, Int maxLevel, Double lambda, Int yCbCr);
     Void rdoSaoOnePart(SAOQTPart *psQTPart, Int partIdx, Double lambda, Int yCbCr);
@@ -94,7 +98,6 @@
     Void getSaoStats(SAOQTPart *psQTPart, Int yCbCr);
     Void calcSaoStatsCu(Int addr, Int partIdx, Int yCbCr);
     Void calcSaoStatsBlock(Pel* recStart, Pel* orgStart, Int stride, Int64** stats, Int64** counts, UInt width, UInt height, Bool* bBorderAvail, Int yCbCr);
-    Void calcSaoStatsCuOrg(Int addr, Int partIdx, Int yCbCr);
     Void calcSaoStatsRowCus_BeforeDblk(TComPic* pic, Int idxY);
     Void destroyEncBuffer();
     Void createEncBuffer();
@@ -109,6 +112,10 @@
     Void setMaxNumOffsetsPerPic(Int val) { m_maxNumOffsetsPerPic = val; }
 
     Int  getMaxNumOffsetsPerPic() { return m_maxNumOffsetsPerPic; }
+
+    Void rdoSaoUnitRowInit(SAOParam *saoParam);
+    Void rdoSaoUnitRowEnd(SAOParam *saoParam, int numlcus);
+    Void rdoSaoUnitRow(SAOParam *saoParam, Int idxY);
 };
 
 //! \}
diff -r 78f36991d730 -r 08631e01e5c0 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/encoder/frameencoder.cpp	Thu Aug 22 15:17:29 2013 +0800
@@ -88,8 +88,6 @@
     m_numRows = numRows;
     row_delay = (m_cfg->param.saoLcuBasedOptimization && m_cfg->param.saoLcuBoundary) ? 2 : 1;;
 
-    m_frameFilter.init(top, numRows);
-
     m_rows = new CTURow[m_numRows];
     for (int i = 0; i < m_numRows; ++i)
     {
@@ -102,6 +100,8 @@
         m_pool = NULL;
     }
 
+    m_frameFilter.init(top, numRows, getEntropyCoder(0), getRDGoOnSbacCoder(0));
+
     // initialize SPS
     top->xInitSPS(&m_sps);
 
@@ -352,7 +352,7 @@
     setCrDistortionWeight(weight);
 
     // for RDOQ
-    setQPLambda(qp, lambda, lambda / weight);
+    setQPLambda(qp, lambda, lambda / weight, slice->getDepth());
 
     // For SAO
     slice->setLambda(lambda, lambda / weight);
@@ -529,26 +529,22 @@
     }
 
     /* use the main bitstream buffer for storing the marshaled picture */
-    entropyCoder->setBitstream(NULL);
-
     if (m_sps.getUseSAO())
     {
-        // set entropy coder for RD
-        entropyCoder->setEntropyCoder(&m_sbacCoder, slice);
-        entropyCoder->resetEntropy();
-        entropyCoder->setBitstream(&m_bitCounter);
+        SAOParam* saoParam = pic->getPicSym()->getSaoParam();
 
-        // CHECK_ME: I think the SAO uses a temp Sbac only, so I always use [0], am I right?
-        getSAO()->startSaoEnc(pic, entropyCoder, getRDGoOnSbacCoder(0));
-
-        SAOParam* saoParam = pic->getPicSym()->getSaoParam();
-        getSAO()->SAOProcess(saoParam, slice->getLambdaLuma(), slice->getLambdaChroma(), slice->getDepth());
-        getSAO()->endSaoEnc();
-        PCMLFDisableProcess(pic);
+        if (!getSAO()->getSaoLcuBasedOptimization())
+        {
+            getSAO()->SAOProcess(saoParam);
+            getSAO()->endSaoEnc();
+            PCMLFDisableProcess(pic);
+        }
 
         slice->setSaoEnabledFlag((saoParam->bSaoFlag[0] == 1) ? true : false);
     }
 
+    entropyCoder->setBitstream(NULL);
+
     // Reconstruction slice
     slice->setNextSlice(true);
     determineSliceBounds(pic);
diff -r 78f36991d730 -r 08631e01e5c0 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/encoder/frameencoder.h	Thu Aug 22 15:17:29 2013 +0800
@@ -75,12 +75,15 @@
         }
     }
 
-    void setQPLambda(Int QP, double lumaLambda, double chromaLambda)
+    void setQPLambda(Int QP, double lumaLambda, double chromaLambda, int depth)
     {
         for (int i = 0; i < m_numRows; i++)
         {
             m_rows[i].m_search.setQPLambda(QP, lumaLambda, chromaLambda);
         }
+        m_frameFilter.m_sao.lumaLambda = lumaLambda;
+        m_frameFilter.m_sao.chromaLambd = chromaLambda;
+        m_frameFilter.m_sao.depth = depth;
     }
 
     void setCbDistortionWeight(double weight)
diff -r 78f36991d730 -r 08631e01e5c0 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/encoder/framefilter.cpp	Thu Aug 22 15:17:29 2013 +0800
@@ -37,6 +37,8 @@
     , m_cfg(NULL)
     , m_pic(NULL)
     , active_lft(FALSE)
+    , m_entropyCoder(NULL)
+    , m_rdGoOnSbacCoder(NULL)
 {}
 
 void FrameFilter::destroy()
@@ -73,11 +75,15 @@
     return false;
 }
 
-void FrameFilter::init(TEncTop *top, int numRows)
+void FrameFilter::init(TEncTop *top, int numRows, TEncEntropy* entropyCoder, TEncSbac* rdGoOnSbacCoder)
 {
     m_cfg = top;
     m_numRows = numRows;
 
+    // NOTE: for sao only, DON'T use before first row finished
+    m_entropyCoder = entropyCoder;
+    m_rdGoOnSbacCoder = rdGoOnSbacCoder;
+
     if (top->param.bEnableLoopFilter)
     {
         m_loopFilter.create(g_maxCUDepth);
@@ -99,9 +105,12 @@
     active_lft = FALSE;
     if (m_cfg->param.bEnableLoopFilter)
     {
-        if (m_cfg->param.saoLcuBasedOptimization && m_cfg->param.saoLcuBoundary)
-            m_sao.resetStats();
+        m_sao.resetStats();
         m_sao.createPicSaoInfo(pic);
+
+        SAOParam* saoParam = pic->getPicSym()->getSaoParam();
+        m_sao.resetSAOParam(saoParam);
+        m_sao.rdoSaoUnitRowInit(saoParam);
     }
 
     if (m_cfg->param.bEnableLoopFilter && m_pool && m_cfg->param.bEnableWavefront)
@@ -139,6 +148,13 @@
 
     // Called by worker threads
 
+    // NOTE: this path runs only when both the loop filter and SAO are active, and row 0 has always finished by then, so it is safe to reuse row 0's coders
+    if (row == 0)
+    {
+        // CHECK_ME: I think the SAO uses a temp Sbac only, so I always use [0], am I right?
+        m_sao.startSaoEnc(m_pic, m_entropyCoder, m_rdGoOnSbacCoder);
+    }
+
     const uint32_t numCols = m_pic->getPicSym()->getFrameWidthInCU();
     const uint32_t lineStartCUAddr = row * numCols;
 
@@ -167,9 +183,78 @@
         m_loopFilter.loopFilterCU(cu_prev, EDGE_HOR);
     }
 
+    // SAO
+    SAOParam* saoParam = m_pic->getPicSym()->getSaoParam();
+
+    // PCM restoration has to follow SAO of the same CUs, otherwise SAO would overwrite the restored samples.
+    // TODO: this code is NOT VERIFIED because TransformSkip and PCM modes have some bugs, they are never active!
+    const Bool bPCMFilter = (m_pic->getSlice()->getSPS()->getUsePCM() && m_pic->getSlice()->getSPS()->getPCMFilterDisableFlag()) ? true : false;
+    const Bool bRestorePCM = bPCMFilter || m_pic->getSlice()->getPPS()->getTransquantBypassEnableFlag();
+
+    if (m_sao.getSaoLcuBasedOptimization())
+    {
+        m_sao.rdoSaoUnitRow(saoParam, row);
+
+        // NOTE: SAO is applied one row late because the SAO decision of the next row still needs the top row pixels, is it HM's bug?
+        if (row > 0)
+        {
+            // NOTE: these flags are not used in this mode
+            assert(saoParam->oneUnitFlag[0] == false);
+            assert(saoParam->oneUnitFlag[1] == false);
+            assert(saoParam->oneUnitFlag[2] == false);
+
+            if (saoParam->bSaoFlag[0])
+            {
+                m_sao.processSaoUnitRow(saoParam->saoLcuParam[0], row - 1, 0);
+            }
+            if (saoParam->bSaoFlag[1])
+            {
+                m_sao.processSaoUnitRow(saoParam->saoLcuParam[1], row - 1, 1);
+                m_sao.processSaoUnitRow(saoParam->saoLcuParam[2], row - 1, 2);
+            }
+
+            // restore the row SAO was just applied to (row - 1); SAO of the current row is still pending
+            if (bRestorePCM)
+            {
+                for (UInt col = 0; col < numCols; col++)
+                {
+                    const uint32_t cuAddr = lineStartCUAddr - numCols + col;
+                    TComDataCU* cu = m_pic->getCU(cuAddr);
+
+                    xPCMCURestoration(cu, 0, 0);
+                }
+            }
+        }
+    }
+
     // this row of CTUs has been encoded
     if (row == m_numRows - 1)
     {
+        m_sao.rdoSaoUnitRowEnd(saoParam, m_pic->getNumCUsInFrame());
+
+        // Process Last row of SAO
+        if (saoParam->bSaoFlag[0])
+        {
+            m_sao.processSaoUnitRow(saoParam->saoLcuParam[0], row, 0);
+        }
+        if (saoParam->bSaoFlag[1])
+        {
+            m_sao.processSaoUnitRow(saoParam->saoLcuParam[1], row, 1);
+            m_sao.processSaoUnitRow(saoParam->saoLcuParam[2], row, 2);
+        }
+
+        // the last row has no following row to trigger its restoration, so do it here
+        if (bRestorePCM && m_sao.getSaoLcuBasedOptimization())
+        {
+            for (UInt col = 0; col < numCols; col++)
+            {
+                const uint32_t cuAddr = lineStartCUAddr + col;
+                TComDataCU* cu = m_pic->getCU(cuAddr);
+
+                xPCMCURestoration(cu, 0, 0);
+            }
+        }
+
         m_completionEvent.trigger();
     }
 }
diff -r 78f36991d730 -r 08631e01e5c0 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Thu Aug 22 15:16:40 2013 +0800
+++ b/source/encoder/framefilter.h	Thu Aug 22 15:17:29 2013 +0800
@@ -48,7 +48,7 @@
 
     virtual ~FrameFilter() {}
 
-    void init(TEncTop *top, int numRows);
+    void init(TEncTop *top, int numRows, TEncEntropy* entropyCoder, TEncSbac* rdGoOnSbacCoder);
 
     void destroy();
 
@@ -72,6 +72,8 @@
 
     TComLoopFilter              m_loopFilter;
     TEncSampleAdaptiveOffset    m_sao;
+    TEncEntropy*                m_entropyCoder;
+    TEncSbac*                   m_rdGoOnSbacCoder;
     int                         m_numRows;
 
     // TODO: if you want thread priority logic, add col here



More information about the x265-devel mailing list