[x265] [PATCH 1 of 2] improve performance by full row process

Min Chen chenm003 at 163.com
Thu Feb 4 06:29:50 CET 2016


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1454563778 -28800
# Node ID ad8ebeffdda44378dd93b787215a937a26be980e
# Parent  dc62b47dd0d98f732165345883edac55320baec1
improve performance by full row process
---
 source/encoder/framefilter.cpp |  197 +++++++++++++++++++++++++++++++++--
 source/encoder/framefilter.h   |    4 +
 source/encoder/sao.cpp         |  224 ++++++++++++++++++++++++++++++++++++++++
 source/encoder/sao.h           |    4 +-
 4 files changed, 418 insertions(+), 11 deletions(-)

diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Mon Jan 25 14:59:50 2016 +0530
+++ b/source/encoder/framefilter.cpp	Thu Feb 04 13:29:38 2016 +0800
@@ -174,6 +174,22 @@
         restoreOrigLosslessYuv(cu, frame, absPartIdx);
 }
 
+void FrameFilter::ParallelFilter::processSaoPcmRow(int startCol)
+{   // Row-wide variant: calls origCUSampleRestoration() on every CTU of this row, from column startCol to the last column.
+    if (m_encData->m_slice->m_pps->bTransquantBypassEnabled) // no work unless the PPS enables transquant bypass
+    {
+        const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
+        const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
+        // m_rowAddr is used as the address of the row's first CTU; col is the offset inside the row
+        for (int col = startCol; col < m_frameFilter->m_numCols; col++)
+        {
+            uint32_t cuAddr = m_rowAddr + col;
+            const CUData* ctu = m_encData->getPicCTU(cuAddr);
+            origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameFilter->m_frame);
+        }
+    }
+}
+
 void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
 {
     // Copy SAO Top Reference Pixels
@@ -182,7 +198,7 @@
 
     // Luma
     memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
-    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");
 
     // Chroma
     if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
@@ -194,7 +210,32 @@
         memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
         memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
 
-        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");
+    }
+}
+
+void FrameFilter::ParallelFilter::copySaoAboveRefRow(PicYuv* reconPic, uint32_t cuAddr, int col)
+{   // reconPic: picture to read from; cuAddr: address of the first CTU to copy; col: that CTU's column index in the row
+    // Copy SAO Top Reference Pixels for all CTUs from column col to the end of the row in one memcpy per plane
+    int ctuWidth  = g_maxCUSize;
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
+    const int cntCols = (m_frameFilter->m_numCols - col);
+
+    // Luma: source is one stride above the CTU origin, or the CTU's own first line when m_rowAddr == 0
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, cntCols * ctuWidth * sizeof(pixel));
+    X265_CHECK(col * ctuWidth + cntCols * ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");
+
+    // Chroma: skipped when internalCsp is X265_CSP_I400
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+    {
+        ctuWidth  >>= m_sao.m_hChromaShift; // CTU width scaled by the horizontal chroma shift
+
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, cntCols * ctuWidth * sizeof(pixel));
+        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, cntCols * ctuWidth * sizeof(pixel));
+
+        X265_CHECK(col * ctuWidth + cntCols * ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");
     }
 }
 
@@ -243,7 +284,7 @@
     const intptr_t stride = reconPic->m_stride;
     const intptr_t strideC = reconPic->m_strideC;
     pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr);
-    // // MUST BE check I400 since m_picOrg uninitialize in that case
+    // MUST BE check I400 since m_picOrg uninitialize in that case
     pixel *pixU = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(lineStartCUAddr) : NULL;
     pixel *pixV = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(lineStartCUAddr) : NULL;
     int copySizeY = realW;
@@ -312,6 +353,79 @@
     }
 }
 
+void FrameFilter::ParallelFilter::processPostRow() const
+{
+    // Row-wide post-processing: extend the reconstructed picture borders for this CTU row, then update the row's column counter.
+    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;
+
+    const uint32_t lumaMarginX = reconPic->m_lumaMarginX;
+    const uint32_t lumaMarginY = reconPic->m_lumaMarginY;
+    const uint32_t chromaMarginX = reconPic->m_chromaMarginX;
+    const uint32_t chromaMarginY = reconPic->m_chromaMarginY;
+    const int hChromaShift = reconPic->m_hChromaShift;
+    const int vChromaShift = reconPic->m_vChromaShift;
+    const intptr_t stride = reconPic->m_stride;
+    const intptr_t strideC = reconPic->m_strideC;
+    pixel *pixY0 = reconPic->getLumaAddr(m_rowAddr);
+    // I400 must be checked here: per the original note, m_picOrg is uninitialized in that case, so chroma pointers stay NULL
+    pixel *pixU0 = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(m_rowAddr) : NULL;
+    pixel *pixV0 = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(m_rowAddr) : NULL;
+    const int realH = getCUHeight();
+
+    // Border extend Left and Right: one extendRowBorder call per plane over the realH lines of this row
+    primitives.extendRowBorder(pixY0, reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+    {
+        primitives.extendRowBorder(pixU0, strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, chromaMarginX);
+        primitives.extendRowBorder(pixV0, strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, chromaMarginX);
+    }
+
+    // Border extend Top (row 0 only): replicate the first line, starting at the left margin, upward marginY times
+    if (!m_row)
+    {
+        pixel *pixY = pixY0 - lumaMarginX;
+
+        for (uint32_t y = 0; y < lumaMarginY; y++)
+            memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
+
+        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+        {
+            pixel *pixU = pixU0 - chromaMarginX;
+            pixel *pixV = pixV0 - chromaMarginX;
+
+            for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
+            {
+                memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
+                memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
+            }
+        }
+    }
+
+    // Border extend Bottom (last row only): replicate the row's last line (index realH - 1) downward marginY times
+    if (m_row == m_frameFilter->m_numRows - 1)
+    {
+        pixel *pixY = pixY0 - lumaMarginX + (realH - 1) * stride;
+
+        for (uint32_t y = 0; y < lumaMarginY; y++)
+            memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
+
+        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+        {
+            pixel *pixU = pixU0 - chromaMarginX + ((realH >> vChromaShift) - 1) * strideC;
+            pixel *pixV = pixV0 - chromaMarginX + ((realH >> vChromaShift) - 1) * strideC;
+
+            for (uint32_t y = 0; y < chromaMarginY; y++)
+            {
+                memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
+                memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
+            }
+        }
+    }
+
+    // Update finished CU cursor: set this row's m_reconColCount entry to the last column index
+    m_frameFilter->m_frame->m_reconColCount[m_row].set(m_frameFilter->m_numCols - 1);
+}
+
 // NOTE: Single Threading only
 void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
 {
@@ -433,6 +547,75 @@
     }
 }
 
+void FrameFilter::ParallelFilter::processTasksRow(int /*workerThreadId*/)
+{   // Filters the CTUs of this row from m_lastCol to the last column in one call: deblock, SAO decision, then previous-row SAO and border work.
+    SAOParam* saoParam = m_encData->m_saoParam;
+    const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
+    const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
+    PicYuv* reconPic = m_encData->m_reconPic;
+    const int colStart = m_lastCol.get();
+    const int numCols = m_frameFilter->m_numCols;
+
+    // Avoid threading conflict: nothing to do when the column cursor is already at or past the row end
+    if (colStart >= numCols)
+        return;
+
+    // The previous row must already be finished here; NOTE(review): presumably guaranteed by the caller - confirm
+    if (m_frameFilter->m_param->bEnableLoopFilter)
+    {
+        for (uint32_t col = (uint32_t)colStart; col < (uint32_t)numCols; col++)
+        {
+            const uint32_t cuAddr = m_rowAddr + col;
+
+            const CUData* ctu = m_encData->getPicCTU(cuAddr);
+            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
+
+            if (col >= 1) // horizontal-edge pass lags one CTU behind the vertical-edge pass
+            {
+                const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
+                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+            }
+        }
+        // Process last column: its EDGE_HOR pass is not reached by the loop above because of the one-CTU lag
+        {
+            const uint32_t cuAddr = m_rowAddr + numCols - 1;
+            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
+            deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
+        }
+    }
+
+    if (m_frameFilter->m_param->bEnableSAO)
+    {
+        // Save SAO bottom row reference pixels (starting one column before colStart, clamped to 0)
+        copySaoAboveRefRow(reconPic, m_rowAddr + X265_MAX(0, colStart - 1), X265_MAX(0, colStart - 1));
+
+        m_sao.rdoSaoUnitRow(saoParam, m_rowAddr, X265_MAX(0, colStart - 2));
+
+        // Process Previous Row SAO CU: apply SAO on the row above, one pass per plane index 0..2
+        if (m_row >= 1)
+        {
+            const int saoProcessStartCol = X265_MAX(0, colStart - 3);
+
+            // Must delay 1 row to avoid thread data race conflict
+            m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[0], m_prevRow->m_row, saoProcessStartCol, 0);
+            m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[1], m_prevRow->m_row, saoProcessStartCol, 1);
+            m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[2], m_prevRow->m_row, saoProcessStartCol, 2);
+            m_prevRow->processSaoPcmRow(saoProcessStartCol);
+        }
+    }
+
+    if (m_row >= 1)
+    {
+        // TODO: process current row when SAO disabled
+        m_prevRow->processPostRow();
+    }
+
+    // Setting column sync counter: previous row's recon counter, then this row's deblocked-column counter
+    if (m_row >= 1)
+        m_frameFilter->m_frame->m_reconColCount[m_row - 1].set(numCols - 1);    // REMOVE soon
+    m_lastDeblocked.set(numCols);
+}
+
 void FrameFilter::processRow(int row)
 {
     ProfileScopeEvent(filterCTURow);
@@ -461,7 +644,7 @@
         X265_CHECK((row < 1) || m_parallelFilter[row - 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");
 
         m_parallelFilter[row].m_allowedCol.set(m_numCols);
-        m_parallelFilter[row].processTasks(-1);
+        m_parallelFilter[row].processTasksRow(-1);
 
         if (row == m_numRows - 1)
         {
@@ -480,11 +663,7 @@
             }
 
             // Process border extension on last row
-            for(int col = 0; col < m_numCols; col++)
-            {
-                // m_reconColCount will be set in processPostCu()
-                m_parallelFilter[row].processPostCu(col);
-            }
+            m_parallelFilter[row].processPostRow();
         }
     }
 
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Mon Jan 25 14:59:50 2016 +0530
+++ b/source/encoder/framefilter.h	Thu Feb 04 13:29:38 2016 +0800
@@ -88,15 +88,19 @@
         { }
 
         void processTasks(int workerThreadId);
+        void processTasksRow(int workerThreadId);
 
         // Apply SAO on a CU in current row
+        void processSaoPcmRow(int startCol);
         void processSaoUnitCu(SAOParam *saoParam, int col);
 
         // Copy and Save SAO reference pixels for SAO Rdo decide
         void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
+        void copySaoAboveRefRow(PicYuv* reconPic, uint32_t cuAddr, int col);
 
         // Post-Process (Border extension)
         void processPostCu(int col) const;
+        void processPostRow() const;
 
         uint32_t getCUHeight() const
         {
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Mon Jan 25 14:59:50 2016 +0530
+++ b/source/encoder/sao.cpp	Thu Feb 04 13:29:38 2016 +0800
@@ -595,6 +595,79 @@
     }
 }
 
+/* Apply SAO to the CTUs of row idxY, columns [startX, m_numCuInWidth), for one plane (0 = luma, non-zero = chroma) */
+void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int startX, int plane)
+{
+    PicYuv* reconPic = m_frame->m_reconPic;
+    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
+    uint32_t picWidth  = m_param->sourceWidth;
+    int ctuWidth  = g_maxCUSize;
+    int ctuHeight = g_maxCUSize;
+
+    if (plane) // chroma: scale the dimensions by the chroma shifts
+    {
+        picWidth  >>= m_hChromaShift;
+        ctuWidth  >>= m_hChromaShift;
+        ctuHeight >>= m_vChromaShift;
+    }
+
+    int addr = idxY * m_numCuInWidth;
+    pixel* rec = reconPic->getPlaneAddr(plane, addr);
+    // NOTE(review): when startX != 0, m_tmpL1 is not refreshed here and keeps whatever an earlier call left - confirm
+    if (startX == 0) // starting at the row head: capture the first pixel of ctuHeight + 1 lines into m_tmpL1
+    {
+        for (int i = 0; i < ctuHeight + 1; i++)
+        {
+            m_tmpL1[plane][i] = rec[0];
+            rec += stride;
+        }
+    }
+
+    for (int idxX = startX; idxX < m_numCuInWidth; idxX++)
+    {
+        addr = idxY * m_numCuInWidth + idxX;
+
+        bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT;
+        int typeIdx = ctuParam[addr].typeIdx;
+
+        if (idxX != (m_numCuInWidth - 1)) // not the last CTU: save its right-most pixel column before processSaoCu runs
+        {
+            rec = reconPic->getPlaneAddr(plane, addr);
+            for (int i = 0; i < ctuHeight + 1; i++)
+            {
+                m_tmpL2[plane][i] = rec[ctuWidth - 1];
+                rec += stride;
+            }
+        }
+
+        if (typeIdx >= 0) // a negative typeIdx leaves this CTU unfiltered
+        {
+            if (!mergeLeftFlag) // on merge-left the offset tables set up for the previous CTU are reused as-is
+            {
+                if (typeIdx == SAO_BO)
+                {
+                    memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0]));
+
+                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
+                        m_offsetBo[plane][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
+                }
+                else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
+                {
+                    int offset[NUM_EDGETYPE];
+                    offset[0] = 0;
+                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
+                        offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;
+
+                    for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
+                        m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
+                }
+            }
+            processSaoCu(addr, typeIdx, plane);
+        }
+        std::swap(m_tmpL1[plane], m_tmpL2[plane]); // the saved right column becomes the next CTU's left column
+    }
+}
+
 /* Process SAO unit */
 void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX)
 {
@@ -1361,6 +1434,157 @@
     }
 }
 
+void SAO::rdoSaoUnitRow(SAOParam* saoParam, int rowBaseAddr, int startCol)
+{   // SAO rate-distortion decision for one CTU row: addresses rowBaseAddr + [startCol, m_numCuInWidth); results go to saoParam->ctuParam.
+    double lambda[3] = {m_lumaLambda, m_chromaLambda, m_chromaLambda};
+    // planes is 1 for X265_CSP_I400 and 3 otherwise; every per-plane access to saoParam->ctuParam below is bounded by it
+    bool chroma = m_param->internalCsp != X265_CSP_I400;
+    int planes = chroma ? 3 : 1;
+    bool allowMerge[2] = {(startCol != 0), (rowBaseAddr != 0)}; // left, up
+//     int addrMerge[2] = {(startCol - 1), (rowBaseAddr ? startCol - m_numCuInWidth : -1)};// left, up
+
+    for(int idxX = startCol; idxX < m_numCuInWidth; idxX++)
+    {
+//         X265_CHECK((idxX ? idxX - 1 : -1) == addrMerge[0], "addrMerge[0] check failure");
+        const int addr = rowBaseAddr + idxX;
+
+        int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1)};// left, up
+
+        m_entropyCoder.load(m_rdContexts.cur);
+        if (allowMerge[0])
+            m_entropyCoder.codeSaoMerge(0);
+        if (allowMerge[1])
+            m_entropyCoder.codeSaoMerge(0);
+        m_entropyCoder.store(m_rdContexts.temp);
+
+        // reset stats Y, Cb, Cr
+        X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
+
+        // TODO: Confirm the address space is continuous
+        if (m_param->bSaoNonDeblocked)
+        {
+            memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
+            memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
+        }
+        else
+        {
+            memset(m_count, 0, sizeof(m_count));
+            memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
+        }
+
+        for (int i = 0; i < planes; i++)
+            saoParam->ctuParam[i][addr].reset();
+
+        if (saoParam->bSaoFlag[0])
+        {
+            calcSaoStatsCu(addr, 0);
+            saoStatsInitialOffset(0);
+        }
+
+        if (saoParam->bSaoFlag[1])
+        {
+            calcSaoStatsCu(addr, 1);
+            calcSaoStatsCu(addr, 2);
+            saoStatsInitialOffset(1);
+            //        saoStatsInitialOffset(2);
+        }
+
+        double mergeDist[NUM_MERGE_MODE] = { 0.0 };
+        saoLumaComponentParamDist(saoParam, addr, mergeDist);
+        if (chroma)
+            saoChromaComponentParamDist(saoParam, addr, mergeDist);
+
+        if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
+        {
+            // Cost of new SAO_params
+            m_entropyCoder.load(m_rdContexts.cur);
+            m_entropyCoder.resetBits();
+            if (allowMerge[0])
+                m_entropyCoder.codeSaoMerge(0);
+            if (allowMerge[1])
+                m_entropyCoder.codeSaoMerge(0);
+            for (int plane = 0; plane < planes; plane++)
+            {
+                if (saoParam->bSaoFlag[plane > 0])
+                    m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
+            }
+
+            uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
+            double bestCost = mergeDist[0] + (double)rate;
+            m_entropyCoder.store(m_rdContexts.temp);
+
+            // Cost of merge left or Up (index 0 = left neighbour, 1 = up neighbour)
+            for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
+            {
+                if (!allowMerge[mergeIdx])
+                    continue;
+
+                for (int plane = 0; plane < planes; plane++) // bounded by planes: the chroma ctuParam arrays must not be read for I400
+                {
+                    int64_t estDist = 0;
+                    SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
+                    int typeIdx = mergeSrcParam->typeIdx;
+                    if (typeIdx >= 0)
+                    {
+                        int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
+                        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
+                        {
+                            int mergeOffset = mergeSrcParam->offset[classIdx];
+                            estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]);
+                        }
+                    }
+
+                    mergeDist[mergeIdx + 1] += ((double)estDist / lambda[plane]);
+                }
+
+                // Bits spent on signalling this merge candidate, measured from the current context
+                m_entropyCoder.load(m_rdContexts.cur);
+                m_entropyCoder.resetBits();
+                if (allowMerge[0])
+                    m_entropyCoder.codeSaoMerge(1 - mergeIdx);
+                if (allowMerge[1] && (mergeIdx == 1))
+                    m_entropyCoder.codeSaoMerge(1);
+
+                rate = m_entropyCoder.getNumberOfWrittenBits();
+                double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
+                if (mergeCost < bestCost)
+                {
+                    SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
+                    bestCost = mergeCost;
+                    m_entropyCoder.store(m_rdContexts.temp);
+                    for (int plane = 0; plane < planes; plane++)
+                    {
+                        if (saoParam->bSaoFlag[plane > 0])
+                        {
+                            SaoCtuParam* dstCtuParam   = &saoParam->ctuParam[plane][addr];
+                            SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
+                            dstCtuParam->mergeMode = mergeMode;
+                            dstCtuParam->typeIdx   = mergeSrcParam->typeIdx;
+                            dstCtuParam->bandPos   = mergeSrcParam->bandPos;
+
+                            for (int i = 0; i < SAO_NUM_OFFSET; i++)
+                                dstCtuParam->offset[i] = mergeSrcParam->offset[i];
+                        }
+                    }
+                }
+            }
+
+            if (saoParam->ctuParam[0][addr].typeIdx < 0)
+                m_numNoSao[0]++;
+            if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)
+                m_numNoSao[1]++;
+            m_entropyCoder.load(m_rdContexts.temp);
+            m_entropyCoder.store(m_rdContexts.cur);
+        }
+
+        // Left merge still available after first CU
+        allowMerge[0] = true;
+
+        // next CU address
+        //addrMerge[0]++;
+        //addrMerge[1] += (rowBaseAddr ? 1 : 0);
+    }
+}
 
 // Rounds the division of initial offsets by the number of samples in
 // each of the statistics table entries.
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/sao.h
--- a/source/encoder/sao.h	Mon Jan 25 14:59:50 2016 +0530
+++ b/source/encoder/sao.h	Thu Feb 04 13:29:38 2016 +0800
@@ -132,7 +132,7 @@
 
     // CTU-based SAO process without slice granularity
     void processSaoCu(int addr, int typeIdx, int plane);
-    void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
+    void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int startX, int plane);
     void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
     void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
 
@@ -147,7 +147,7 @@
     inline int estIterOffset(int typeIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
                              int& currentDistortionTableBo, double& currentRdCostTableBo);
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
-//    void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+    void rdoSaoUnitRow(SAOParam* saoParam, int rowBaseAddr, int startCol);
     void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
 
     void saoStatsInitialOffset(int plane);



More information about the x265-devel mailing list