[x265] [PATCH 1 of 2] improve performance by full row process

Ashok Kumar Mishra ashok at multicorewareinc.com
Fri Feb 5 14:24:30 CET 2016


I think there is no point in having two functions for the same purpose:
processTasks() and processTasksRow(). We can use either processTasks() or
processTasksRow().

I don't know how much performance we gain by doing this, but from
my perspective we should also look into the complexity of the code
at the same time.

On Thu, Feb 4, 2016 at 10:59 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1454563778 -28800
> # Node ID ad8ebeffdda44378dd93b787215a937a26be980e
> # Parent  dc62b47dd0d98f732165345883edac55320baec1
> improve performance by full row process
> ---
>  source/encoder/framefilter.cpp |  197 +++++++++++++++++++++++++++++++++--
>  source/encoder/framefilter.h   |    4 +
>  source/encoder/sao.cpp         |  224
> ++++++++++++++++++++++++++++++++++++++++
>  source/encoder/sao.h           |    4 +-
>  4 files changed, 418 insertions(+), 11 deletions(-)
>
> diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp    Mon Jan 25 14:59:50 2016 +0530
> +++ b/source/encoder/framefilter.cpp    Thu Feb 04 13:29:38 2016 +0800
> @@ -174,6 +174,22 @@
>          restoreOrigLosslessYuv(cu, frame, absPartIdx);
>  }
>
> +void FrameFilter::ParallelFilter::processSaoPcmRow(int startCol)
> +{
> +    if (m_encData->m_slice->m_pps->bTransquantBypassEnabled)
> +    {
> +        const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
> +        const uint32_t* ctuGeomMap =
> m_frameFilter->m_frameEncoder->m_ctuGeomMap;
> +
> +        for (int col = startCol; col < m_frameFilter->m_numCols; col++)
> +        {
> +            uint32_t cuAddr = m_rowAddr + col;
> +            const CUData* ctu = m_encData->getPicCTU(cuAddr);
> +            origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]],
> *m_frameFilter->m_frame);
> +        }
> +    }
> +}
> +
>  void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic,
> uint32_t cuAddr, int col)
>  {
>      // Copy SAO Top Reference Pixels
> @@ -182,7 +198,7 @@
>
>      // Luma
>      memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth *
> sizeof(pixel));
> -    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth *
> ctuWidth, "m_tmpU buffer beyond bound write detected");
> +    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth *
> ctuWidth, "m_tmpU buffer write beyond bound detected");
>
>      // Chroma
>      if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> @@ -194,7 +210,32 @@
>          memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth *
> sizeof(pixel));
>          memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth *
> sizeof(pixel));
>
> -        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth *
> ctuWidth, "m_tmpU buffer beyond bound write detected");
> +        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth *
> ctuWidth, "m_tmpU buffer write beyond bound detected");
> +    }
> +}
> +
> +void FrameFilter::ParallelFilter::copySaoAboveRefRow(PicYuv* reconPic,
> uint32_t cuAddr, int col)
> +{
> +    // Copy SAO Top Reference Pixels
> +    int ctuWidth  = g_maxCUSize;
> +    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr ==
> 0 ? 0 : reconPic->m_stride);
> +    const int cntCols = (m_frameFilter->m_numCols - col);
> +
> +    // Luma
> +    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, cntCols * ctuWidth *
> sizeof(pixel));
> +    X265_CHECK(col * ctuWidth + cntCols * ctuWidth <=
> m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound
> detected");
> +
> +    // Chroma
> +    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> +    {
> +        ctuWidth  >>= m_sao.m_hChromaShift;
> +
> +        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) -
> (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
> +        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) -
> (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
> +        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, cntCols * ctuWidth
> * sizeof(pixel));
> +        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, cntCols * ctuWidth
> * sizeof(pixel));
> +
> +        X265_CHECK(col * ctuWidth + cntCols * ctuWidth <=
> m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound detected");
>      }
>  }
>
> @@ -243,7 +284,7 @@
>      const intptr_t stride = reconPic->m_stride;
>      const intptr_t strideC = reconPic->m_strideC;
>      pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr);
> -    // // MUST BE check I400 since m_picOrg uninitialize in that case
> +    // MUST BE check I400 since m_picOrg uninitialize in that case
>      pixel *pixU = (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> ? reconPic->getCbAddr(lineStartCUAddr) : NULL;
>      pixel *pixV = (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> ? reconPic->getCrAddr(lineStartCUAddr) : NULL;
>      int copySizeY = realW;
> @@ -312,6 +353,79 @@
>      }
>  }
>
> +void FrameFilter::ParallelFilter::processPostRow() const
> +{
> +
> +    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;
> +
> +    const uint32_t lumaMarginX = reconPic->m_lumaMarginX;
> +    const uint32_t lumaMarginY = reconPic->m_lumaMarginY;
> +    const uint32_t chromaMarginX = reconPic->m_chromaMarginX;
> +    const uint32_t chromaMarginY = reconPic->m_chromaMarginY;
> +    const int hChromaShift = reconPic->m_hChromaShift;
> +    const int vChromaShift = reconPic->m_vChromaShift;
> +    const intptr_t stride = reconPic->m_stride;
> +    const intptr_t strideC = reconPic->m_strideC;
> +    pixel *pixY0 = reconPic->getLumaAddr(m_rowAddr);
> +    // MUST BE check I400 since m_picOrg uninitialize in that case
> +    pixel *pixU0 = (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> ? reconPic->getCbAddr(m_rowAddr) : NULL;
> +    pixel *pixV0 = (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> ? reconPic->getCrAddr(m_rowAddr) : NULL;
> +    const int realH = getCUHeight();
> +
> +    // Border extend Left and Right
> +    primitives.extendRowBorder(pixY0, reconPic->m_stride,
> reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
> +    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> +    {
> +        primitives.extendRowBorder(pixU0, strideC, reconPic->m_picWidth
> >> hChromaShift, realH >> vChromaShift, chromaMarginX);
> +        primitives.extendRowBorder(pixV0, strideC, reconPic->m_picWidth
> >> hChromaShift, realH >> vChromaShift, chromaMarginX);
> +    }
> +
> +    // Border extend Top
> +    if (!m_row)
> +    {
> +        pixel *pixY = pixY0 - lumaMarginX;
> +
> +        for (uint32_t y = 0; y < lumaMarginY; y++)
> +            memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
> +
> +        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> +        {
> +            pixel *pixU = pixU0 - chromaMarginX;
> +            pixel *pixV = pixV0 - chromaMarginX;
> +
> +            for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
> +            {
> +                memcpy(pixU - (y + 1) * strideC, pixU, strideC *
> sizeof(pixel));
> +                memcpy(pixV - (y + 1) * strideC, pixV, strideC *
> sizeof(pixel));
> +            }
> +        }
> +    }
> +
> +    // Border extend Bottom
> +    if (m_row == m_frameFilter->m_numRows - 1)
> +    {
> +        pixel *pixY = pixY0 - lumaMarginX + (realH - 1) * stride;
> +
> +        for (uint32_t y = 0; y < lumaMarginY; y++)
> +            memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
> +
> +        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
> +        {
> +            pixel *pixU = pixU0 - chromaMarginX + ((realH >>
> vChromaShift) - 1) * strideC;
> +            pixel *pixV = pixV0 - chromaMarginX + ((realH >>
> vChromaShift) - 1) * strideC;
> +
> +            for (uint32_t y = 0; y < chromaMarginY; y++)
> +            {
> +                memcpy(pixU + (y + 1) * strideC, pixU, strideC *
> sizeof(pixel));
> +                memcpy(pixV + (y + 1) * strideC, pixV, strideC *
> sizeof(pixel));
> +            }
> +        }
> +    }
> +
> +    // Update finished CU cursor
> +
> m_frameFilter->m_frame->m_reconColCount[m_row].set(m_frameFilter->m_numCols
> - 1);
> +}
> +
>  // NOTE: Single Threading only
>  void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
>  {
> @@ -433,6 +547,75 @@
>      }
>  }
>
> +void FrameFilter::ParallelFilter::processTasksRow(int /*workerThreadId*/)
> +{
> +    SAOParam* saoParam = m_encData->m_saoParam;
> +    const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
> +    const uint32_t* ctuGeomMap =
> m_frameFilter->m_frameEncoder->m_ctuGeomMap;
> +    PicYuv* reconPic = m_encData->m_reconPic;
> +    const int colStart = m_lastCol.get();
> +    const int numCols = m_frameFilter->m_numCols;
> +
> +    // Avoid threading conflict
> +    if (colStart >= numCols)
> +        return;
> +
> +    // Previous row MUST BE finish
> +    if (m_frameFilter->m_param->bEnableLoopFilter)
> +    {
> +        for (uint32_t col = (uint32_t)colStart; col < (uint32_t)numCols;
> col++)
> +        {
> +            const uint32_t cuAddr = m_rowAddr + col;
> +
> +            const CUData* ctu = m_encData->getPicCTU(cuAddr);
> +            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]],
> Deblock::EDGE_VER);
> +
> +            if (col >= 1)
> +            {
> +                const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
> +                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]],
> Deblock::EDGE_HOR);
> +            }
> +        }
> +        // Process last column
> +        {
> +            const uint32_t cuAddr = m_rowAddr + numCols - 1;
> +            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
> +            deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]],
> Deblock::EDGE_HOR);
> +        }
> +    }
> +
> +    if (m_frameFilter->m_param->bEnableSAO)
> +    {
> +        // Save SAO bottom row reference pixels
> +        copySaoAboveRefRow(reconPic, m_rowAddr + X265_MAX(0, colStart -
> 1), X265_MAX(0, colStart - 1));
> +
> +        m_sao.rdoSaoUnitRow(saoParam, m_rowAddr, X265_MAX(0, colStart -
> 2));
> +
> +        // Process Previous Row SAO CU
> +        if (m_row >= 1)
> +        {
> +            const int saoProcessStartCol = X265_MAX(0, colStart - 3);
> +
> +            // Must delay 1 row to avoid thread data race conflict
> +            m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[0],
> m_prevRow->m_row, saoProcessStartCol, 0);
> +            m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[1],
> m_prevRow->m_row, saoProcessStartCol, 1);
> +            m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[2],
> m_prevRow->m_row, saoProcessStartCol, 2);
> +            m_prevRow->processSaoPcmRow(saoProcessStartCol);
> +        }
> +    }
> +
> +    if (m_row >= 1)
> +    {
> +        // TODO: process current row when SAO disabled
> +        m_prevRow->processPostRow();
> +    }
> +
> +    // Setting column sync counter
> +    if (m_row >= 1)
> +        m_frameFilter->m_frame->m_reconColCount[m_row - 1].set(numCols -
> 1);    // REMOVE soon
> +    m_lastDeblocked.set(numCols);
> +}
> +
>  void FrameFilter::processRow(int row)
>  {
>      ProfileScopeEvent(filterCTURow);
> @@ -461,7 +644,7 @@
>          X265_CHECK((row < 1) || m_parallelFilter[row -
> 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");
>
>          m_parallelFilter[row].m_allowedCol.set(m_numCols);
> -        m_parallelFilter[row].processTasks(-1);
> +        m_parallelFilter[row].processTasksRow(-1);
>
>          if (row == m_numRows - 1)
>          {
> @@ -480,11 +663,7 @@
>              }
>
>              // Process border extension on last row
> -            for(int col = 0; col < m_numCols; col++)
> -            {
> -                // m_reconColCount will be set in processPostCu()
> -                m_parallelFilter[row].processPostCu(col);
> -            }
> +            m_parallelFilter[row].processPostRow();
>          }
>      }
>
> diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h      Mon Jan 25 14:59:50 2016 +0530
> +++ b/source/encoder/framefilter.h      Thu Feb 04 13:29:38 2016 +0800
> @@ -88,15 +88,19 @@
>          { }
>
>          void processTasks(int workerThreadId);
> +        void processTasksRow(int workerThreadId);
>
>          // Apply SAO on a CU in current row
> +        void processSaoPcmRow(int startCol);
>          void processSaoUnitCu(SAOParam *saoParam, int col);
>
>          // Copy and Save SAO reference pixels for SAO Rdo decide
>          void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
> +        void copySaoAboveRefRow(PicYuv* reconPic, uint32_t cuAddr, int
> col);
>
>          // Post-Process (Border extension)
>          void processPostCu(int col) const;
> +        void processPostRow() const;
>
>          uint32_t getCUHeight() const
>          {
> diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp    Mon Jan 25 14:59:50 2016 +0530
> +++ b/source/encoder/sao.cpp    Thu Feb 04 13:29:38 2016 +0800
> @@ -595,6 +595,79 @@
>      }
>  }
>
> +/* Process SAO all units */
> +void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int startX,
> int plane)
> +{
> +    PicYuv* reconPic = m_frame->m_reconPic;
> +    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
> +    uint32_t picWidth  = m_param->sourceWidth;
> +    int ctuWidth  = g_maxCUSize;
> +    int ctuHeight = g_maxCUSize;
> +
> +    if (plane)
> +    {
> +        picWidth  >>= m_hChromaShift;
> +        ctuWidth  >>= m_hChromaShift;
> +        ctuHeight >>= m_vChromaShift;
> +    }
> +
> +    int addr = idxY * m_numCuInWidth;
> +    pixel* rec = reconPic->getPlaneAddr(plane, addr);
> +
> +    if (startX == 0)
> +    {
> +        for (int i = 0; i < ctuHeight + 1; i++)
> +        {
> +            m_tmpL1[plane][i] = rec[0];
> +            rec += stride;
> +        }
> +    }
> +
> +    for (int idxX = startX; idxX < m_numCuInWidth; idxX++)
> +    {
> +        addr = idxY * m_numCuInWidth + idxX;
> +
> +        bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT;
> +        int typeIdx = ctuParam[addr].typeIdx;
> +
> +        if (idxX != (m_numCuInWidth - 1))
> +        {
> +            rec = reconPic->getPlaneAddr(plane, addr);
> +            for (int i = 0; i < ctuHeight + 1; i++)
> +            {
> +                m_tmpL2[plane][i] = rec[ctuWidth - 1];
> +                rec += stride;
> +            }
> +        }
> +
> +        if (typeIdx >= 0)
> +        {
> +            if (!mergeLeftFlag)
> +            {
> +                if (typeIdx == SAO_BO)
> +                {
> +                    memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0]));
> +
> +                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
> +                        m_offsetBo[plane][((ctuParam[addr].bandPos + i) &
> (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] <<
> SAO_BIT_INC);
> +                }
> +                else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 ||
> typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
> +                {
> +                    int offset[NUM_EDGETYPE];
> +                    offset[0] = 0;
> +                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
> +                        offset[i + 1] = ctuParam[addr].offset[i] <<
> SAO_BIT_INC;
> +
> +                    for (int edgeType = 0; edgeType < NUM_EDGETYPE;
> edgeType++)
> +                        m_offsetEo[plane][edgeType] =
> (int8_t)offset[s_eoTable[edgeType]];
> +                }
> +            }
> +            processSaoCu(addr, typeIdx, plane);
> +        }
> +        std::swap(m_tmpL1[plane], m_tmpL2[plane]);
> +    }
> +}
> +
>  /* Process SAO unit */
>  void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX)
>  {
> @@ -1361,6 +1434,157 @@
>      }
>  }
>
> +void SAO::rdoSaoUnitRow(SAOParam* saoParam, int rowBaseAddr, int startCol)
> +{
> +    double lambda[3] = {m_lumaLambda, m_chromaLambda, m_chromaLambda};
> +
> +    bool chroma = m_param->internalCsp != X265_CSP_I400;
> +    int planes = chroma ? 3 : 1;
> +    bool allowMerge[2] = {(startCol != 0), (rowBaseAddr != 0)}; // left,
> up
> +//     int addrMerge[2] = {(startCol - 1), (rowBaseAddr ? startCol -
> m_numCuInWidth : -1)};// left, up
> +
> +    for(int idxX = startCol; idxX < m_numCuInWidth; idxX++)
> +    {
> +//         X265_CHECK((idxX ? idxX - 1 : -1) == addrMerge[0],
> "addrMerge[0] check failure");
> +        const int addr = rowBaseAddr + idxX;
> +
> +        int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr -
> m_numCuInWidth : -1)};// left, up
> +
> +        m_entropyCoder.load(m_rdContexts.cur);
> +        if (allowMerge[0])
> +            m_entropyCoder.codeSaoMerge(0);
> +        if (allowMerge[1])
> +            m_entropyCoder.codeSaoMerge(0);
> +        m_entropyCoder.store(m_rdContexts.temp);
> +
> +        // reset stats Y, Cb, Cr
> +        X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE *
> MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct
> PerPlane");
> +
> +        // TODO: Confirm the address space is continuous
> +        if (m_param->bSaoNonDeblocked)
> +        {
> +            memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
> +            memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr],
> sizeof(m_offsetOrg));
> +        }
> +        else
> +        {
> +            memset(m_count, 0, sizeof(m_count));
> +            memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
> +        }
> +
> +        for (int i = 0; i < planes; i++)
> +            saoParam->ctuParam[i][addr].reset();
> +
> +        if (saoParam->bSaoFlag[0])
> +        {
> +            calcSaoStatsCu(addr, 0);
> +            saoStatsInitialOffset(0);
> +        }
> +
> +        if (saoParam->bSaoFlag[1])
> +        {
> +            calcSaoStatsCu(addr, 1);
> +            calcSaoStatsCu(addr, 2);
> +            saoStatsInitialOffset(1);
> +            //        saoStatsInitialOffset(2);
> +        }
> +
> +        double mergeDist[NUM_MERGE_MODE] = { 0.0 };
> +        saoLumaComponentParamDist(saoParam, addr, mergeDist);
> +        if (chroma)
> +            saoChromaComponentParamDist(saoParam, addr, mergeDist);
> +
> +        if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
> +        {
> +            // Cost of new SAO_params
> +            m_entropyCoder.load(m_rdContexts.cur);
> +            m_entropyCoder.resetBits();
> +            if (allowMerge[0])
> +                m_entropyCoder.codeSaoMerge(0);
> +            if (allowMerge[1])
> +                m_entropyCoder.codeSaoMerge(0);
> +            for (int plane = 0; plane < planes; plane++)
> +            {
> +                if (saoParam->bSaoFlag[plane > 0])
> +
> m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
> +            }
> +
> +            uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
> +            double bestCost = mergeDist[0] + (double)rate;
> +            m_entropyCoder.store(m_rdContexts.temp);
> +
> +            // Cost of merge left or Up
> +            for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
> +            {
> +                if (!allowMerge[mergeIdx])
> +                    continue;
> +
> +                for (int plane = 0; plane < 3; plane++)
> +                {
> +                    int64_t estDist = 0;
> +                    SaoCtuParam* mergeSrcParam =
> &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
> +                    int typeIdx = mergeSrcParam->typeIdx;
> +                    if (typeIdx >= 0)
> +                    {
> +                        int bandPos = (typeIdx == SAO_BO) ?
> mergeSrcParam->bandPos : 0;
> +                        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET;
> classIdx++)
> +                        {
> +                            int mergeOffset =
> mergeSrcParam->offset[classIdx];
> +                            estDist +=
> estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset,
> m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]);
> +                        }
> +                    }
> +
> +                    mergeDist[mergeIdx + 1] += ((double)estDist /
> lambda[plane]);
> +                }
> +
> +
> +                m_entropyCoder.load(m_rdContexts.cur);
> +                m_entropyCoder.resetBits();
> +                if (allowMerge[0])
> +                    m_entropyCoder.codeSaoMerge(1 - mergeIdx);
> +                if (allowMerge[1] && (mergeIdx == 1))
> +                    m_entropyCoder.codeSaoMerge(1);
> +
> +                rate = m_entropyCoder.getNumberOfWrittenBits();
> +                double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
> +                if (mergeCost < bestCost)
> +                {
> +                    SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP :
> SAO_MERGE_LEFT;
> +                    bestCost = mergeCost;
> +                    m_entropyCoder.store(m_rdContexts.temp);
> +                    for (int plane = 0; plane < planes; plane++)
> +                    {
> +                        if (saoParam->bSaoFlag[plane > 0])
> +                        {
> +                            SaoCtuParam* dstCtuParam   =
> &saoParam->ctuParam[plane][addr];
> +                            SaoCtuParam* mergeSrcParam =
> &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
> +                            dstCtuParam->mergeMode = mergeMode;
> +                            dstCtuParam->typeIdx   =
> mergeSrcParam->typeIdx;
> +                            dstCtuParam->bandPos   =
> mergeSrcParam->bandPos;
> +
> +                            for (int i = 0; i < SAO_NUM_OFFSET; i++)
> +                                dstCtuParam->offset[i] =
> mergeSrcParam->offset[i];
> +                        }
> +                    }
> +                }
> +            }
> +
> +            if (saoParam->ctuParam[0][addr].typeIdx < 0)
> +                m_numNoSao[0]++;
> +            if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)
> +                m_numNoSao[1]++;
> +            m_entropyCoder.load(m_rdContexts.temp);
> +            m_entropyCoder.store(m_rdContexts.cur);
> +        }
> +
> +        // Left merge still available after first CU
> +        allowMerge[0] = true;
> +
> +        // next CU address
> +        //addrMerge[0]++;
> +        //addrMerge[1] += (rowBaseAddr ? 1 : 0);
> +    }
> +}
>
>  // Rounds the division of initial offsets by the number of samples in
>  // each of the statistics table entries.
> diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/sao.h
> --- a/source/encoder/sao.h      Mon Jan 25 14:59:50 2016 +0530
> +++ b/source/encoder/sao.h      Thu Feb 04 13:29:38 2016 +0800
> @@ -132,7 +132,7 @@
>
>      // CTU-based SAO process without slice granularity
>      void processSaoCu(int addr, int typeIdx, int plane);
> -    void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
> +    void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int startX,
> int plane);
>      void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
>      void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int
> idxX);
>
> @@ -147,7 +147,7 @@
>      inline int estIterOffset(int typeIdx, double lambda, int offset,
> int32_t count, int32_t offsetOrg,
>                               int& currentDistortionTableBo, double&
> currentRdCostTableBo);
>      void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
> -//    void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
> +    void rdoSaoUnitRow(SAOParam* saoParam, int rowBaseAddr, int startCol);
>      void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int
> addr);
>
>      void saoStatsInitialOffset(int plane);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160205/5e5abdcf/attachment-0001.html>


More information about the x265-devel mailing list