<div dir="ltr"><div>I think there is no point in using two functions for the same purpose:</div><div>processTasks() and processTasksRow(). We can use either processTasks() or processTasksRow().</div><div><br></div><div>I don't know how much performance we gain by doing this, but from my perspective we should </div><div>look into the complexity of the code at the same time. </div></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Feb 4, 2016 at 10:59 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1454563778 -28800<br>
# Node ID ad8ebeffdda44378dd93b787215a937a26be980e<br>
# Parent dc62b47dd0d98f732165345883edac55320baec1<br>
improve performance by full row process<br>
---<br>
source/encoder/framefilter.cpp | 197 +++++++++++++++++++++++++++++++++--<br>
source/encoder/framefilter.h | 4 +<br>
source/encoder/sao.cpp | 224 ++++++++++++++++++++++++++++++++++++++++<br>
source/encoder/sao.h | 4 +-<br>
4 files changed, 418 insertions(+), 11 deletions(-)<br>
<br>
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.cpp<br>
--- a/source/encoder/framefilter.cpp Mon Jan 25 14:59:50 2016 +0530<br>
+++ b/source/encoder/framefilter.cpp Thu Feb 04 13:29:38 2016 +0800<br>
@@ -174,6 +174,22 @@<br>
restoreOrigLosslessYuv(cu, frame, absPartIdx);<br>
}<br>
<br>
+void FrameFilter::ParallelFilter::processSaoPcmRow(int startCol)<br>
+{<br>
+ if (m_encData->m_slice->m_pps->bTransquantBypassEnabled)<br>
+ {<br>
+ const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;<br>
+ const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;<br>
+<br>
+ for (int col = startCol; col < m_frameFilter->m_numCols; col++)<br>
+ {<br>
+ uint32_t cuAddr = m_rowAddr + col;<br>
+ const CUData* ctu = m_encData->getPicCTU(cuAddr);<br>
+ origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameFilter->m_frame);<br>
+ }<br>
+ }<br>
+}<br>
+<br>
void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)<br>
{<br>
// Copy SAO Top Reference Pixels<br>
@@ -182,7 +198,7 @@<br>
<br>
// Luma<br>
memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));<br>
- X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");<br>
+ X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");<br>
<br>
// Chroma<br>
if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)<br>
@@ -194,7 +210,32 @@<br>
memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));<br>
memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));<br>
<br>
- X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");<br>
+ X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");<br>
+ }<br>
+}<br>
+<br>
+void FrameFilter::ParallelFilter::copySaoAboveRefRow(PicYuv* reconPic, uint32_t cuAddr, int col)<br>
+{<br>
+ // Copy SAO Top Reference Pixels<br>
+ int ctuWidth = g_maxCUSize;<br>
+ const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);<br>
+ const int cntCols = (m_frameFilter->m_numCols - col);<br>
+<br>
+ // Luma<br>
+ memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, cntCols * ctuWidth * sizeof(pixel));<br>
+ X265_CHECK(col * ctuWidth + cntCols * ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected");<br>
+<br>
+ // Chroma<br>
+ if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)<br>
+ {<br>
+ ctuWidth >>= m_sao.m_hChromaShift;<br>
+<br>
+ const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);<br>
+ const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);<br>
+ memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, cntCols * ctuWidth * sizeof(pixel));<br>
+ memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, cntCols * ctuWidth * sizeof(pixel));<br>
+<br>
+ X265_CHECK(col * ctuWidth + cntCols * ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound detected");<br>
}<br>
}<br>
<br>
@@ -243,7 +284,7 @@<br>
const intptr_t stride = reconPic->m_stride;<br>
const intptr_t strideC = reconPic->m_strideC;<br>
pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr);<br>
- // // MUST BE check I400 since m_picOrg uninitialize in that case<br>
+ // MUST BE check I400 since m_picOrg uninitialize in that case<br>
pixel *pixU = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(lineStartCUAddr) : NULL;<br>
pixel *pixV = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(lineStartCUAddr) : NULL;<br>
int copySizeY = realW;<br>
@@ -312,6 +353,79 @@<br>
}<br>
}<br>
<br>
+void FrameFilter::ParallelFilter::processPostRow() const<br>
+{<br>
+<br>
+ PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;<br>
+<br>
+ const uint32_t lumaMarginX = reconPic->m_lumaMarginX;<br>
+ const uint32_t lumaMarginY = reconPic->m_lumaMarginY;<br>
+ const uint32_t chromaMarginX = reconPic->m_chromaMarginX;<br>
+ const uint32_t chromaMarginY = reconPic->m_chromaMarginY;<br>
+ const int hChromaShift = reconPic->m_hChromaShift;<br>
+ const int vChromaShift = reconPic->m_vChromaShift;<br>
+ const intptr_t stride = reconPic->m_stride;<br>
+ const intptr_t strideC = reconPic->m_strideC;<br>
+ pixel *pixY0 = reconPic->getLumaAddr(m_rowAddr);<br>
+ // MUST BE check I400 since m_picOrg uninitialize in that case<br>
+ pixel *pixU0 = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(m_rowAddr) : NULL;<br>
+ pixel *pixV0 = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(m_rowAddr) : NULL;<br>
+ const int realH = getCUHeight();<br>
+<br>
+ // Border extend Left and Right<br>
+ primitives.extendRowBorder(pixY0, reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);<br>
+ if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)<br>
+ {<br>
+ primitives.extendRowBorder(pixU0, strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, chromaMarginX);<br>
+ primitives.extendRowBorder(pixV0, strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, chromaMarginX);<br>
+ }<br>
+<br>
+ // Border extend Top<br>
+ if (!m_row)<br>
+ {<br>
+ pixel *pixY = pixY0 - lumaMarginX;<br>
+<br>
+ for (uint32_t y = 0; y < lumaMarginY; y++)<br>
+ memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));<br>
+<br>
+ if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)<br>
+ {<br>
+ pixel *pixU = pixU0 - chromaMarginX;<br>
+ pixel *pixV = pixV0 - chromaMarginX;<br>
+<br>
+ for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)<br>
+ {<br>
+ memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));<br>
+ memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));<br>
+ }<br>
+ }<br>
+ }<br>
+<br>
+ // Border extend Bottom<br>
+ if (m_row == m_frameFilter->m_numRows - 1)<br>
+ {<br>
+ pixel *pixY = pixY0 - lumaMarginX + (realH - 1) * stride;<br>
+<br>
+ for (uint32_t y = 0; y < lumaMarginY; y++)<br>
+ memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));<br>
+<br>
+ if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)<br>
+ {<br>
+ pixel *pixU = pixU0 - chromaMarginX + ((realH >> vChromaShift) - 1) * strideC;<br>
+ pixel *pixV = pixV0 - chromaMarginX + ((realH >> vChromaShift) - 1) * strideC;<br>
+<br>
+ for (uint32_t y = 0; y < chromaMarginY; y++)<br>
+ {<br>
+ memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));<br>
+ memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));<br>
+ }<br>
+ }<br>
+ }<br>
+<br>
+ // Update finished CU cursor<br>
+ m_frameFilter->m_frame->m_reconColCount[m_row].set(m_frameFilter->m_numCols - 1);<br>
+}<br>
+<br>
// NOTE: Single Threading only<br>
void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)<br>
{<br>
@@ -433,6 +547,75 @@<br>
}<br>
}<br>
<br>
+void FrameFilter::ParallelFilter::processTasksRow(int /*workerThreadId*/)<br>
+{<br>
+ SAOParam* saoParam = m_encData->m_saoParam;<br>
+ const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;<br>
+ const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;<br>
+ PicYuv* reconPic = m_encData->m_reconPic;<br>
+ const int colStart = m_lastCol.get();<br>
+ const int numCols = m_frameFilter->m_numCols;<br>
+<br>
+ // Avoid threading conflict<br>
+ if (colStart >= numCols)<br>
+ return;<br>
+<br>
+ // Previous row MUST BE finish<br>
+ if (m_frameFilter->m_param->bEnableLoopFilter)<br>
+ {<br>
+ for (uint32_t col = (uint32_t)colStart; col < (uint32_t)numCols; col++)<br>
+ {<br>
+ const uint32_t cuAddr = m_rowAddr + col;<br>
+<br>
+ const CUData* ctu = m_encData->getPicCTU(cuAddr);<br>
+ deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);<br>
+<br>
+ if (col >= 1)<br>
+ {<br>
+ const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);<br>
+ deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);<br>
+ }<br>
+ }<br>
+ // Process last column<br>
+ {<br>
+ const uint32_t cuAddr = m_rowAddr + numCols - 1;<br>
+ const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);<br>
+ deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);<br>
+ }<br>
+ }<br>
+<br>
+ if (m_frameFilter->m_param->bEnableSAO)<br>
+ {<br>
+ // Save SAO bottom row reference pixels<br>
+ copySaoAboveRefRow(reconPic, m_rowAddr + X265_MAX(0, colStart - 1), X265_MAX(0, colStart - 1));<br>
+<br>
+ m_sao.rdoSaoUnitRow(saoParam, m_rowAddr, X265_MAX(0, colStart - 2));<br>
+<br>
+ // Process Previous Row SAO CU<br>
+ if (m_row >= 1)<br>
+ {<br>
+ const int saoProcessStartCol = X265_MAX(0, colStart - 3);<br>
+<br>
+ // Must delay 1 row to avoid thread data race conflict<br>
+ m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[0], m_prevRow->m_row, saoProcessStartCol, 0);<br>
+ m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[1], m_prevRow->m_row, saoProcessStartCol, 1);<br>
+ m_prevRow->m_sao.processSaoUnitRow(saoParam->ctuParam[2], m_prevRow->m_row, saoProcessStartCol, 2);<br>
+ m_prevRow->processSaoPcmRow(saoProcessStartCol);<br>
+ }<br>
+ }<br>
+<br>
+ if (m_row >= 1)<br>
+ {<br>
+ // TODO: process current row when SAO disabled<br>
+ m_prevRow->processPostRow();<br>
+ }<br>
+<br>
+ // Setting column sync counter<br>
+ if (m_row >= 1)<br>
+ m_frameFilter->m_frame->m_reconColCount[m_row - 1].set(numCols - 1); // REMOVE soon<br>
+ m_lastDeblocked.set(numCols);<br>
+}<br>
+<br>
void FrameFilter::processRow(int row)<br>
{<br>
ProfileScopeEvent(filterCTURow);<br>
@@ -461,7 +644,7 @@<br>
X265_CHECK((row < 1) || m_parallelFilter[row - 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");<br>
<br>
m_parallelFilter[row].m_allowedCol.set(m_numCols);<br>
- m_parallelFilter[row].processTasks(-1);<br>
+ m_parallelFilter[row].processTasksRow(-1);<br>
<br>
if (row == m_numRows - 1)<br>
{<br>
@@ -480,11 +663,7 @@<br>
}<br>
<br>
// Process border extension on last row<br>
- for(int col = 0; col < m_numCols; col++)<br>
- {<br>
- // m_reconColCount will be set in processPostCu()<br>
- m_parallelFilter[row].processPostCu(col);<br>
- }<br>
+ m_parallelFilter[row].processPostRow();<br>
}<br>
}<br>
<br>
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.h<br>
--- a/source/encoder/framefilter.h Mon Jan 25 14:59:50 2016 +0530<br>
+++ b/source/encoder/framefilter.h Thu Feb 04 13:29:38 2016 +0800<br>
@@ -88,15 +88,19 @@<br>
{ }<br>
<br>
void processTasks(int workerThreadId);<br>
+ void processTasksRow(int workerThreadId);<br>
<br>
// Apply SAO on a CU in current row<br>
+ void processSaoPcmRow(int startCol);<br>
void processSaoUnitCu(SAOParam *saoParam, int col);<br>
<br>
// Copy and Save SAO reference pixels for SAO Rdo decide<br>
void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);<br>
+ void copySaoAboveRefRow(PicYuv* reconPic, uint32_t cuAddr, int col);<br>
<br>
// Post-Process (Border extension)<br>
void processPostCu(int col) const;<br>
+ void processPostRow() const;<br>
<br>
uint32_t getCUHeight() const<br>
{<br>
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/sao.cpp<br>
--- a/source/encoder/sao.cpp Mon Jan 25 14:59:50 2016 +0530<br>
+++ b/source/encoder/sao.cpp Thu Feb 04 13:29:38 2016 +0800<br>
@@ -595,6 +595,79 @@<br>
}<br>
}<br>
<br>
+/* Process SAO all units */<br>
+void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int startX, int plane)<br>
+{<br>
+ PicYuv* reconPic = m_frame->m_reconPic;<br>
+ intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;<br>
+ uint32_t picWidth = m_param->sourceWidth;<br>
+ int ctuWidth = g_maxCUSize;<br>
+ int ctuHeight = g_maxCUSize;<br>
+<br>
+ if (plane)<br>
+ {<br>
+ picWidth >>= m_hChromaShift;<br>
+ ctuWidth >>= m_hChromaShift;<br>
+ ctuHeight >>= m_vChromaShift;<br>
+ }<br>
+<br>
+ int addr = idxY * m_numCuInWidth;<br>
+ pixel* rec = reconPic->getPlaneAddr(plane, addr);<br>
+<br>
+ if (startX == 0)<br>
+ {<br>
+ for (int i = 0; i < ctuHeight + 1; i++)<br>
+ {<br>
+ m_tmpL1[plane][i] = rec[0];<br>
+ rec += stride;<br>
+ }<br>
+ }<br>
+<br>
+ for (int idxX = startX; idxX < m_numCuInWidth; idxX++)<br>
+ {<br>
+ addr = idxY * m_numCuInWidth + idxX;<br>
+<br>
+ bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT;<br>
+ int typeIdx = ctuParam[addr].typeIdx;<br>
+<br>
+ if (idxX != (m_numCuInWidth - 1))<br>
+ {<br>
+ rec = reconPic->getPlaneAddr(plane, addr);<br>
+ for (int i = 0; i < ctuHeight + 1; i++)<br>
+ {<br>
+ m_tmpL2[plane][i] = rec[ctuWidth - 1];<br>
+ rec += stride;<br>
+ }<br>
+ }<br>
+<br>
+ if (typeIdx >= 0)<br>
+ {<br>
+ if (!mergeLeftFlag)<br>
+ {<br>
+ if (typeIdx == SAO_BO)<br>
+ {<br>
+ memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0]));<br>
+<br>
+ for (int i = 0; i < SAO_NUM_OFFSET; i++)<br>
+ m_offsetBo[plane][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);<br>
+ }<br>
+ else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)<br>
+ {<br>
+ int offset[NUM_EDGETYPE];<br>
+ offset[0] = 0;<br>
+ for (int i = 0; i < SAO_NUM_OFFSET; i++)<br>
+ offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;<br>
+<br>
+ for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)<br>
+ m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]];<br>
+ }<br>
+ }<br>
+ processSaoCu(addr, typeIdx, plane);<br>
+ }<br>
+ std::swap(m_tmpL1[plane], m_tmpL2[plane]);<br>
+ }<br>
+}<br>
+<br>
/* Process SAO unit */<br>
void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX)<br>
{<br>
@@ -1361,6 +1434,157 @@<br>
}<br>
}<br>
<br>
+void SAO::rdoSaoUnitRow(SAOParam* saoParam, int rowBaseAddr, int startCol)<br>
+{<br>
+ double lambda[3] = {m_lumaLambda, m_chromaLambda, m_chromaLambda};<br>
+<br>
+ bool chroma = m_param->internalCsp != X265_CSP_I400;<br>
+ int planes = chroma ? 3 : 1;<br>
+ bool allowMerge[2] = {(startCol != 0), (rowBaseAddr != 0)}; // left, up<br>
+// int addrMerge[2] = {(startCol - 1), (rowBaseAddr ? startCol - m_numCuInWidth : -1)};// left, up<br>
+<br>
+ for(int idxX = startCol; idxX < m_numCuInWidth; idxX++)<br>
+ {<br>
+// X265_CHECK((idxX ? idxX - 1 : -1) == addrMerge[0], "addrMerge[0] check failure");<br>
+ const int addr = rowBaseAddr + idxX;<br>
+<br>
+ int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1)};// left, up<br>
+<br>
+ m_entropyCoder.load(m_rdContexts.cur);<br>
+ if (allowMerge[0])<br>
+ m_entropyCoder.codeSaoMerge(0);<br>
+ if (allowMerge[1])<br>
+ m_entropyCoder.codeSaoMerge(0);<br>
+ m_entropyCoder.store(m_rdContexts.temp);<br>
+<br>
+ // reset stats Y, Cb, Cr<br>
+ X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");<br>
+<br>
+ // TODO: Confirm the address space is continuous<br>
+ if (m_param->bSaoNonDeblocked)<br>
+ {<br>
+ memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));<br>
+ memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));<br>
+ }<br>
+ else<br>
+ {<br>
+ memset(m_count, 0, sizeof(m_count));<br>
+ memset(m_offsetOrg, 0, sizeof(m_offsetOrg));<br>
+ }<br>
+<br>
+ for (int i = 0; i < planes; i++)<br>
+ saoParam->ctuParam[i][addr].reset();<br>
+<br>
+ if (saoParam->bSaoFlag[0])<br>
+ {<br>
+ calcSaoStatsCu(addr, 0);<br>
+ saoStatsInitialOffset(0);<br>
+ }<br>
+<br>
+ if (saoParam->bSaoFlag[1])<br>
+ {<br>
+ calcSaoStatsCu(addr, 1);<br>
+ calcSaoStatsCu(addr, 2);<br>
+ saoStatsInitialOffset(1);<br>
+ // saoStatsInitialOffset(2);<br>
+ }<br>
+<br>
+ double mergeDist[NUM_MERGE_MODE] = { 0.0 };<br>
+ saoLumaComponentParamDist(saoParam, addr, mergeDist);<br>
+ if (chroma)<br>
+ saoChromaComponentParamDist(saoParam, addr, mergeDist);<br>
+<br>
+ if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])<br>
+ {<br>
+ // Cost of new SAO_params<br>
+ m_entropyCoder.load(m_rdContexts.cur);<br>
+ m_entropyCoder.resetBits();<br>
+ if (allowMerge[0])<br>
+ m_entropyCoder.codeSaoMerge(0);<br>
+ if (allowMerge[1])<br>
+ m_entropyCoder.codeSaoMerge(0);<br>
+ for (int plane = 0; plane < planes; plane++)<br>
+ {<br>
+ if (saoParam->bSaoFlag[plane > 0])<br>
+ m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);<br>
+ }<br>
+<br>
+ uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();<br>
+ double bestCost = mergeDist[0] + (double)rate;<br>
+ m_entropyCoder.store(m_rdContexts.temp);<br>
+<br>
+ // Cost of merge left or Up<br>
+ for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)<br>
+ {<br>
+ if (!allowMerge[mergeIdx])<br>
+ continue;<br>
+<br>
+ for (int plane = 0; plane < 3; plane++)<br>
+ {<br>
+ int64_t estDist = 0;<br>
+ SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);<br>
+ int typeIdx = mergeSrcParam->typeIdx;<br>
+ if (typeIdx >= 0)<br>
+ {<br>
+ int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;<br>
+ for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)<br>
+ {<br>
+ int mergeOffset = mergeSrcParam->offset[classIdx];<br>
+ estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]);<br>
+ }<br>
+ }<br>
+<br>
+ mergeDist[mergeIdx + 1] += ((double)estDist / lambda[plane]);<br>
+ }<br>
+<br>
+<br>
+ m_entropyCoder.load(m_rdContexts.cur);<br>
+ m_entropyCoder.resetBits();<br>
+ if (allowMerge[0])<br>
+ m_entropyCoder.codeSaoMerge(1 - mergeIdx);<br>
+ if (allowMerge[1] && (mergeIdx == 1))<br>
+ m_entropyCoder.codeSaoMerge(1);<br>
+<br>
+ rate = m_entropyCoder.getNumberOfWrittenBits();<br>
+ double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;<br>
+ if (mergeCost < bestCost)<br>
+ {<br>
+ SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;<br>
+ bestCost = mergeCost;<br>
+ m_entropyCoder.store(m_rdContexts.temp);<br>
+ for (int plane = 0; plane < planes; plane++)<br>
+ {<br>
+ if (saoParam->bSaoFlag[plane > 0])<br>
+ {<br>
+ SaoCtuParam* dstCtuParam = &saoParam->ctuParam[plane][addr];<br>
+ SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);<br>
+ dstCtuParam->mergeMode = mergeMode;<br>
+ dstCtuParam->typeIdx = mergeSrcParam->typeIdx;<br>
+ dstCtuParam->bandPos = mergeSrcParam->bandPos;<br>
+<br>
+ for (int i = 0; i < SAO_NUM_OFFSET; i++)<br>
+ dstCtuParam->offset[i] = mergeSrcParam->offset[i];<br>
+ }<br>
+ }<br>
+ }<br>
+ }<br>
+<br>
+ if (saoParam->ctuParam[0][addr].typeIdx < 0)<br>
+ m_numNoSao[0]++;<br>
+ if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)<br>
+ m_numNoSao[1]++;<br>
+ m_entropyCoder.load(m_rdContexts.temp);<br>
+ m_entropyCoder.store(m_rdContexts.cur);<br>
+ }<br>
+<br>
+ // Left merge still available after first CU<br>
+ allowMerge[0] = true;<br>
+<br>
+ // next CU address<br>
+ //addrMerge[0]++;<br>
+ //addrMerge[1] += (rowBaseAddr ? 1 : 0);<br>
+ }<br>
+}<br>
<br>
// Rounds the division of initial offsets by the number of samples in<br>
// each of the statistics table entries.<br>
diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/sao.h<br>
--- a/source/encoder/sao.h Mon Jan 25 14:59:50 2016 +0530<br>
+++ b/source/encoder/sao.h Thu Feb 04 13:29:38 2016 +0800<br>
@@ -132,7 +132,7 @@<br>
<br>
// CTU-based SAO process without slice granularity<br>
void processSaoCu(int addr, int typeIdx, int plane);<br>
- void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);<br>
+ void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int startX, int plane);<br>
void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);<br>
void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);<br>
<br>
@@ -147,7 +147,7 @@<br>
inline int estIterOffset(int typeIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,<br>
int& currentDistortionTableBo, double& currentRdCostTableBo);<br>
void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);<br>
-// void rdoSaoUnitRow(SAOParam* saoParam, int idxY);<br>
+ void rdoSaoUnitRow(SAOParam* saoParam, int rowBaseAddr, int startCol);<br>
void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);<br>
<br>
void saoStatsInitialOffset(int plane);<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>