[x265] [PATCH 06 of 15] sao: merge tmpU1 and tmpU2 into tmpU, and copy the above reference pixels in each row-based thread

Min Chen chenm003 at 163.com
Wed Dec 2 18:28:29 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449076356 21600
# Node ID 1c6f6e627722c767bb9484064a1cea6286c62103
# Parent  eb20b66eebe7e9de04cec0f98f1c3c43e678fcf5
sao: merge tmpU1 and tmpU2 into tmpU, and copy the above reference pixels in each row-based thread
---
 source/encoder/frameencoder.cpp |    7 ++++++
 source/encoder/framefilter.cpp  |   44 +++++++++++++++++++++++++++-----------
 source/encoder/framefilter.h    |    3 ++
 source/encoder/sao.cpp          |   39 +++++++++------------------------
 source/encoder/sao.h            |    3 +-
 5 files changed, 53 insertions(+), 43 deletions(-)

diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Wed Dec 02 11:12:32 2015 -0600
+++ b/source/encoder/frameencoder.cpp	Wed Dec 02 11:12:36 2015 -0600
@@ -1124,6 +1124,13 @@
             }
         }
 
+        /* Case of DEBLOCK Disable and SAO Enable */
+        if (!m_param->bEnableLoopFilter && m_param->bEnableSAO)
+        {
+            PicYuv* reconPic = curEncData.m_reconPic;
+            m_frameFilter.m_parallelFilter[row].copySaoAboveRef(reconPic, cuAddr, col);
+        }
+
         if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
             (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
         {
diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Wed Dec 02 11:12:32 2015 -0600
+++ b/source/encoder/framefilter.cpp	Wed Dec 02 11:12:36 2015 -0600
@@ -69,7 +69,7 @@
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
 
-    if (m_param->bEnableLoopFilter)
+    if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
         m_parallelFilter = new ParallelFilter[numRows];
 
     if (m_parallelFilter)
@@ -91,6 +91,7 @@
 
         for(int row = 0; row < numRows; row++)
         {
+            m_parallelFilter[row].m_param = m_param;
             m_parallelFilter[row].m_rowAddr = row * numCols;
             m_parallelFilter[row].m_frameEncoder = m_frameEncoder;
         }
@@ -117,17 +118,39 @@
             m_parallelFilter[row].m_encData = frame->m_encData;
         }
 
-        // Reset SAO global/common statistics
+        // Reset SAO common statistics
         if (m_param->bEnableSAO)
             m_parallelFilter[0].m_sao.resetStats();
     }
 }
 
+void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
+{
+    // Copy SAO top reference pixels: one CTU-wide run per plane, read one stride before getPlaneAddr(plane, cuAddr) (no offset when m_rowAddr == 0)
+    int ctuWidth  = g_maxCUSize;
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
+
+    // Luma (validate the destination range before writing, not after)
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
+
+    // Chroma (U and V use the CTU width reduced by m_hChromaShift)
+    ctuWidth  >>= m_sao.m_hChromaShift;
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+
+    const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+    const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+
+    memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
+    memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
+}
+
 // NOTE: Single Threading only
 void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
 {
     const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
     const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
+    PicYuv* reconPic = m_encData->m_reconPic;
     const int colStart = m_lastCol.get();
     // TODO: Waiting previous row finish or simple clip on it?
     const int colEnd = m_allowedCol.get();
@@ -146,6 +169,9 @@
         {
             const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
             deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
+
+            if (m_param->bEnableSAO)
+                copySaoAboveRef(reconPic, cuAddr - 1, col - 1);
         }
         m_lastCol.incr();
     }
@@ -155,6 +181,9 @@
         const uint32_t cuAddr = m_rowAddr + numCols - 1;
         const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
         deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
+
+        if (m_param->bEnableSAO)
+            copySaoAboveRef(reconPic, cuAddr, numCols - 1);
     }
 }
 
@@ -507,23 +536,12 @@
     SAOParam* saoParam = encData.m_saoParam;
 
     if (saoParam->bSaoFlag[0])
-    {
         m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
-        if (row != m_numRows - 1)
-        {
-            memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[0], m_parallelFilter[row].m_sao.m_tmpU1[0], sizeof(pixel) * m_param->sourceWidth);
-        }
-    }
 
     if (saoParam->bSaoFlag[1])
     {
         m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
         m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
-        if (row != m_numRows - 1)
-        {
-            memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[1], m_parallelFilter[row].m_sao.m_tmpU1[1], sizeof(pixel) * m_param->sourceWidth);
-            memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[2], m_parallelFilter[row].m_sao.m_tmpU1[2], sizeof(pixel) * m_param->sourceWidth);
-        }
     }
 
     if (encData.m_slice->m_pps->bTransquantBypassEnabled)
diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/framefilter.h
--- a/source/encoder/framefilter.h	Wed Dec 02 11:12:32 2015 -0600
+++ b/source/encoder/framefilter.h	Wed Dec 02 11:12:36 2015 -0600
@@ -63,6 +63,7 @@
     public:
         static uint32_t     numCols;
         uint32_t            m_rowAddr;
+        x265_param*         m_param;
         FrameEncoder*       m_frameEncoder;
         FrameData*          m_encData;
         SAO                 m_sao;
@@ -71,6 +72,7 @@
 
         ParallelFilter()
             : m_rowAddr(0)
+            , m_param(NULL)
             , m_frameEncoder(NULL)
             , m_encData(NULL)
         {
@@ -80,6 +82,7 @@
         { }
 
         void processTasks(int workerThreadId);
+        void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
 
     protected:
 
diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Wed Dec 02 11:12:32 2015 -0600
+++ b/source/encoder/sao.cpp	Wed Dec 02 11:12:36 2015 -0600
@@ -84,12 +84,9 @@
     m_param = NULL;
     m_clipTable = NULL;
     m_clipTableBase = NULL;
-    m_tmpU1[0] = NULL;
-    m_tmpU1[1] = NULL;
-    m_tmpU1[2] = NULL;
-    m_tmpU2[0] = NULL;
-    m_tmpU2[1] = NULL;
-    m_tmpU2[2] = NULL;
+    m_tmpU[0] = NULL;
+    m_tmpU[1] = NULL;
+    m_tmpU[2] = NULL;
     m_tmpL1 = NULL;
     m_tmpL2 = NULL;
 
@@ -125,10 +122,9 @@
     for (int i = 0; i < 3; i++)
     {
         // SAO asm code will read 1 pixel before and after, so pad by 2
-        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU1[i] += 1;
-        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU2[i] += 1;
+        // NOTE: m_param->sourceWidth + 2 would be enough, but over-allocate (by up to 63 pixels) so copySaoAboveRef() needs no boundary check
+        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2);
+        m_tmpU[i] += 1;
     }
 
     if (initCommon)
@@ -191,15 +187,10 @@
 
     for (int i = 0; i < 3; i++)
     {
-        if (m_tmpU1[i])
+        if (m_tmpU[i])
         {
-            X265_FREE(m_tmpU1[i] - 1);
-            m_tmpU1[i] = NULL;
-        }
-        if (m_tmpU2[i])
-        {
-            X265_FREE(m_tmpU2[i] - 1);
-            m_tmpU2[i] = NULL;
+            X265_FREE(m_tmpU[i] - 1);
+            m_tmpU[i] = NULL;
         }
     }
 
@@ -325,7 +316,7 @@
         }
 
         tmpL = m_tmpL1;
-        tmpU = &(m_tmpU1[plane][lpelx]);
+        tmpU = &(m_tmpU[plane][lpelx]);
     }
 
     switch (typeIdx)
@@ -615,6 +606,7 @@
     uint32_t picWidth  = m_param->sourceWidth;
     int ctuWidth  = g_maxCUSize;
     int ctuHeight = g_maxCUSize;
+
     if (plane)
     {
         picWidth  >>= m_hChromaShift;
@@ -625,11 +617,6 @@
     int addr = idxY * m_numCuInWidth;
     pixel* rec = reconPic->getPlaneAddr(plane, addr);
 
-    if (!idxY)
-    {
-        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
-    }
-
     for (int i = 0; i < ctuHeight + 1; i++)
     {
         m_tmpL1[i] = rec[0];
@@ -638,8 +625,6 @@
 
     rec -= (stride << 1);
 
-    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
-
     for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
     {
         addr = idxY * m_numCuInWidth + idxX;
@@ -682,8 +667,6 @@
             }
         }
     }
-
-    std::swap(m_tmpU1[plane], m_tmpU2[plane]);
 }
 
 void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/sao.h
--- a/source/encoder/sao.h	Wed Dec 02 11:12:32 2015 -0600
+++ b/source/encoder/sao.h	Wed Dec 02 11:12:36 2015 -0600
@@ -92,8 +92,7 @@
     pixel*      m_clipTable;
     pixel*      m_clipTableBase;
 
-    pixel*      m_tmpU1[3];
-    pixel*      m_tmpU2[3];
+    pixel*      m_tmpU[3];
     pixel*      m_tmpL1;
     pixel*      m_tmpL2;
 



More information about the x265-devel mailing list