[x265] [PATCH] Improved sao implementation by limiting sao types

ashok at multicorewareinc.com ashok at multicorewareinc.com
Fri Apr 7 12:34:43 CEST 2017


# HG changeset patch
# User Ashok Kumar Mishra <ashok at multicorewareinc.com>
# Date 1491215527 -19800
#      Mon Apr 03 16:02:07 2017 +0530
# Node ID 195ae8f499fc61bcdc6865cf7cffe7d0d7c486f0
# Parent  08a05ca9fd16c9f5efb1ce4d8389bda8a63f5f7d
Improved sao implementation by limiting sao types

diff -r 08a05ca9fd16 -r 195ae8f499fc doc/reST/cli.rst
--- a/doc/reST/cli.rst	Mon Mar 27 12:35:20 2017 +0530
+++ b/doc/reST/cli.rst	Mon Apr 03 16:02:07 2017 +0530
@@ -1690,6 +1690,12 @@
 	disabled, SAO analysis skips the right/bottom boundary areas.
 	Default disabled
 
+.. option:: --limit-sao, --no-limit-sao
+    Limit SAO filter computation by terminating the SAO process early, based
+    on the inter prediction mode, CTU spatial-domain correlations, and the
+    relation between luma and chroma.
+    Default disabled
+
 VUI (Video Usability Information) options
 =========================================
 
diff -r 08a05ca9fd16 -r 195ae8f499fc source/common/param.cpp
--- a/source/common/param.cpp	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/common/param.cpp	Mon Apr 03 16:02:07 2017 +0530
@@ -187,6 +187,7 @@
     /* SAO Loop Filter */
     param->bEnableSAO = 1;
     param->bSaoNonDeblocked = 0;
+    param->bLimitSAO = 0;
 
     /* Coding Quality */
     param->cbQpOffset = 0;
@@ -272,7 +273,6 @@
     param->bAQMotion = 0;
     param->bHDROpt = 0;
     param->analysisRefineLevel = 5;
-
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -949,6 +949,7 @@
         }
         OPT("hdr") p->bEmitHDRSEI = atobool(value);
         OPT("hdr-opt") p->bHDROpt = atobool(value);
+        OPT("limit-sao") p->bLimitSAO = atobool(value);
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1658,6 +1659,7 @@
     BOOL(p->bEmitHDRSEI, "hdr");
     BOOL(p->bHDROpt, "hdr-opt");
     s += sprintf(s, " refine-level=%d", p->analysisRefineLevel);
+    BOOL(p->bLimitSAO, "limit-sao");
 #undef BOOL
     return buf;
 }
diff -r 08a05ca9fd16 -r 195ae8f499fc source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/encoder/encoder.cpp	Mon Apr 03 16:02:07 2017 +0530
@@ -2109,6 +2109,7 @@
     /* some options make no sense if others are disabled */
     p->bSaoNonDeblocked &= p->bEnableSAO;
     p->bEnableTSkipFast &= p->bEnableTransformSkip;
+    p->bLimitSAO &= p->bEnableSAO;
 
     /* initialize the conformance window */
     m_conformanceWindow.bEnabled = false;
diff -r 08a05ca9fd16 -r 195ae8f499fc source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/encoder/sao.cpp	Mon Apr 03 16:02:07 2017 +0530
@@ -734,6 +734,7 @@
 /* Calculate SAO statistics for current CTU without non-crossing slice */
 void SAO::calcSaoStatsCTU(int addr, int plane)
 {
+    Slice* slice = m_frame->m_encData->m_slice;
     const PicYuv* reconPic = m_frame->m_reconPic;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
@@ -858,59 +859,63 @@
             primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
         }
 
-        // SAO_EO_2: // dir: 135
+        if (!m_param->bLimitSAO || ((slice->m_sliceType == P_SLICE && !cu->isSkipped(0)) || 
+            (slice->m_sliceType != B_SLICE)))
         {
-            if (m_param->bSaoNonDeblocked)
+            // SAO_EO_2: // dir: 135
             {
-                skipB = 4;
-                skipR = 5;
+                if (m_param->bSaoNonDeblocked)
+                {
+                    skipB = 4;
+                    skipR = 5;
+                }
+
+                fenc = fenc0;
+                rec  = rec0;
+
+                startX = !lpelx;
+                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
+
+                startY = bAboveUnavail;
+                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
+                if (startY)
+                {
+                    fenc += stride;
+                    rec += stride;
+                }
+
+                primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX));
+
+                primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
             }
 
-            fenc = fenc0;
-            rec  = rec0;
+            // SAO_EO_3: // dir: 45
+            {
+                if (m_param->bSaoNonDeblocked)
+                {
+                    skipB = 4;
+                    skipR = 5;
+                }
 
-            startX = !lpelx;
-            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
+                fenc = fenc0;
+                rec  = rec0;
 
-            startY = bAboveUnavail;
-            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
-            if (startY)
-            {
-                fenc += stride;
-                rec += stride;
+                startX = !lpelx;
+                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
+
+                startY = bAboveUnavail;
+                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
+
+                if (startY)
+                {
+                    fenc += stride;
+                    rec += stride;
+                }
+
+                primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
+
+                primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
             }
-
-            primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX));
-
-            primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
-        }
-
-        // SAO_EO_3: // dir: 45
-        {
-            if (m_param->bSaoNonDeblocked)
-            {
-                skipB = 4;
-                skipR = 5;
-            }
-
-            fenc = fenc0;
-            rec  = rec0;
-
-            startX = !lpelx;
-            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
-
-            startY = bAboveUnavail;
-            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
-
-            if (startY)
-            {
-                fenc += stride;
-                rec += stride;
-            }
-
-            primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
-
-            primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
         }
     }
 }
@@ -1224,7 +1229,6 @@
 void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
 {
     Slice* slice = m_frame->m_encData->m_slice;
-//    int qp = slice->m_sliceQp;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     int qp = cu->m_qp[0];
 
@@ -1263,17 +1267,6 @@
     for (int i = 0; i < planes; i++)
         saoParam->ctuParam[i][addr].reset();
 
-    if (saoParam->bSaoFlag[0])
-        calcSaoStatsCTU(addr, 0);
-
-    if (saoParam->bSaoFlag[1])
-    {
-        calcSaoStatsCTU(addr, 1);
-        calcSaoStatsCTU(addr, 2);
-    }
-
-    saoStatsInitialOffset(planes);
-
     // SAO distortion calculation
     m_entropyCoder.load(m_rdContexts.cur);
     m_entropyCoder.resetBits();
@@ -1283,13 +1276,44 @@
         m_entropyCoder.codeSaoMerge(0);
     m_entropyCoder.store(m_rdContexts.temp);
 
-    // Estimate distortion and cost of new SAO params
+    memset(m_offset, 0, sizeof(m_offset));
     int64_t bestCost = 0;
     int64_t rateDist = 0;
+
+    bool bAboveLeftAvail = true;
+    for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
+    {
+        if (!allowMerge[mergeIdx])
+            continue;
+
+        SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[0][addrMerge[mergeIdx]]);
+        bAboveLeftAvail = bAboveLeftAvail && (mergeSrcParam->typeIdx == -1);
+    }
+    // Don't apply SAO if the CTU is skipped or the adjacent (left/above) CTUs have SAO off
+    bool bSaoOff = (slice->m_sliceType == B_SLICE) && (cu->isSkipped(0) || bAboveLeftAvail);
+
     // Estimate distortion and cost of new SAO params
-    saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
-    if (chroma)
-        saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+    if (saoParam->bSaoFlag[0])
+    {
+        if (!m_param->bLimitSAO || !bSaoOff)
+        {
+            calcSaoStatsCTU(addr, 0);
+            saoStatsInitialOffset(addr, 0);
+            saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+        }
+    }
+
+    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
+    if (saoParam->bSaoFlag[1])
+    {
+        if (!m_param->bLimitSAO || ((lclCtuParam->typeIdx != -1) && !bSaoOff))
+        {
+            calcSaoStatsCTU(addr, 1);
+            calcSaoStatsCTU(addr, 2);
+            saoStatsInitialOffset(addr, 1);
+            saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
+        }
+    }
 
     if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
     {
@@ -1360,14 +1384,26 @@
 
 // Rounds the division of initial offsets by the number of samples in
 // each of the statistics table entries.
-void SAO::saoStatsInitialOffset(int planes)
+void SAO::saoStatsInitialOffset(int addr, int planes)
 {
-    memset(m_offset, 0, sizeof(m_offset));
+    Slice* slice = m_frame->m_encData->m_slice;
+    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
+
+    int maxSaoType;
+    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || 
+       (slice->m_sliceType == B_SLICE)))
+    {
+        maxSaoType = MAX_NUM_SAO_TYPE - 3;
+    }
+    else
+    {
+        maxSaoType = MAX_NUM_SAO_TYPE - 1;
+    }
 
     // EO
-    for (int plane = 0; plane < planes; plane++)
+    for (int plane = planes; plane <= planes * 2; plane++)
     {
-        for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
+        for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
         {
             for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
             {
@@ -1390,7 +1426,7 @@
     }
 
     // BO
-    for (int plane = 0; plane < planes; plane++)
+    for (int plane = planes; plane <= planes * 2; plane++)
     {
         for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
         {
@@ -1454,6 +1490,8 @@
 
 void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
 {
+    Slice* slice = m_frame->m_encData->m_slice;
+    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     int64_t bestDist = 0;
     int bestTypeIdx = -1;
 
@@ -1469,13 +1507,24 @@
 
     int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
 
+    int maxSaoType;
+    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || 
+        (slice->m_sliceType == B_SLICE)))
+    {
+        maxSaoType = MAX_NUM_SAO_TYPE - 3;
+    }
+    else
+    {
+        maxSaoType = MAX_NUM_SAO_TYPE - 1;
+    }
+
     //EO distortion calculation
-    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
+    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
     {
         int64_t estDist = 0;
         for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
         {
-            int32_t&  count     = m_count[0][typeIdx][classIdx];
+            int32_t&  count    = m_count[0][typeIdx][classIdx];
             int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
             int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
 
@@ -1571,6 +1620,8 @@
 
 void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
 {
+    Slice* slice = m_frame->m_encData->m_slice;
+    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     int64_t bestDist = 0;
     int bestTypeIdx = -1;
 
@@ -1587,8 +1638,19 @@
     uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
     int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]);
 
+    int maxSaoType;
+    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || 
+        (slice->m_sliceType == B_SLICE)))
+    {
+        maxSaoType = MAX_NUM_SAO_TYPE - 3;
+    }
+    else
+    {
+        maxSaoType = MAX_NUM_SAO_TYPE - 1;
+    }
+
     //EO RDO
-    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
+    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
     {
         int64_t estDist[2] = {0, 0};
         for (int compIdx = 1; compIdx < 3; compIdx++)
diff -r 08a05ca9fd16 -r 195ae8f499fc source/encoder/sao.h
--- a/source/encoder/sao.h	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/encoder/sao.h	Mon Apr 03 16:02:07 2017 +0530
@@ -134,7 +134,7 @@
     void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
     int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda);
 
-    void saoStatsInitialOffset(int planes);
+    void saoStatsInitialOffset(int addr, int planes);
 
     friend class FrameFilter;
 };
diff -r 08a05ca9fd16 -r 195ae8f499fc source/test/regression-tests.txt
--- a/source/test/regression-tests.txt	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/test/regression-tests.txt	Mon Apr 03 16:02:07 2017 +0530
@@ -45,6 +45,7 @@
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --no-psy-rd --ssim-rd
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --sao --limit-sao
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
diff -r 08a05ca9fd16 -r 195ae8f499fc source/test/smoke-tests.txt
--- a/source/test/smoke-tests.txt	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/test/smoke-tests.txt	Mon Apr 03 16:02:07 2017 +0530
@@ -19,6 +19,7 @@
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryslow --limit-ref 1 --limit-mode --tskip --limit-tu 1
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=superfast --bitrate 7000 --sao --limit-sao
 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
diff -r 08a05ca9fd16 -r 195ae8f499fc source/x265.h
--- a/source/x265.h	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/x265.h	Mon Apr 03 16:02:07 2017 +0530
@@ -963,7 +963,7 @@
     int       bEnableWeightedBiPred;
 
     /* Enable source pixels in motion estimation. Default is disabled */
-    int      bSourceReferenceEstimation;
+    int       bSourceReferenceEstimation;
 
     /*== Loop Filters ==*/
 
@@ -1347,7 +1347,7 @@
 
     /* This value represents the percentage difference between the inter cost and
     * intra cost of a frame used in scenecut detection. Default 5. */
-    double     scenecutBias;
+    double    scenecutBias;
 
     /* Use multiple worker threads dedicated to doing only lookahead instead of sharing
     * the worker threads with Frame Encoders. A dedicated lookahead threadpool is created with the
@@ -1357,16 +1357,16 @@
     int       lookaheadThreads;
 
     /* Optimize CU level QPs to signal consistent deltaQPs in frame for rd level > 4 */
-    int        bOptCUDeltaQP;
+    int       bOptCUDeltaQP;
 
     /* Refine analysis in multipass ratecontrol based on analysis information stored */
-    int         analysisMultiPassRefine;
+    int       analysisMultiPassRefine;
 
     /* Refine analysis in multipass ratecontrol based on distortion data stored */
-    int         analysisMultiPassDistortion;
+    int       analysisMultiPassDistortion;
 
     /* Adaptive Quantization based on relative motion */
-    int        bAQMotion;
+    int       bAQMotion;
 
     /* SSIM based RDO, based on residual divisive normalization scheme. Used for mode
     * selection during analysis of CTUs, can achieve significant gain in terms of 
@@ -1390,6 +1390,11 @@
     * level higher the informtion stored/reused. Default is 5 */
     int       analysisRefineLevel;
 
+    /* Limit Sample Adaptive Offset filter computation by terminating the SAO
+     * process early, based on inter prediction mode, CTU spatial-domain
+     * correlations, and relations between luma and chroma. Default is disabled */
+    int       bLimitSAO;
+
 } x265_param;
 
 /* x265_param_alloc:
diff -r 08a05ca9fd16 -r 195ae8f499fc source/x265cli.h
--- a/source/x265cli.h	Mon Mar 27 12:35:20 2017 +0530
+++ b/source/x265cli.h	Mon Apr 03 16:02:07 2017 +0530
@@ -266,6 +266,8 @@
     { "no-hdr",               no_argument, NULL, 0 },
     { "hdr-opt",              no_argument, NULL, 0 },
     { "no-hdr-opt",           no_argument, NULL, 0 },
+    { "limit-sao",            no_argument, NULL, 0 },
+    { "no-limit-sao",         no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -454,6 +456,7 @@
     H0("   --[no-]deblock                Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
     H0("   --[no-]sao                    Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
     H1("   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
+    H0("   --[no-]limit-sao              Limit Sample Adaptive Offset types. Default %s\n", OPT(param->bLimitSAO));
     H0("\nVUI options:\n");
     H0("   --sar <width:height|int>      Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
     H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");


More information about the x265-devel mailing list