[x265] [PATCH] Improved sao implementation by limiting sao types
Pradeep Ramachandran
pradeep at multicorewareinc.com
Mon Apr 10 08:10:15 CEST 2017
On Fri, Apr 7, 2017 at 4:04 PM, <ashok at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Ashok Kumar Mishra <ashok at multicorewareinc.com>
> # Date 1491215527 -19800
> # Mon Apr 03 16:02:07 2017 +0530
> # Node ID 195ae8f499fc61bcdc6865cf7cffe7d0d7c486f0
> # Parent 08a05ca9fd16c9f5efb1ce4d8389bda8a63f5f7d
> Improved sao implementation by limiting sao types
>
Looks good. The X265_BUILD number has to be incremented because x265_param
has been modified, but I can absorb this in my push.
Do you have any performance numbers that show the impact of
limit-sao?
>
> diff -r 08a05ca9fd16 -r 195ae8f499fc doc/reST/cli.rst
> --- a/doc/reST/cli.rst Mon Mar 27 12:35:20 2017 +0530
> +++ b/doc/reST/cli.rst Mon Apr 03 16:02:07 2017 +0530
> @@ -1690,6 +1690,12 @@
> disabled, SAO analysis skips the right/bottom boundary areas.
> Default disabled
>
> +.. option:: --limit-sao, --no-limit-sao
> + Limit SAO filter computation by terminating the SAO process early, based
> + on inter prediction mode, CTU spatial-domain correlations, and relations
> + between luma and chroma.
> + Default disabled
> +
> VUI (Video Usability Information) options
> =========================================
>
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/common/param.cpp
> --- a/source/common/param.cpp Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/common/param.cpp Mon Apr 03 16:02:07 2017 +0530
> @@ -187,6 +187,7 @@
> /* SAO Loop Filter */
> param->bEnableSAO = 1;
> param->bSaoNonDeblocked = 0;
> + param->bLimitSAO = 0;
>
> /* Coding Quality */
> param->cbQpOffset = 0;
> @@ -272,7 +273,6 @@
> param->bAQMotion = 0;
> param->bHDROpt = 0;
> param->analysisRefineLevel = 5;
> -
> }
>
> int x265_param_default_preset(x265_param* param, const char* preset,
> const char* tune)
> @@ -949,6 +949,7 @@
> }
> OPT("hdr") p->bEmitHDRSEI = atobool(value);
> OPT("hdr-opt") p->bHDROpt = atobool(value);
> + OPT("limit-sao") p->bLimitSAO = atobool(value);
> else
> return X265_PARAM_BAD_NAME;
> }
> @@ -1658,6 +1659,7 @@
> BOOL(p->bEmitHDRSEI, "hdr");
> BOOL(p->bHDROpt, "hdr-opt");
> s += sprintf(s, " refine-level=%d", p->analysisRefineLevel);
> + BOOL(p->bLimitSAO, "limit-sao");
> #undef BOOL
> return buf;
> }
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/encoder/encoder.cpp Mon Apr 03 16:02:07 2017 +0530
> @@ -2109,6 +2109,7 @@
> /* some options make no sense if others are disabled */
> p->bSaoNonDeblocked &= p->bEnableSAO;
> p->bEnableTSkipFast &= p->bEnableTransformSkip;
> + p->bLimitSAO &= p->bEnableSAO;
>
> /* initialize the conformance window */
> m_conformanceWindow.bEnabled = false;
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/encoder/sao.cpp Mon Apr 03 16:02:07 2017 +0530
> @@ -734,6 +734,7 @@
> /* Calculate SAO statistics for current CTU without non-crossing slice */
> void SAO::calcSaoStatsCTU(int addr, int plane)
> {
> + Slice* slice = m_frame->m_encData->m_slice;
> const PicYuv* reconPic = m_frame->m_reconPic;
> const CUData* cu = m_frame->m_encData->getPicCTU(addr);
> const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
> @@ -858,59 +859,63 @@
> primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 +
> startY * stride, stride, upBuff1, endX, endY - startY,
> m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
> }
>
> - // SAO_EO_2: // dir: 135
> + if (!m_param->bLimitSAO || ((slice->m_sliceType == P_SLICE &&
> !cu->isSkipped(0)) ||
> + (slice->m_sliceType != B_SLICE)))
> {
> - if (m_param->bSaoNonDeblocked)
> + // SAO_EO_2: // dir: 135
> {
> - skipB = 4;
> - skipR = 5;
> + if (m_param->bSaoNonDeblocked)
> + {
> + skipB = 4;
> + skipR = 5;
> + }
> +
> + fenc = fenc0;
> + rec = rec0;
> +
> + startX = !lpelx;
> + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth -
> skipR + plane_offset;
> +
> + startY = bAboveUnavail;
> + endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight
> - skipB + plane_offset;
> + if (startY)
> + {
> + fenc += stride;
> + rec += stride;
> + }
> +
> + primitives.sign(upBuff1, &rec[startX], &rec[startX -
> stride - 1], (endX - startX));
> +
> + primitives.saoCuStatsE2(diff + startX + startY *
> MAX_CU_SIZE, rec0 + startX + startY * stride, stride, upBuff1, upBufft,
> endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2],
> m_count[plane][SAO_EO_2]);
> }
>
> - fenc = fenc0;
> - rec = rec0;
> + // SAO_EO_3: // dir: 45
> + {
> + if (m_param->bSaoNonDeblocked)
> + {
> + skipB = 4;
> + skipR = 5;
> + }
>
> - startX = !lpelx;
> - endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth -
> skipR + plane_offset;
> + fenc = fenc0;
> + rec = rec0;
>
> - startY = bAboveUnavail;
> - endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight -
> skipB + plane_offset;
> - if (startY)
> - {
> - fenc += stride;
> - rec += stride;
> + startX = !lpelx;
> + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth -
> skipR + plane_offset;
> +
> + startY = bAboveUnavail;
> + endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight
> - skipB + plane_offset;
> +
> + if (startY)
> + {
> + fenc += stride;
> + rec += stride;
> + }
> +
> + primitives.sign(upBuff1, &rec[startX - 1], &rec[startX -
> 1 - stride + 1], (endX - startX + 1));
> +
> + primitives.saoCuStatsE3(diff + startX + startY *
> MAX_CU_SIZE, rec0 + startX + startY * stride, stride, upBuff1 + 1, endX -
> startX, endY - startY, m_offsetOrg[plane][SAO_EO_3],
> m_count[plane][SAO_EO_3]);
> }
> -
> - primitives.sign(upBuff1, &rec[startX], &rec[startX - stride -
> 1], (endX - startX));
> -
> - primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE,
> rec0 + startX + startY * stride, stride, upBuff1, upBufft, endX - startX,
> endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
> - }
> -
> - // SAO_EO_3: // dir: 45
> - {
> - if (m_param->bSaoNonDeblocked)
> - {
> - skipB = 4;
> - skipR = 5;
> - }
> -
> - fenc = fenc0;
> - rec = rec0;
> -
> - startX = !lpelx;
> - endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth -
> skipR + plane_offset;
> -
> - startY = bAboveUnavail;
> - endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight -
> skipB + plane_offset;
> -
> - if (startY)
> - {
> - fenc += stride;
> - rec += stride;
> - }
> -
> - primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 -
> stride + 1], (endX - startX + 1));
> -
> - primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE,
> rec0 + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY
> - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
> }
> }
> }
> @@ -1224,7 +1229,6 @@
> void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int
> addr)
> {
> Slice* slice = m_frame->m_encData->m_slice;
> -// int qp = slice->m_sliceQp;
> const CUData* cu = m_frame->m_encData->getPicCTU(addr);
> int qp = cu->m_qp[0];
>
> @@ -1263,17 +1267,6 @@
> for (int i = 0; i < planes; i++)
> saoParam->ctuParam[i][addr].reset();
>
> - if (saoParam->bSaoFlag[0])
> - calcSaoStatsCTU(addr, 0);
> -
> - if (saoParam->bSaoFlag[1])
> - {
> - calcSaoStatsCTU(addr, 1);
> - calcSaoStatsCTU(addr, 2);
> - }
> -
> - saoStatsInitialOffset(planes);
> -
> // SAO distortion calculation
> m_entropyCoder.load(m_rdContexts.cur);
> m_entropyCoder.resetBits();
> @@ -1283,13 +1276,44 @@
> m_entropyCoder.codeSaoMerge(0);
> m_entropyCoder.store(m_rdContexts.temp);
>
> - // Estimate distortion and cost of new SAO params
> + memset(m_offset, 0, sizeof(m_offset));
> int64_t bestCost = 0;
> int64_t rateDist = 0;
> +
> + bool bAboveLeftAvail = true;
> + for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
> + {
> + if (!allowMerge[mergeIdx])
> + continue;
> +
> + SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[0][
> addrMerge[mergeIdx]]);
> + bAboveLeftAvail = bAboveLeftAvail && (mergeSrcParam->typeIdx ==
> -1);
> + }
> + // Don't apply sao if ctu is skipped or adjacent ctus are sao off
> + bool bSaoOff = (slice->m_sliceType == B_SLICE) && (cu->isSkipped(0)
> || bAboveLeftAvail);
> +
> // Estimate distortion and cost of new SAO params
> - saoLumaComponentParamDist(saoParam, addr, rateDist, lambda,
> bestCost);
> - if (chroma)
> - saoChromaComponentParamDist(saoParam, addr, rateDist, lambda,
> bestCost);
> + if (saoParam->bSaoFlag[0])
> + {
> + if (!m_param->bLimitSAO || !bSaoOff)
> + {
> + calcSaoStatsCTU(addr, 0);
> + saoStatsInitialOffset(addr, 0);
> + saoLumaComponentParamDist(saoParam, addr, rateDist, lambda,
> bestCost);
> + }
> + }
> +
> + SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
> + if (saoParam->bSaoFlag[1])
> + {
> + if (!m_param->bLimitSAO || ((lclCtuParam->typeIdx != -1) &&
> !bSaoOff))
> + {
> + calcSaoStatsCTU(addr, 1);
> + calcSaoStatsCTU(addr, 2);
> + saoStatsInitialOffset(addr, 1);
> + saoChromaComponentParamDist(saoParam, addr, rateDist,
> lambda, bestCost);
> + }
> + }
>
> if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
> {
> @@ -1360,14 +1384,26 @@
>
> // Rounds the division of initial offsets by the number of samples in
> // each of the statistics table entries.
> -void SAO::saoStatsInitialOffset(int planes)
> +void SAO::saoStatsInitialOffset(int addr, int planes)
> {
> - memset(m_offset, 0, sizeof(m_offset));
> + Slice* slice = m_frame->m_encData->m_slice;
> + const CUData* cu = m_frame->m_encData->getPicCTU(addr);
> +
> + int maxSaoType;
> + if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE &&
> cu->isSkipped(0)) ||
> + (slice->m_sliceType == B_SLICE)))
> + {
> + maxSaoType = MAX_NUM_SAO_TYPE - 3;
> + }
> + else
> + {
> + maxSaoType = MAX_NUM_SAO_TYPE - 1;
> + }
>
> // EO
> - for (int plane = 0; plane < planes; plane++)
> + for (int plane = planes; plane <= planes * 2; plane++)
> {
> - for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
> + for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
> {
> for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1;
> classIdx++)
> {
> @@ -1390,7 +1426,7 @@
> }
>
> // BO
> - for (int plane = 0; plane < planes; plane++)
> + for (int plane = planes; plane <= planes * 2; plane++)
> {
> for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
> {
> @@ -1454,6 +1490,8 @@
>
> void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr,
> int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
> {
> + Slice* slice = m_frame->m_encData->m_slice;
> + const CUData* cu = m_frame->m_encData->getPicCTU(addr);
> int64_t bestDist = 0;
> int bestTypeIdx = -1;
>
> @@ -1469,13 +1507,24 @@
>
> int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(),
> lambda[0]);
>
> + int maxSaoType;
> + if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE &&
> cu->isSkipped(0)) ||
> + (slice->m_sliceType == B_SLICE)))
> + {
> + maxSaoType = MAX_NUM_SAO_TYPE - 3;
> + }
> + else
> + {
> + maxSaoType = MAX_NUM_SAO_TYPE - 1;
> + }
> +
> //EO distortion calculation
> - for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
> + for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
> {
> int64_t estDist = 0;
> for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
> {
> - int32_t& count = m_count[0][typeIdx][classIdx];
> + int32_t& count = m_count[0][typeIdx][classIdx];
> int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
> int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
>
> @@ -1571,6 +1620,8 @@
>
> void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr,
> int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
> {
> + Slice* slice = m_frame->m_encData->m_slice;
> + const CUData* cu = m_frame->m_encData->getPicCTU(addr);
> int64_t bestDist = 0;
> int bestTypeIdx = -1;
>
> @@ -1587,8 +1638,19 @@
> uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
> int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]);
>
> + int maxSaoType;
> + if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE &&
> cu->isSkipped(0)) ||
> + (slice->m_sliceType == B_SLICE)))
> + {
> + maxSaoType = MAX_NUM_SAO_TYPE - 3;
> + }
> + else
> + {
> + maxSaoType = MAX_NUM_SAO_TYPE - 1;
> + }
> +
> //EO RDO
> - for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
> + for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
> {
> int64_t estDist[2] = {0, 0};
> for (int compIdx = 1; compIdx < 3; compIdx++)
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/encoder/sao.h
> --- a/source/encoder/sao.h Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/encoder/sao.h Mon Apr 03 16:02:07 2017 +0530
> @@ -134,7 +134,7 @@
> void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int
> addr);
> int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t
> lambda);
>
> - void saoStatsInitialOffset(int planes);
> + void saoStatsInitialOffset(int addr, int planes);
>
> friend class FrameFilter;
> };
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/test/regression-tests.txt
> --- a/source/test/regression-tests.txt Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/test/regression-tests.txt Mon Apr 03 16:02:07 2017 +0530
> @@ -45,6 +45,7 @@
> CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip
> --tskip-fast --no-scenecut --limit-tu 1
> CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3
> --aq-strength 1.5 --aq-motion --bitrate 5000
> CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3
> --aq-strength 1.5 --no-psy-rd --ssim-rd
> +CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --sao
> --limit-sao
> DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp
> --qg-size 16
> DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr
> --bframes 16 --limit-modes
> DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers
> --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/test/smoke-tests.txt
> --- a/source/test/smoke-tests.txt Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/test/smoke-tests.txt Mon Apr 03 16:02:07 2017 +0530
> @@ -19,6 +19,7 @@
> DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
> DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb
> --interlace bff
> DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryslow --limit-ref 1
> --limit-mode --tskip --limit-tu 1
> +CrowdRun_1920x1080_50_10bit_444.yuv,--preset=superfast --bitrate 7000
> --sao --limit-sao
>
> # Main12 intraCost overflow bug test
> 720p50_parkrun_ter.y4m,--preset medium
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/x265.h
> --- a/source/x265.h Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/x265.h Mon Apr 03 16:02:07 2017 +0530
> @@ -963,7 +963,7 @@
> int bEnableWeightedBiPred;
>
> /* Enable source pixels in motion estimation. Default is disabled */
> - int bSourceReferenceEstimation;
> + int bSourceReferenceEstimation;
>
> /*== Loop Filters ==*/
>
> @@ -1347,7 +1347,7 @@
>
> /* This value represents the percentage difference between the inter
> cost and
> * intra cost of a frame used in scenecut detection. Default 5. */
> - double scenecutBias;
> + double scenecutBias;
>
> /* Use multiple worker threads dedicated to doing only lookahead
> instead of sharing
> * the worker threads with Frame Encoders. A dedicated lookahead
> threadpool is created with the
> @@ -1357,16 +1357,16 @@
> int lookaheadThreads;
>
> /* Optimize CU level QPs to signal consistent deltaQPs in frame for
> rd level > 4 */
> - int bOptCUDeltaQP;
> + int bOptCUDeltaQP;
>
> /* Refine analysis in multipass ratecontrol based on analysis
> information stored */
> - int analysisMultiPassRefine;
> + int analysisMultiPassRefine;
>
> /* Refine analysis in multipass ratecontrol based on distortion data
> stored */
> - int analysisMultiPassDistortion;
> + int analysisMultiPassDistortion;
>
> /* Adaptive Quantization based on relative motion */
> - int bAQMotion;
> + int bAQMotion;
>
> /* SSIM based RDO, based on residual divisive normalization scheme.
> Used for mode
> * selection during analysis of CTUs, can achieve significant gain in
> terms of
> @@ -1390,6 +1390,11 @@
> * level higher the informtion stored/reused. Default is 5 */
> int analysisRefineLevel;
>
> + /* Limit Sample Adaptive Offset filter computation by terminating the SAO
> + * process early, based on inter prediction mode, CTU spatial-domain correlations,
> + * and relations between luma and chroma */
> + int bLimitSAO;
> +
> } x265_param;
>
> /* x265_param_alloc:
> diff -r 08a05ca9fd16 -r 195ae8f499fc source/x265cli.h
> --- a/source/x265cli.h Mon Mar 27 12:35:20 2017 +0530
> +++ b/source/x265cli.h Mon Apr 03 16:02:07 2017 +0530
> @@ -266,6 +266,8 @@
> { "no-hdr", no_argument, NULL, 0 },
> { "hdr-opt", no_argument, NULL, 0 },
> { "no-hdr-opt", no_argument, NULL, 0 },
> + { "limit-sao", no_argument, NULL, 0 },
> + { "no-limit-sao", no_argument, NULL, 0 },
> { 0, 0, 0, 0 },
> { 0, 0, 0, 0 },
> { 0, 0, 0, 0 },
> @@ -454,6 +456,7 @@
> H0(" --[no-]deblock Enable Deblocking Loop Filter,
> optionally specify tC:Beta offsets Default %s\n",
> OPT(param->bEnableLoopFilter));
> H0(" --[no-]sao Enable Sample Adaptive Offset.
> Default %s\n", OPT(param->bEnableSAO));
> H1(" --[no-]sao-non-deblock Use non-deblocked pixels, else
> right/bottom boundary areas skipped. Default %s\n",
> OPT(param->bSaoNonDeblocked));
> + H0(" --[no-]limit-sao Limit Sample Adaptive Offset
> types. Default %s\n", OPT(param->bLimitSAO));
> H0("\nVUI options:\n");
> H0(" --sar <width:height|int> Sample Aspect Ratio, the ratio
> of width to height of an individual pixel.\n");
> H0(" Choose from 0=undef,
> 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20170410/b4c6a46c/attachment-0001.html>
More information about the x265-devel
mailing list