[x265] [PATCH] analysis: avoid redundant rect/amp mode analysis based on split block rdCost and mvCost for rd-0/4

Ashok Kumar Mishra ashok at multicorewareinc.com
Thu Oct 15 17:06:07 CEST 2015


Below are performance test results on Haswell using the slow preset, with
and without limiting rect/amp mode analysis (--limit-rect-amp).

*Before*
D:\ashok>x265_b.exe --input
\\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow
--hash=1 --no-info --psnr --ssim -o test_b.hevc --bitrate 6000
encoded 500 frames in 157.90s (3.17 fps), 6060.98 kb/s, Avg QP:40.41,
Global PSNR: 29.485, SSIM Mean Y: 0.7780801 ( 6.538 dB)

*After*
D:\ashok>x265.exe --input
\\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow
--hash=1 --no-info --psnr --ssim -o test.hevc --bitrate 6000
--limit-rect-amp 1
encoded 500 frames in 148.53s (3.37 fps), 6062.77 kb/s, Avg QP:40.43,
Global PSNR: 29.487, SSIM Mean Y: 0.7780540 ( 6.538 dB)

D:\ashok>x265.exe --input
\\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow
--hash=1 --no-info --psnr --ssim -o test.hevc --bitrate 6000 --limit-refs 1
encoded 500 frames in 136.84s (3.65 fps), 6061.15 kb/s, Avg QP:40.42,
Global PSNR: 29.480, SSIM Mean Y: 0.7778692 ( 6.534 dB)

D:\ashok>x265.exe --input
\\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow
--hash=1 --no-info --psnr --ssim -o test.hevc --bitrate 6000 --limit-refs 1
--limit-rect-amp 1
encoded 500 frames in 133.06s (3.76 fps), 6062.52 kb/s, Avg QP:40.43,
Global PSNR: 29.481, SSIM Mean Y: 0.7779036 ( 6.535 dB)

On Thu, Oct 15, 2015 at 8:31 PM, <ashok at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Ashok Kumar Mishra<ashok at multicorewareinc.com>
> # Date 1444897694 -19800
> #      Thu Oct 15 13:58:14 2015 +0530
> # Node ID 65d7c1f5baf5fa619d773fcc2e1361d46f6df7f1
> # Parent  f3963e7e75b8dcb599250c082357e08fd32191a5
> analysis: avoid redundant rect/amp mode analysis based on split block
> rdCost and mvCost for rd-0/4
>
> diff -r f3963e7e75b8 -r 65d7c1f5baf5 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp       Wed Oct 14 17:44:33 2015 +0530
> +++ b/source/encoder/analysis.cpp       Thu Oct 15 13:58:14 2015 +0530
> @@ -809,7 +809,7 @@
>      return refMask;
>  }
>
> -uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const
> CUGeom& cuGeom, int32_t qp)
> +SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const
> CUGeom& cuGeom, int32_t qp)
>  {
>      uint32_t depth = cuGeom.depth;
>      uint32_t cuAddr = parentCTU.m_cuAddr;
> @@ -823,7 +823,13 @@
>      uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
>      bool earlyskip = false;
>      bool splitIntra = true;
> -    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
> +
> +    SplitData splitData[4];
> +    splitData[0].initSplitCUData();
> +    splitData[1].initSplitCUData();
> +    splitData[2].initSplitCUData();
> +    splitData[3].initSplitCUData();
> +
>      /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
>      if (mightNotSplit && depth >= minDepth)
>      {
> @@ -869,7 +875,7 @@
>                  if (m_slice->m_pps->bUseDQP && nextDepth <=
> m_slice->m_pps->maxCuDQPDepth)
>                      nextQP = setLambdaFromQP(parentCTU,
> calculateQpforCuSize(parentCTU, childGeom));
>
> -                splitRefs[subPartIdx] = compressInterCU_rd0_4(parentCTU,
> childGeom, nextQP);
> +                splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU,
> childGeom, nextQP);
>
>                  // Save best CU and pred data for this sub CU
>                  splitIntra |= nd.bestMode->cu.isIntra(0);
> @@ -899,7 +905,7 @@
>      /* Split CUs
>       *   0  1
>       *   2  3 */
> -    uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] |
> splitRefs[3];
> +    uint32_t allSplitRefs = splitData[0].splitRefs |
> splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
>      /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current
> depth */
>      if (mightNotSplit && depth >= minDepth)
>      {
> @@ -917,7 +923,7 @@
>              {
>                  CUData& cu = md.pred[PRED_2Nx2N].cu;
>                  uint32_t refMask = cu.getBestRefIdx(0);
> -                allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2]
> = splitRefs[3] = refMask;
> +                allSplitRefs = splitData[0].splitRefs =
> splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs =
> refMask;
>              }
>
>              if (m_slice->m_sliceType == B_SLICE)
> @@ -929,23 +935,82 @@
>              Mode *bestInter = &md.pred[PRED_2Nx2N];
>              if (m_param->bEnableRectInter)
>              {
> -                refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
> -                refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
> -                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
> -                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N,
> refMasks);
> -                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
> -                    bestInter = &md.pred[PRED_Nx2N];
> -
> -                refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
> -                refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
> -                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
> -                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN,
> refMasks);
> -                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
> -                    bestInter = &md.pred[PRED_2NxN];
> +                uint64_t splitCost = splitData[0].rdCost +
> splitData[1].rdCost + splitData[2].rdCost + splitData[3].rdCost;
> +                ModeDepth& md = m_modeDepth[depth];
> +                uint32_t threshold_2NxN, threshold_Nx2N;
> +
> +                if (m_slice->m_sliceType == P_SLICE)
> +                {
> +                    threshold_2NxN = splitData[0].mvCost[0] +
> splitData[1].mvCost[0];
> +                    threshold_Nx2N = splitData[0].mvCost[0] +
> splitData[2].mvCost[0];
> +                }
> +                else
> +                {
> +                    threshold_2NxN = (splitData[0].mvCost[0] +
> splitData[1].mvCost[0]
> +                                    + splitData[0].mvCost[1] +
> splitData[1].mvCost[1] + 1) >> 1;
> +                    threshold_Nx2N = (splitData[0].mvCost[0] +
> splitData[2].mvCost[0]
> +                                    + splitData[0].mvCost[1] +
> splitData[2].mvCost[1] + 1) >> 1;
> +                }
> +
> +                int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
> +                if (try_2NxN_first && splitCost < md.bestMode->rdCost +
> threshold_2NxN)
> +                {
> +                    refMasks[0] = splitData[0].splitRefs |
> splitData[1].splitRefs; /* top */
> +                    refMasks[1] = splitData[2].splitRefs |
> splitData[3].splitRefs; /* bot */
> +                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom,
> qp);
> +                    checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom,
> SIZE_2NxN, refMasks);
> +                    if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
> +                        bestInter = &md.pred[PRED_2NxN];
> +                }
> +
> +                if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
> +                {
> +                    refMasks[0] = splitData[0].splitRefs |
> splitData[2].splitRefs; /* left */
> +                    refMasks[1] = splitData[1].splitRefs |
> splitData[3].splitRefs; /* right */
> +                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom,
> qp);
> +                    checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom,
> SIZE_Nx2N, refMasks);
> +                    if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
> +                        bestInter = &md.pred[PRED_Nx2N];
> +                }
> +
> +                if (!try_2NxN_first && splitCost < md.bestMode->rdCost +
> threshold_2NxN)
> +                {
> +                    refMasks[0] = splitData[0].splitRefs |
> splitData[1].splitRefs; /* top */
> +                    refMasks[1] = splitData[2].splitRefs |
> splitData[3].splitRefs; /* bot */
> +                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom,
> qp);
> +                    checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom,
> SIZE_2NxN, refMasks);
> +                    if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
> +                        bestInter = &md.pred[PRED_2NxN];
> +                }
>              }
>
>              if (m_slice->m_sps->maxAMPDepth > depth)
>              {
> +                uint64_t splitCost = splitData[0].rdCost +
> splitData[1].rdCost + splitData[2].rdCost + splitData[3].rdCost;
> +                ModeDepth& md = m_modeDepth[depth];
> +                uint32_t threshold_2NxnU, threshold_2NxnD,
> threshold_nLx2N, threshold_nRx2N;
> +
> +                if (m_slice->m_sliceType == P_SLICE)
> +                {
> +                    threshold_2NxnU = splitData[0].mvCost[0] +
> splitData[1].mvCost[0];
> +                    threshold_2NxnD = splitData[2].mvCost[0] +
> splitData[3].mvCost[0];
> +
> +                    threshold_nLx2N = splitData[0].mvCost[0] +
> splitData[2].mvCost[0];
> +                    threshold_nRx2N = splitData[1].mvCost[0] +
> splitData[3].mvCost[0];
> +                }
> +                else
> +                {
> +                    threshold_2NxnU = (splitData[0].mvCost[0] +
> splitData[1].mvCost[0]
> +                                       + splitData[0].mvCost[1] +
> splitData[1].mvCost[1] + 1) >> 1;
> +                    threshold_2NxnD = (splitData[2].mvCost[0] +
> splitData[3].mvCost[0]
> +                                       + splitData[2].mvCost[1] +
> splitData[3].mvCost[1] + 1) >> 1;
> +
> +                    threshold_nLx2N = (splitData[0].mvCost[0] +
> splitData[2].mvCost[0]
> +                                       + splitData[0].mvCost[1] +
> splitData[2].mvCost[1] + 1) >> 1;
> +                    threshold_nRx2N = (splitData[1].mvCost[0] +
> splitData[3].mvCost[0]
> +                                       + splitData[1].mvCost[1] +
> splitData[3].mvCost[1] + 1) >> 1;
> +                }
> +
>                  bool bHor = false, bVer = false;
>                  if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
>                      bHor = true;
> @@ -960,35 +1025,69 @@
>
>                  if (bHor)
>                  {
> -                    refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top
> */
> -                    refMasks[1] = allSplitRefs;                /* 75% bot
> */
> -                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom,
> qp);
> -                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom,
> SIZE_2NxnU, refMasks);
> -                    if (md.pred[PRED_2NxnU].sa8dCost <
> bestInter->sa8dCost)
> -                        bestInter = &md.pred[PRED_2NxnU];
> -
> -                    refMasks[0] = allSplitRefs;                /* 75% top
> */
> -                    refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot
> */
> -                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom,
> qp);
> -                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom,
> SIZE_2NxnD, refMasks);
> -                    if (md.pred[PRED_2NxnD].sa8dCost <
> bestInter->sa8dCost)
> -                        bestInter = &md.pred[PRED_2NxnD];
> +                    int try_2NxnD_first = threshold_2NxnD <
> threshold_2NxnU;
> +                    if (try_2NxnD_first && splitCost <
> md.bestMode->rdCost + threshold_2NxnD)
> +                    {
> +                        refMasks[0] = allSplitRefs;
>               /* 75% top */
> +                        refMasks[1] = splitData[2].splitRefs |
> splitData[3].splitRefs; /* 25% bot */
> +                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU,
> cuGeom, qp);
> +                        checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom,
> SIZE_2NxnD, refMasks);
> +                        if (md.pred[PRED_2NxnD].sa8dCost <
> bestInter->sa8dCost)
> +                            bestInter = &md.pred[PRED_2NxnD];
> +                    }
> +
> +                    if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
> +                    {
> +                        refMasks[0] = splitData[0].splitRefs |
> splitData[1].splitRefs; /* 25% top */
> +                        refMasks[1] = allSplitRefs;
>               /* 75% bot */
> +                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU,
> cuGeom, qp);
> +                        checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom,
> SIZE_2NxnU, refMasks);
> +                        if (md.pred[PRED_2NxnU].sa8dCost <
> bestInter->sa8dCost)
> +                            bestInter = &md.pred[PRED_2NxnU];
> +                    }
> +
> +                    if (!try_2NxnD_first && splitCost <
> md.bestMode->rdCost + threshold_2NxnD)
> +                    {
> +                        refMasks[0] = allSplitRefs;
>               /* 75% top */
> +                        refMasks[1] = splitData[2].splitRefs |
> splitData[3].splitRefs; /* 25% bot */
> +                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU,
> cuGeom, qp);
> +                        checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom,
> SIZE_2NxnD, refMasks);
> +                        if (md.pred[PRED_2NxnD].sa8dCost <
> bestInter->sa8dCost)
> +                            bestInter = &md.pred[PRED_2NxnD];
> +                    }
>                  }
>                  if (bVer)
>                  {
> -                    refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25%
> left */
> -                    refMasks[1] = allSplitRefs;                /* 75%
> right */
> -                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom,
> qp);
> -                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom,
> SIZE_nLx2N, refMasks);
> -                    if (md.pred[PRED_nLx2N].sa8dCost <
> bestInter->sa8dCost)
> -                        bestInter = &md.pred[PRED_nLx2N];
> -
> -                    refMasks[0] = allSplitRefs;                /* 75%
> left */
> -                    refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25%
> right */
> -                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom,
> qp);
> -                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom,
> SIZE_nRx2N, refMasks);
> -                    if (md.pred[PRED_nRx2N].sa8dCost <
> bestInter->sa8dCost)
> -                        bestInter = &md.pred[PRED_nRx2N];
> +                    int try_nRx2N_first = threshold_nRx2N <
> threshold_nLx2N;
> +                    if (try_nRx2N_first && splitCost <
> md.bestMode->rdCost + threshold_nRx2N)
> +                    {
> +                        refMasks[0] = allSplitRefs;
>               /* 75% left  */
> +                        refMasks[1] = splitData[1].splitRefs |
> splitData[3].splitRefs; /* 25% right */
> +                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU,
> cuGeom, qp);
> +                        checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom,
> SIZE_nRx2N, refMasks);
> +                        if (md.pred[PRED_nRx2N].sa8dCost <
> bestInter->sa8dCost)
> +                            bestInter = &md.pred[PRED_nRx2N];
> +                    }
> +
> +                    if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
> +                    {
> +                        refMasks[0] = splitData[0].splitRefs |
> splitData[2].splitRefs; /* 25% left  */
> +                        refMasks[1] = allSplitRefs;
>               /* 75% right */
> +                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU,
> cuGeom, qp);
> +                        checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom,
> SIZE_nLx2N, refMasks);
> +                        if (md.pred[PRED_nLx2N].sa8dCost <
> bestInter->sa8dCost)
> +                            bestInter = &md.pred[PRED_nLx2N];
> +                    }
> +
> +                    if (!try_nRx2N_first && splitCost <
> md.bestMode->rdCost + threshold_nRx2N)
> +                    {
> +                        refMasks[0] = allSplitRefs;
>               /* 75% left  */
> +                        refMasks[1] = splitData[1].splitRefs |
> splitData[3].splitRefs; /* 25% right */
> +                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU,
> cuGeom, qp);
> +                        checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom,
> SIZE_nRx2N, refMasks);
> +                        if (md.pred[PRED_nRx2N].sa8dCost <
> bestInter->sa8dCost)
> +                            bestInter = &md.pred[PRED_nRx2N];
> +                    }
>                  }
>              }
>              bool bTryIntra = m_slice->m_sliceType != B_SLICE ||
> m_param->bIntraInBFrames;
> @@ -1139,19 +1238,32 @@
>      }
>
>      /* determine which motion references the parent CU should search */
> -    uint32_t refMask;
> -    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
> -        refMask = 0;
> -    else if (md.bestMode == &md.pred[PRED_SPLIT])
> -        refMask = allSplitRefs;
> -    else
> -    {
> -        /* use best merge/inter mode, in case of intra use 2Nx2N inter
> references */
> -        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu
> : md.bestMode->cu;
> -        uint32_t numPU = cu.getNumPartInter(0);
> -        refMask = 0;
> -        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++,
> subPartIdx += cu.getPUOffset(puIdx, 0))
> -            refMask |= cu.getBestRefIdx(subPartIdx);
> +    SplitData splitCUData;
> +    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
> +        splitCUData.splitRefs = 0;
> +    else if (md.bestMode == &md.pred[PRED_SPLIT])
> +        splitCUData.splitRefs = allSplitRefs;
> +    else
> +    {
> +        /* use best merge/inter mode, in case of intra use 2Nx2N inter
> references */
> +        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu
> : md.bestMode->cu;
> +        uint32_t numPU = cu.getNumPartInter(0);
> +        splitCUData.splitRefs = 0;
> +        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++,
> subPartIdx += cu.getPUOffset(puIdx, 0))
> +            splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
> +    }
> +
> +    if (!m_param->limitRectAmp)
> +    {
> +        splitCUData.mvCost[0] = 0; // L0
> +        splitCUData.mvCost[1] = 0; // L1
> +        splitCUData.rdCost    = 0;
> +    }
> +    else
> +    {
> +        splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost;
> // L0
> +        splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost;
> // L1
> +        splitCUData.rdCost    = md.bestMode->rdCost;
>      }
>
>      if (mightNotSplit)
> @@ -1169,7 +1281,7 @@
>      if (m_param->rdLevel)
>          md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr,
> cuGeom.absPartIdx);
>
> -    return refMask;
> +    return splitCUData;
>  }
>
>  SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const
> CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
> diff -r f3963e7e75b8 -r 65d7c1f5baf5 source/encoder/analysis.h
> --- a/source/encoder/analysis.h Wed Oct 14 17:44:33 2015 +0530
> +++ b/source/encoder/analysis.h Thu Oct 15 13:58:14 2015 +0530
> @@ -131,7 +131,7 @@
>
>      /* full analysis for a P or B slice CU */
>      uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom&
> cuGeom, int32_t qp);
> -    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom&
> cuGeom, int32_t qp);
> +    SplitData compressInterCU_rd0_4(const CUData& parentCTU, const
> CUGeom& cuGeom, int32_t qp);
>      SplitData compressInterCU_rd5_6(const CUData& parentCTU, const
> CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
>
>      /* measure merge and skip */
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20151015/cd3ff5ca/attachment-0001.html>


More information about the x265-devel mailing list