[x265] [PATCH] intra refactoring: removed storing left neighbouring pixels in 144 bytes stride

Steve Borho steve at borho.org
Fri Nov 28 21:19:23 CET 2014


On 11/28, ashok at multicorewareinc.com wrote:
> # HG changeset patch
> # User Ashok Kumar Mishra<ashok at multicorewareinc.com>
> # Date 1416900898 -19800
> #      Tue Nov 25 13:04:58 2014 +0530
> # Node ID b6146d08a1f1c78246c2650e532db8cefdbc3bfb
> # Parent  d32249002258d65092674c7782ea022ae17b75d4
> intra refactoring: removed storing left neighbouring pixels in 144 bytes stride

This looks like a patch series that needs to be kept in a forked repo
until all of the intra assembly can be updated and can be pulled into
the main repo in one coherent group of commits.

Why was the HM ADI buffer was organized so innefficiently?

> diff -r d32249002258 -r b6146d08a1f1 source/common/predict.cpp
> --- a/source/common/predict.cpp	Fri Nov 28 11:10:05 2014 +0530
> +++ b/source/common/predict.cpp	Tue Nov 25 13:04:58 2014 +0530
> @@ -83,8 +83,7 @@
>  {
>      int tuSize = 1 << log2TrSize;
>  
> -    pixel* refLft;
> -    pixel* refAbv;
> +    pixel *refLft, *refAbv;
>  
>      if (!(g_intraFilterFlags[dirMode] & tuSize))
>      {
> @@ -116,8 +115,10 @@
>      pixel* left = buf0 + bufOffset;
>  
>      int limit = (dirMode <= 25 && dirMode >= 11) ? (tuSize + 1 + 1) : (tuSize2 + 1);
> -    for (int k = 0; k < limit; k++)
> -        left[k] = src[k * ADI_BUF_STRIDE];
> +
> +    left[0] = src[0];
> +    for (int k = 1; k < limit; k++)
> +        left[k] = src[k + tuSize2];

this probably wants to be a memcpy

>      if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
>      {
> @@ -678,10 +679,11 @@
>      refAbove += bufOffset;
>      refLeft += bufOffset;
>  
> -    //  ADI_BUF_STRIDE * (2 * tuSize + 1);
>      memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel));
> -    for (int k = 0; k < tuSize2 + 1; k++)
> -        refLeft[k] = adiBuf[k * ADI_BUF_STRIDE];
> +
> +    refLeft[0] = adiBuf[0];
> +    for (int k = 1; k < tuSize2 + 1 ; k++)
> +        refLeft[k] = adiBuf[k + tuSize2];

this probably wants to be a memcpy

>      if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
>      {
> @@ -694,46 +696,45 @@
>          if (bStrongSmoothing)
>          {
>              const int trSize = 32;
> -            const int trSize2 = 32 * 2;
> +            const int trSize2 = trSize << 1;
>              const int threshold = 1 << (X265_DEPTH - 5);
>              int refBL = refLeft[trSize2];
>              int refTL = refAbove[0];
>              int refTR = refAbove[trSize2];
> -            bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize]) < threshold &&
> -                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
> +            bStrongSmoothing = (abs(refBL + refTL - (refLeft[trSize] << 1)) < threshold &&
> +                abs(refTL + refTR - (refAbove[trSize] << 1)) < threshold);
>  
>              if (bStrongSmoothing)
>              {
>                  // bilinear interpolation
>                  const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
>                  int init = (refTL << shift) + tuSize;
> -                int delta;
> +                int deltaL, deltaR;
>  
>                  refLeftFlt[0] = refAboveFlt[0] = refAbove[0];
>  
>                  //TODO: Performance Primitive???
> -                delta = refBL - refTL;
> +                deltaL = refBL - refTL; deltaR = refTR - refTL;
>                  for (int i = 1; i < trSize2; i++)
> -                    refLeftFlt[i] = (pixel)((init + delta * i) >> shift);
> +                {
> +                    refLeftFlt[i] = (pixel)((init + deltaL * i) >> shift);
> +                    refAboveFlt[i] = (pixel)((init + deltaR * i) >> shift);
> +                }
>                  refLeftFlt[trSize2] = refLeft[trSize2];
> -
> -                delta = refTR - refTL;
> -                for (int i = 1; i < trSize2; i++)
> -                    refAboveFlt[i] = (pixel)((init + delta * i) >> shift);
>                  refAboveFlt[trSize2] = refAbove[trSize2];
>  
>                  return;
>              }
>          }
>  
> -        refLeft[-1] = refAbove[1];
> -        for (int i = 0; i < tuSize2; i++)
> -            refLeftFlt[i] = (refLeft[i - 1] + 2 * refLeft[i] + refLeft[i + 1] + 2) >> 2;
> +        refLeftFlt[0] = (refAbove[1] + (refLeft[0] << 1) + refLeft[1] + 2) >> 2;
> +        for (int i = 1; i < tuSize2; i++)
> +            refLeftFlt[i] = (refLeft[i - 1] + (refLeft[i] << 1) + refLeft[i + 1] + 2) >> 2;
>          refLeftFlt[tuSize2] = refLeft[tuSize2];
>  
>          refAboveFlt[0] = refLeftFlt[0];
>          for (int i = 1; i < tuSize2; i++)
> -            refAboveFlt[i] = (refAbove[i - 1] + 2 * refAbove[i] + refAbove[i + 1] + 2) >> 2;
> +            refAboveFlt[i] = (refAbove[i - 1] + (refAbove[i] << 1) + refAbove[i + 1] + 2) >> 2;
>          refAboveFlt[tuSize2] = refAbove[tuSize2];

these smoothing functions should be assembly coded at some point

>
>      }
>  }
> @@ -804,14 +805,15 @@
>      uint32_t tuSize = intraNeighbors.tuSize;
>      uint32_t refSize = tuSize * 2 + 1;
>  
> +    // Nothing is available, perform DC prediction.
>      if (numIntraNeighbor == 0)
>      {
>          // Fill border with DC value
>          for (uint32_t i = 0; i < refSize; i++)
>              adiRef[i] = dcValue;

This wants to be a memset

> -        for (uint32_t i = 1; i < refSize; i++)
> -            adiRef[i * ADI_BUF_STRIDE] = dcValue;
> +        for (uint32_t i = 0; i < refSize - 1; i++)
> +            adiRef[i + refSize] = dcValue;

and combined with this memset

>      }
>      else if (numIntraNeighbor == totalUnits)
>      {
> @@ -821,9 +823,10 @@
>  
>          // Fill left border with rec. samples
>          adiTemp = adiOrigin - 1;
> -        for (uint32_t i = 1; i < refSize; i++)
> +
> +        for (uint32_t i = 0; i < refSize - 1; i++)
>          {
> -            adiRef[i * ADI_BUF_STRIDE] = adiTemp[0];
> +            adiRef[i + refSize] = adiTemp[0];
>              adiTemp += picStride;
>          }
>      }
> @@ -943,8 +946,8 @@
>          memcpy(adiRef, adi, refSize * sizeof(*adiRef));
>  
>          adi = adiLineBuffer + refSize - 1;
> -        for (int i = 1; i < (int)refSize; i++)
> -            adiRef[i * ADI_BUF_STRIDE] = adi[-i];
> +        for (int i = 0; i < (int)refSize - 1; i++)
> +            adiRef[i + refSize] = adi[-(i + 1)];
>      }
>  }
>  
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list