[x265] [PATCH] primitives: added C primitives for upShift/downShift input pixels

Fri Mar 14 18:50:23 CET 2014

On Fri, Mar 14, 2014 at 8:06 AM,  <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1394784629 -19800
> #      Fri Mar 14 13:40:29 2014 +0530
> # Node ID 24ff000ca31e566e8070cf2ac75f3ddc79a108b4
> # Parent  63eea432b81cf350e34db82f28b21b7e6bf8333b
> primitives: added C primitives for upShift/downShift input pixels
>
> diff -r 63eea432b81c -r 24ff000ca31e source/Lib/TLibCommon/TComPicYuv.cpp
> --- a/source/Lib/TLibCommon/TComPicYuv.cpp      Fri Mar 14 12:48:46 2014 +0530
> +++ b/source/Lib/TLibCommon/TComPicYuv.cpp      Fri Mar 14 13:40:29 2014 +0530
> @@ -169,11 +169,11 @@
>      int height = m_picHeight - pady;
>
>      /* internal pad to multiple of 16x16 blocks */
> -    uint8_t rem = width & 15;
> +    uint8_t rem = height & 15;
> +    pady = rem ? 16 - rem : pady;
>
> +    rem = width & 15;
>      padx = rem ? 16 - rem : padx;
> -    rem = height & 15;
> -    pady = rem ? 16 - rem : pady;
>
>      /* add one more row and col of pad for downscale interpolation, fixes
>       * warnings from valgrind about using uninitialized pixels */
> @@ -193,29 +193,38 @@
>          uint8_t *uChar = (uint8_t*)pic.planes[1];
>          uint8_t *vChar = (uint8_t*)pic.planes[2];
>
> -        for (int r = 0; r < height; r++)
> +        if (rem == 0)
>          {
> -            for (int c = 0; c < width; c++)
> +            primitives.upShift(yChar, pic.stride[0] / sizeof(*yChar), yPixel, getStride(), width, height);
> +            primitives.upShift(uChar, pic.stride[1] / sizeof(*uChar), uPixel, getCStride(), width >> m_hChromaShift, height >> m_vChromaShift);
> +            primitives.upShift(vChar, pic.stride[2] / sizeof(*vChar), vPixel, getCStride(), width >> m_hChromaShift, height >> m_vChromaShift);

the primitive names should be planecopy_sp or planecopy_cp

> +        }
> +        else /*Handles width not multiple of 16*/

we put spaces inside the comment delimiters /* X */, not /*X*/

> +        {

My general point, which still seems to be lost, is that it would be
preferable for the assembly to just deal with input widths
non-multiples of 16 in the best way it can, it would be better than
this.  I understand that having to deal with small reads makes the
assembly slower, but in this situation we have no control over the
input buffer strides.

You could make the assembly round up the width to a multiple 16bytes
and this would be perfectly fine except the very last row would need
special handling.  The user is passing us a pointer, stride, and
height and we're allowed to assume all the data between pointer to
pointer + stride * height is ok to read.  So reading past the row
width is perfectly safe except perhaps on the last row.  On the last
row you may need to copy the last sub-16 bytes one by one.

If you wrote it this way, the function could also perform the width
extension by replicating the last valid pixel prior to doing the last
write of each row.

I guess in short my point is that our goal here isn't to write
primitives that run fast in the test bench and show great x-factors.
Our goal here is to make the encoder run faster.  Writing a primitive
that only supports input pictures an even width of 16 will make the
primitive fast, but it doesn't help the encoder at all.

> +            for (int r = 0; r < height; r++)
>              {
> -                yPixel[c] = ((pixel)yChar[c]) << 2;
> +                for (int c = 0; c < width; c++)
> +                {
> +                    yPixel[c] = ((pixel)yChar[c]) << 2;
> +                }
> +
> +                yPixel += getStride();
> +                yChar += pic.stride[0] / sizeof(*yChar);
>              }
>
> -            yPixel += getStride();
> -            yChar += pic.stride[0] / sizeof(*yChar);
> -        }
> +            for (int r = 0; r < height >> m_vChromaShift; r++)
> +            {
> +                for (int c = 0; c < width >> m_hChromaShift; c++)
> +                {
> +                    uPixel[c] = ((pixel)uChar[c]) << 2;
> +                    vPixel[c] = ((pixel)vChar[c]) << 2;
> +                }
>
> -        for (int r = 0; r < height >> m_vChromaShift; r++)
> -        {
> -            for (int c = 0; c < width >> m_hChromaShift; c++)
> -            {
> -                uPixel[c] = ((pixel)uChar[c]) << 2;
> -                vPixel[c] = ((pixel)vChar[c]) << 2;
> +                uPixel += getCStride();
> +                vPixel += getCStride();
> +                uChar += pic.stride[1] / sizeof(*uChar);
> +                vChar += pic.stride[2] / sizeof(*vChar);
>              }
> -
> -            uPixel += getCStride();
> -            vPixel += getCStride();
> -            uChar += pic.stride[1] / sizeof(*uChar);
> -            vChar += pic.stride[2] / sizeof(*vChar);
>          }
>      }
>      else if (pic.bitDepth == 8)
> @@ -268,29 +277,38 @@
>          int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH);
>
>          /* shift and mask pixels to final size */
> -        for (int r = 0; r < height; r++)
> +        if (rem == 0)
>          {
> -            for (int c = 0; c < width; c++)
> +            primitives.downShift(yShort, pic.stride[0] / sizeof(*yShort), yPixel, getStride(), width, height, shift, mask);
> +            primitives.downShift(uShort, pic.stride[1] / sizeof(*uShort), uPixel, getCStride(), width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
> +            primitives.downShift(vShort, pic.stride[2] / sizeof(*vShort), vPixel, getCStride(), width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
> +        }
> +        else /*Handles width not multiple of 16*/
> +        {
> +            for (int r = 0; r < height; r++)
>              {
> -                yPixel[c] = (pixel)((yShort[c] >> shift) & mask);
> +                for (int c = 0; c < width; c++)
> +                {
> +                    yPixel[c] = (pixel)((yShort[c] >> shift) & mask);
> +                }
> +
> +                yPixel += getStride();
> +                yShort += pic.stride[0] / sizeof(*yShort);
>              }
>
> -            yPixel += getStride();
> -            yShort += pic.stride[0] / sizeof(*yShort);
> -        }
> +            for (int r = 0; r < height >> m_vChromaShift; r++)
> +            {
> +                for (int c = 0; c < width >> m_hChromaShift; c++)
> +                {
> +                    uPixel[c] = (pixel)((uShort[c] >> shift) & mask);
> +                    vPixel[c] = (pixel)((vShort[c] >> shift) & mask);
> +                }
>
> -        for (int r = 0; r < height >> m_vChromaShift; r++)
> -        {
> -            for (int c = 0; c < width >> m_hChromaShift; c++)
> -            {
> -                uPixel[c] = (pixel)((uShort[c] >> shift) & mask);
> -                vPixel[c] = (pixel)((vShort[c] >> shift) & mask);
> +                uPixel += getCStride();
> +                vPixel += getCStride();
> +                uShort += pic.stride[1] / sizeof(*uShort);
> +                vShort += pic.stride[2] / sizeof(*vShort);
>              }
> -
> -            uPixel += getCStride();
> -            vPixel += getCStride();
> -            uShort += pic.stride[1] / sizeof(*uShort);
> -            vShort += pic.stride[2] / sizeof(*vShort);
>          }
>      }
>
> diff -r 63eea432b81c -r 24ff000ca31e source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Fri Mar 14 12:48:46 2014 +0530
> +++ b/source/common/pixel.cpp   Fri Mar 14 13:40:29 2014 +0530
> @@ -852,6 +852,34 @@
>          dst  += dstStride;
>      }
>  }
> +
> +void upShift(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height)

change these function names as well

> +{
> +    for (int r = 0; r < height; r++)
> +    {
> +        for (int c = 0; c < width; c++)
> +        {
> +            dst[c] = ((pixel)src[c]) << 2;
> +        }
> +
> +        dst += dstStride;
> +        src += srcStride;
> +    }
> +}
> +
> +void downShift(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
> +{
> +    for (int r = 0; r < height; r++)
> +    {
> +        for (int c = 0; c < width; c++)
> +        {
> +            dst[c] = (pixel)((src[c] >> shift) & mask);
> +        }
> +
> +        dst += dstStride;
> +        src += srcStride;
> +    }
> +}
>  }  // end anonymous namespace
>
>  namespace x265 {
> @@ -1099,5 +1127,7 @@
>      p.var[BLOCK_32x32] = pixel_var<32>;
>      p.var[BLOCK_64x64] = pixel_var<64>;
>      p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
> +    p.upShift = upShift;
> +    p.downShift = downShift;
>  }
>  }
> diff -r 63eea432b81c -r 24ff000ca31e source/common/primitives.h
> --- a/source/common/primitives.h        Fri Mar 14 12:48:46 2014 +0530
> +++ b/source/common/primitives.h        Fri Mar 14 13:40:29 2014 +0530
> @@ -163,6 +163,8 @@
>  typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
>  typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft);
> +typedef void (*planecopy_cp) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height);
> +typedef void (*planecopy_sp) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
>
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
> @@ -233,6 +235,8 @@
>      extendCURowBorder_t extendRowBorder;
>      // sao primitives
>      saoCuOrgE0_t      saoCuOrgE0;
> +    planecopy_cp         upShift;
> +    planecopy_sp      downShift;
>
>      struct
>      {
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho