[x265] [PATCH] primitives: added C primitives for upShift/downShift input pixels

Wed Mar 12 09:45:51 CET 2014

On Wed, Mar 12, 2014 at 2:18 AM,  <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1394608582 -19800
> #      Wed Mar 12 12:46:22 2014 +0530
> # Node ID 680c858b33dd6c6edbf69d5e270bbce995eeb010
> # Parent  c24eda418b5c4cc899592ea2999128bdfa844b9c
> primitives: added C primitives for upShift/downShift input pixels
>
> diff -r c24eda418b5c -r 680c858b33dd source/Lib/TLibCommon/TComPicYuv.cpp
> --- a/source/Lib/TLibCommon/TComPicYuv.cpp      Tue Mar 11 18:27:26 2014 -0500
> +++ b/source/Lib/TLibCommon/TComPicYuv.cpp      Wed Mar 12 12:46:22 2014 +0530
> @@ -169,11 +169,11 @@
>      int height = m_picHeight - pady;
>
>      /* internal pad to multiple of 16x16 blocks */
> -    uint8_t rem = width & 15;
> +    uint8_t rem = height & 15;
> +    pady = rem ? 16 - rem : pady;
>
> +    rem = width & 15;
>      padx = rem ? 16 - rem : padx;
> -    rem = height & 15;
> -    pady = rem ? 16 - rem : pady;
>
>      /* add one more row and col of pad for downscale interpolation, fixes
>       * warnings from valgrind about using uninitialized pixels */
> @@ -192,31 +192,47 @@
>          uint8_t *y = (uint8_t*)pic.planes[0];
>          uint8_t *u = (uint8_t*)pic.planes[1];
>          uint8_t *v = (uint8_t*)pic.planes[2];
> +        int lumaWidth = width - rem;
> +        int chromaWidth = (width - rem) >> m_hChromaShift;
>
> -        for (int r = 0; r < height; r++)
> +        primitives.upShift(y, pic.stride[0] / sizeof(*y), Y, getStride(), lumaWidth, height);
> +        primitives.upShift(u, pic.stride[1] / sizeof(*u), U, getCStride(), chromaWidth, height >> m_vChromaShift);
> +        primitives.upShift(v, pic.stride[2] / sizeof(*v), V, getCStride(), chromaWidth, height >> m_vChromaShift);
> +        /*Handles videos with width which are not multiples of 16 */
> +        if(rem)

white-space

>          {
> -            for (int c = 0; c < width; c++)
> +            y += lumaWidth;
> +            Y += lumaWidth;
> +            v += chromaWidth;
> +            V += chromaWidth;
> +            u += chromaWidth;
> +            U += chromaWidth;

blech. the destination buffer has loads of padding (stride is much
wider than width), writing past the right edge is perfectly safe but
you do need to be careful about reading beyond the input stride.

> +            for (int r = 0; r < height; r++)
>              {
> -                Y[c] = ((pixel)y[c]) << 2;
> +                for (int c = 0; c < rem; c++)
> +                {
> +                    Y[c] = ((pixel)y[c]) << 2;
> +                }
> +
> +                Y += getStride();
> +                y += pic.stride[0] / sizeof(*y);
>              }
>
> -            Y += getStride();
> -            y += pic.stride[0] / sizeof(*y);
> +            for (int r = 0; r < height >> m_vChromaShift; r++)
> +            {
> +                for (int c = 0; c < rem >> m_hChromaShift; c++)
> +                {
> +                    U[c] = ((pixel)u[c]) << 2;
> +                    V[c] = ((pixel)v[c]) << 2;
> +                }
> +
> +                U += getCStride();
> +                V += getCStride();
> +                u += pic.stride[1] / sizeof(*u);
> +                v += pic.stride[2] / sizeof(*v);
> +            }
>          }
>
> -        for (int r = 0; r < height >> m_vChromaShift; r++)
> -        {
> -            for (int c = 0; c < width >> m_hChromaShift; c++)
> -            {
> -                U[c] = ((pixel)u[c]) << 2;
> -                V[c] = ((pixel)v[c]) << 2;
> -            }
> -
> -            U += getCStride();
> -            V += getCStride();
> -            u += pic.stride[1] / sizeof(*u);
> -            v += pic.stride[2] / sizeof(*v);
> -        }
>      }
>      else if (pic.bitDepth == 8)
>      {
> @@ -266,32 +282,48 @@
>          /* defensive programming, mask off bits that are supposed to be zero */
>          uint16_t mask = (1 << X265_DEPTH) - 1;
>          int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH);
> +        int lumaWidth = width - rem;
> +        int chromaWidth = (width - rem) >> m_hChromaShift;
>
>          /* shift and mask pixels to final size */
> -        for (int r = 0; r < height; r++)
> +        primitives.downShift(y, pic.stride[0] / sizeof(*y), Y, getStride(), lumaWidth, height, shift, mask);
> +        primitives.downShift(u, pic.stride[1] / sizeof(*u), U, getCStride(), chromaWidth, height >> m_vChromaShift, shift, mask);
> +        primitives.downShift(v, pic.stride[2] / sizeof(*v), V, getCStride(), chromaWidth, height >> m_vChromaShift, shift, mask);
> +        /*Handles videos with width which are not multiples of 16 */
> +        if(rem)
>          {
> -            for (int c = 0; c < width; c++)
> +            y += lumaWidth;
> +            Y += lumaWidth;
> +            v += chromaWidth;
> +            V += chromaWidth;
> +            u += chromaWidth;
> +            U += chromaWidth;
> +            for (int r = 0; r < height; r++)
>              {
> -                Y[c] = (pixel)((y[c] >> shift) & mask);
> +                for (int c = 0; c < rem; c++)
> +                {
> +                    Y[c] = (pixel)((y[c] >> shift) & mask);
> +                }
> +
> +                Y += getStride();
> +                y += pic.stride[0] / sizeof(*y);
>              }
>
> -            Y += getStride();
> -            y += pic.stride[0] / sizeof(*y);
> +            for (int r = 0; r < height >> m_vChromaShift; r++)
> +            {
> +                for (int c = 0; c < rem >> m_hChromaShift; c++)
> +                {
> +                    U[c] = (pixel)((u[c] >> shift) & mask);
> +                    V[c] = (pixel)((v[c] >> shift) & mask);
> +
> +                U += getCStride();
> +                V += getCStride();
> +                u += pic.stride[1] / sizeof(*u);
> +                v += pic.stride[2] / sizeof(*v);
> +            }
>          }
>
> -        for (int r = 0; r < height >> m_vChromaShift; r++)
> -        {
> -            for (int c = 0; c < width >> m_hChromaShift; c++)
> -            {
> -                U[c] = (pixel)((u[c] >> shift) & mask);
> -                V[c] = (pixel)((v[c] >> shift) & mask);
> -            }
> -
> -            U += getCStride();
> -            V += getCStride();
> -            u += pic.stride[1] / sizeof(*u);
> -            v += pic.stride[2] / sizeof(*v);
> -        }
> +    }
>      }
>
>      /* extend the right edge if width was not multiple of the minimum CU size */
> diff -r c24eda418b5c -r 680c858b33dd source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Tue Mar 11 18:27:26 2014 -0500
> +++ b/source/common/pixel.cpp   Wed Mar 12 12:46:22 2014 +0530
> @@ -852,6 +852,34 @@
>          dst  += dstStride;
>      }
>  }
> +
> +void upShift(uint8_t *src, int srcStride, pixel *dst, int dstStride, int width, int height)
> +{
> +    for (int r = 0; r < height; r++)
> +    {
> +        for (int c = 0; c < width; c++)
> +        {
> +            dst[c] = ((pixel)src[c]) << 2;
> +        }
> +
> +        dst += dstStride;
> +        src += srcStride;
> +    }
> +}
> +
> +void downShift(uint16_t *src, int srcStride, pixel *dst, int dstStride, int width, int height, int shift, uint16_t mask)
> +{
> +    for (int r = 0; r < height; r++)
> +    {
> +        for (int c = 0; c < width; c++)
> +        {
> +            dst[c] = (pixel)((src[c] >> shift) & mask);
> +        }
> +
> +        dst += dstStride;
> +        src += srcStride;
> +    }
> +}
>  }  // end anonymous namespace
>
>  namespace x265 {
> @@ -1099,5 +1127,7 @@
>      p.var[BLOCK_32x32] = pixel_var<32>;
>      p.var[BLOCK_64x64] = pixel_var<64>;
>      p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
> +    p.upShift = upShift;
> +    p.downShift = downShift;
>  }
>  }
> diff -r c24eda418b5c -r 680c858b33dd source/common/primitives.h
> --- a/source/common/primitives.h        Tue Mar 11 18:27:26 2014 -0500
> +++ b/source/common/primitives.h        Wed Mar 12 12:46:22 2014 +0530
> @@ -163,6 +163,8 @@
>  typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
>  typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft);
> +typedef void (*upShift_t) (uint8_t *src, int srcStride, pixel *dst, int dstStride, int width, int height);
> +typedef void (*downShift_t) (uint16_t *src, int srcStride, pixel *dst, int dstStride, int width, int height, int shift, uint16_t mask);

I prefer planecopy_sp() (short to pixel) and planecopy_cp() (char to pixel)

strides are generally intptr_t type in our asm primitives

>
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
> @@ -233,6 +235,8 @@
>      extendCURowBorder_t extendRowBorder;
>      // sao primitives
>      saoCuOrgE0_t      saoCuOrgE0;
> +    upShift_t         upShift;
> +    downShift_t       downShift;
>
>      struct
>      {
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho