[x265] [PATCH] Primitive: Performance Primitives for Pixel add Clip - TcomYuv and TshortYuv

Thu Jul 18 09:41:36 CEST 2013

On Thu, Jul 18, 2013 at 2:25 AM, <gopu at multicorewareinc.com> wrote:

> # HG changeset patch
> # User ggopu
> # Date 1374132302 -19800
> # Node ID d93bf22889f8a58c3b3a03733c8e031ffe192fc3
> # Parent  f813f110d69a1a6650e813dd4e612216982a0264
> Primitive: Performance Primitives for Pixel add Clip - TcomYuv and
> TshortYuv
>
> diff -r f813f110d69a -r d93bf22889f8 source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Thu Jul 18 02:10:37 2013 -0500
> +++ b/source/common/pixel.cpp   Thu Jul 18 12:55:02 2013 +0530
> @@ -535,6 +535,35 @@
>      }
>  }
>
> +void pixeladd_ss_c(int bx, int by, short *a, intptr_t dstride, short *b0,
> short *b1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    for (int y = 0; y < by; y++)
> +    {
> +        for (int x = 0; x < bx; x++)
> +        {
> +            a[x] = (short)ClipY(b0[x] + b1[x]);
> +        }
> +
> +        b0 += sstride0;
> +        b1 += sstride1;
> +        a += dstride;
> +    }
> +}
> +
> +void pixeladd_pp_c(int bx, int by, pixel *a, intptr_t dstride, pixel *b0,
> pixel *b1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    for (int y = 0; y < by; y++)
> +    {
> +        for (int x = 0; x < bx; x++)
> +        {
> +            a[x] = (pixel)ClipY(b0[x] + b1[x]);
> +        }
> +
> +        b0 += sstride0;
> +        b1 += sstride1;
> +        a += dstride;
> +    }
> +}
>  }  // end anonymous namespace
>
>  namespace x265 {
> @@ -738,5 +767,7 @@
>      p.weightpUni = weightUnidir;
>
>      p.pixelsub_sp = pixelsub_sp_c;
> +    p.pixeladd_pp = pixeladd_pp_c;
> +    p.pixeladd_ss = pixeladd_ss_c;
>  }
>  }
> diff -r f813f110d69a -r d93bf22889f8 source/common/primitives.h
> --- a/source/common/primitives.h        Thu Jul 18 02:10:37 2013 -0500
> +++ b/source/common/primitives.h        Thu Jul 18 12:55:02 2013 +0530
> @@ -192,7 +192,9 @@
>  typedef void (*blockcpy_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src, intptr_t sstride); // dst is aligned
>  typedef void (*blockcpy_ps_t)(int bx, int by, pixel *dst, intptr_t
> dstride, short *src, intptr_t sstride); // dst is aligned
>  typedef void (*blockcpy_sc_t)(int bx, int by, short *dst, intptr_t
> dstride, uint8_t *src, intptr_t sstride); // dst is aligned
> -typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> +typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> +typedef void (*pixeladd_ss_t)(int bx, int by, short *dst, intptr_t
> dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
> +typedef void (*pixeladd_pp_t)(int bx, int by, pixel *dst, intptr_t
> dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
>
>  typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst,
> intptr_t dstStride, int width, int bFilter);
>  typedef void (*intra_planar_t)(pixel* src, intptr_t srcStride, pixel*
> dst, intptr_t dstStride, int width);
> @@ -275,6 +277,8 @@
>
>      weightpUni_t    weightpUni;
>      pixelsub_sp_t   pixelsub_sp;
> +    pixeladd_ss_t   pixeladd_ss;
> +    pixeladd_pp_t   pixeladd_pp;
>
>      filterVwghtd_t  filterVwghtd;
>      filterHwghtd_t  filterHwghtd;
>

everything up to this point looks good

> diff -r f813f110d69a -r d93bf22889f8 source/common/vec/blockcopy.inc
> --- a/source/common/vec/blockcopy.inc   Thu Jul 18 02:10:37 2013 -0500
> +++ b/source/common/vec/blockcopy.inc   Thu Jul 18 12:55:02 2013 +0530
> @@ -292,6 +292,162 @@
>      }
>  }
>
> +void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short
> *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 |
> dstride;
>

This is still not checking bx correctly.  It needs to be checked separately
from the pointers and strides

> +#if INSTRSET >= 8 && 0
> +    if (!(aligncheck & 31))
> +    {
> +        // fast path, multiples of 32 pixel wide blocks
> +        // fast path, multiples of 16 pixel wide blocks
>

one of these comments is wrong

> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 32)
> +            {
> +                Vec32s vecsrc0, vecsrc1, vecsum;
> +                Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
>

The comment above is wrong.  zero and maxval should be removed from out of
the loop

> +                vecsrc0.load_a(src0 + x);
> +                vecsrc1.load_a(src1 + x);
> +
> +                vecsum = vecsrc0 + vecsrc1;
> +                vecsum = max(vecsum, zero);
> +                vecsum = min(vecsum, maxval);
> +
> +                vecsum.store(dst + x);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +#endif /* if INSTRSET >= 8 && 0 */
> +    if (!(aligncheck & 15))
> +    {
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 8)
> +            {
> +                Vec8s vecsrc0, vecsrc1, vecsum;
> +                Vec8s zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
>

same wrong comment

> +                vecsrc0.load_a(src0 + x);
> +                vecsrc1.load_a(src1 + x);
> +
> +                vecsum = add_saturated(vecsrc0, vecsrc1);
> +                vecsum = max(vecsum, zero);
> +                vecsum = min(vecsum, maxval);
> +
> +                vecsum.store(dst + x);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +    {
>

there needs to be another else clause that uses SIMD for the copies but
uses unaligned loads, if bx is a multiple of 8

> +        int tmp;
> +        int max = (1 << X265_DEPTH) - 1;
> +        // slow path, irregular memory alignments or sizes
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x++)
> +            {
> +                tmp = src0[x] + src1[x];
> +                tmp = tmp < 0 ? 0 : tmp;
> +                tmp = tmp > max ? max : tmp;
> +                dst[x] = (short)tmp;
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +}
>

This function should only be compiled for HIGH_BIT_DEPTH, else it is just
wrong

> +void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel
> *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 |
> dstride;
> +
> +#if INSTRSET >= 8 && 0
> +    if (!(aligncheck & 31))
> +    {
> +        // fast path, multiples of 32 pixel wide blocks
> +        // fast path, multiples of 16 pixel wide blocks
>

One of these comments is wrong

> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 32)
> +            {
> +                Vec32s vecsrc0, vecsrc1, vecsum;
> +                Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
>

It's copying short values up here

> +                vecsrc0.load_a(src0 + x);
> +                vecsrc1.load_a(src1 + x);
> +
> +                vecsum = vecsrc0 + vecsrc1;
> +                vecsum = max(vecsum, zero);
> +                vecsum = min(vecsum, maxval);
> +
> +                vecsum.store(dst + x);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +#endif /* if INSTRSET >= 8 && 0 */
> +    if (!(aligncheck & 15))
> +    {
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 16)
> +            {
> +                Vec16uc vecsrc0, vecsrc1, vecsum;
> +                Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
> +                vecsrc0.load_a(src0 + x);
> +                vecsrc1.load_a(src1 + x);
>

And copying char values down here

> +                vecsum = add_saturated(vecsrc0, vecsrc1);
> +                vecsum = max(vecsum, zero);
> +                vecsum = min(vecsum, maxval);
> +
> +                vecsum.store(dst + x);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
>

ditto for an additional else if () clause here for unaligned pointers or
strides

> +    {
> +        int tmp;
> +        int max = (1 << X265_DEPTH) - 1;
> +        // slow path, irregular memory alignments or sizes
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x++)
> +            {
> +                tmp = src0[x] + src1[x];
> +                tmp = tmp < 0 ? 0 : tmp;
> +                tmp = tmp > max ? max : tmp;
> +                dst[x] = (pixel)tmp;
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +}
> +
>  void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
>  {
>  #if HIGH_BIT_DEPTH
> @@ -300,11 +456,15 @@
>      p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;
>      p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;
>      p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;
> +    p.pixeladd_pp = (x265::pixeladd_pp_t)pixeladd_ss;
>

If TShortYuv was changed to use UShort buffers, this typecast would be
unnecessary.

> +    p.pixeladd_ss = pixeladd_ss;
>  #else
>      p.blockcpy_pp = blockcopy_p_p;
>      p.blockcpy_ps = blockcopy_p_s;
>      p.blockcpy_sp = blockcopy_s_p;
>      p.blockcpy_sc = blockcopy_s_p;
>      p.pixelsub_sp = pixelsub_sp;
> -#endif
> +    p.pixeladd_ss = pixeladd_ss;
> +    p.pixeladd_pp = pixeladd_pp;
> +#endif /* if HIGH_BIT_DEPTH */
>  }
> diff -r f813f110d69a -r d93bf22889f8 source/common/vec/vecprimitives.inc
> --- a/source/common/vec/vecprimitives.inc       Thu Jul 18 02:10:37 2013
> -0500
> +++ b/source/common/vec/vecprimitives.inc       Thu Jul 18 12:55:02 2013
> +0530
> @@ -28,6 +28,9 @@
>  #include "utils.h"
>  #include <string.h>
>
> +#include "TLibCommon\TComRom.h"
> +#include "TLibCommon\TypeDef.h"
>

You must use forward slashes in include statements, else this will not
compile on Linux

> +
>  using namespace x265;
>
>  namespace {
> diff -r f813f110d69a -r d93bf22889f8 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Thu Jul 18 02:10:37 2013 -0500
> +++ b/source/test/pixelharness.cpp      Thu Jul 18 12:55:02 2013 +0530
> @@ -376,6 +376,52 @@
>      return true;
>  }
>
> +bool PixelHarness::check_pixeladd_ss(x265::pixeladd_ss_t ref,
> x265::pixeladd_ss_t opt)
> +{
> +    ALIGN_VAR_16(short, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(short, opt_dest[64 * 64]);
> +    int bx = 64;
> +    int by = 64;
> +    int j = 0;
> +    for (int i = 0; i <= 100; i++)
> +    {
> +        opt(bx, by, opt_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j,
> 128, 128);
> +        ref(bx, by, ref_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j,
> 128, 128);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
> +            return false;
> +
> +        j += 4;
> +        bx = 4 * ((rand() & 15) + 1);
> +        by = 4 * ((rand() & 15) + 1);
> +    }
> +
> +    return true;
> +}
> +
> +bool PixelHarness::check_pixeladd_pp(x265::pixeladd_pp_t ref,
> x265::pixeladd_pp_t opt)
> +{
> +    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> +    int bx = 64;
> +    int by = 64;
> +    int j = 0;
> +    for (int i = 0; i <= 100; i++)
> +    {
> +        opt(bx, by, opt_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
> +        ref(bx, by, ref_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> +            return false;
> +
> +        j += 4;
> +        bx = 4 * ((rand() & 15) + 1);
> +        by = 4 * ((rand() & 15) + 1);
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const
> EncoderPrimitives& opt)
>  {
>      for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
> @@ -535,6 +581,24 @@
>          }
>      }
>
> +    if (opt.pixeladd_ss)
> +    {
> +        if (!check_pixeladd_ss(ref.pixeladd_ss, opt.pixeladd_ss))
> +        {
> +            printf("pixel add clip failed!\n");
> +            return false;
> +        }
> +    }
> +
> +    if (opt.pixeladd_pp)
> +    {
> +        if (!check_pixeladd_pp(ref.pixeladd_pp, opt.pixeladd_pp))
> +        {
> +            printf("pixel add clip failed!\n");
> +            return false;
> +        }
> +    }
> +
>      return true;
>  }
>
> @@ -649,4 +713,16 @@
>          printf("Pixel Sub");
>          REPORT_SPEEDUP(opt.pixelsub_sp, ref.pixelsub_sp, 64, 64,
> (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
>      }
> +
> +    if (opt.pixeladd_ss)
> +    {
> +        printf("pixel_ss add");
> +        REPORT_SPEEDUP(opt.pixeladd_ss, ref.pixeladd_ss, 64, 64,
> (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, (short*)pbuf1, STRIDE, STRIDE);
> +    }
> +
> +    if (opt.pixeladd_pp)
> +    {
> +        printf("pixel_pp add");
> +        REPORT_SPEEDUP(opt.pixeladd_pp, ref.pixeladd_pp, 64, 64, pbuf1,
> FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
> +    }
>  }
> diff -r f813f110d69a -r d93bf22889f8 source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Thu Jul 18 02:10:37 2013 -0500
> +++ b/source/test/pixelharness.h        Thu Jul 18 12:55:02 2013 +0530
> @@ -48,6 +48,8 @@
>      bool check_calcrecon(x265::calcrecon_t ref, x265::calcrecon_t opt);
>      bool check_weightpUni(x265::weightpUni_t ref, x265::weightpUni_t opt);
>      bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t
> opt);
> +    bool check_pixeladd_ss(x265::pixeladd_ss_t ref, x265::pixeladd_ss_t
> opt);
> +    bool check_pixeladd_pp(x265::pixeladd_pp_t ref, x265::pixeladd_pp_t
> opt);
>
>  public:
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130718/600b3af4/attachment-0001.html>