[x265] [PATCH] TShortYUV : Implemented perfomance Primitives pixeladd (Clipadd)

Wed Jul 17 22:14:13 CEST 2013

On Wed, Jul 17, 2013 at 2:36 AM, <gopu at multicorewareinc.com> wrote:

> # HG changeset patch
> # User ggopu
> # Date 1374046566 -19800
> # Node ID c6c045e9272798ce5268643436759db22fd1950b
> # Parent  054d8c409569100c4aacb015ffb1b3281100d993
> TShortYUV : Implemented perfomance Primitives pixeladd (Clipadd)
>
> diff -r 054d8c409569 -r c6c045e92727 source/common/TShortYUV.cpp
> --- a/source/common/TShortYUV.cpp       Wed Jul 17 11:33:20 2013 +0530
> +++ b/source/common/TShortYUV.cpp       Wed Jul 17 13:06:06 2013 +0530
> @@ -122,61 +122,38 @@
>  #pragma warning (disable: 4244)
>  #endif
>
> -void TShortYUV::addClipLuma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1,
> unsigned int uiTrUnitIdx, unsigned int uiPartSize)
> +void TShortYUV::addClipLuma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1,
> unsigned int trUnitIdx, unsigned int partSize)
>

pcYuvSrc0 -> srcYuv0, pcYuvSrc1 -> srcYuv1

>  {
> -    int x, y;
> +    int x = partSize, y = partSize;
>

only need one var here (in the other places too)

>
> -    short* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);
> -    short* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);
> -    short* pDst  = getLumaAddr(uiTrUnitIdx, uiPartSize);
> +    short* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);
> +    short* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);
> +    short* dst  = getLumaAddr(trUnitIdx, partSize);
>

TShortYUV will probably need to be changed to UShort soon (globally)
because I am going to change 16bpp Pel to UShort to remove the impedence
mismatch with x264's pixel type.

> -    unsigned int iSrc0Stride = pcYuvSrc0->width;
> -    unsigned int iSrc1Stride = pcYuvSrc1->width;
> -    unsigned int iDstStride  = width;
> +    unsigned int src0Stride = pcYuvSrc0->width;
> +    unsigned int src1Stride = pcYuvSrc1->width;
> +    unsigned int dstStride  = width;
>

To be pedantic, these should be intptr_t

>
> -    for (y = uiPartSize - 1; y >= 0; y--)
> -    {
> -        for (x = uiPartSize - 1; x >= 0; x--)
> -        {
> -            pDst[x] = ClipY(pSrc0[x] + pSrc1[x]);
> -        }
> -
> -        pSrc0 += iSrc0Stride;
> -        pSrc1 += iSrc1Stride;
> -        pDst  += iDstStride;
> -    }
> +    primitives.pixeladd(x, y, dst, dstStride, src0, src1, src0Stride,
> src1Stride);
>  }
>
> -void TShortYUV::addClipChroma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1,
> unsigned int uiTrUnitIdx, unsigned int uiPartSize)
> +void TShortYUV::addClipChroma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1,
> unsigned int trUnitIdx, unsigned int partSize)
>  {
> -    int x, y;
> +    int x = partSize, y = partSize;
>
> -    short* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);
> -    short* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);
> -    short* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);
> -    short* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);
> -    short* pDstU = getCbAddr(uiTrUnitIdx, uiPartSize);
> -    short* pDstV = getCrAddr(uiTrUnitIdx, uiPartSize);
> +    short* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);
> +    short* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);
> +    short* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);
> +    short* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);
> +    short* dstU = getCbAddr(trUnitIdx, partSize);
> +    short* dstV = getCrAddr(trUnitIdx, partSize);
>
> -    unsigned int  iSrc0Stride = pcYuvSrc0->Cwidth;
> -    unsigned int  iSrc1Stride = pcYuvSrc1->Cwidth;
> -    unsigned int  iDstStride  = Cwidth;
> +    unsigned int  src0Stride = pcYuvSrc0->Cwidth;
> +    unsigned int  src1Stride = pcYuvSrc1->Cwidth;
> +    unsigned int  dstStride  = Cwidth;
>
> -    for (y = uiPartSize - 1; y >= 0; y--)
> -    {
> -        for (x = uiPartSize - 1; x >= 0; x--)
> -        {
> -            pDstU[x] = ClipC(pSrcU0[x] + pSrcU1[x]);
> -            pDstV[x] = ClipC(pSrcV0[x] + pSrcV1[x]);
> -        }
> -
> -        pSrcU0 += iSrc0Stride;
> -        pSrcU1 += iSrc1Stride;
> -        pSrcV0 += iSrc0Stride;
> -        pSrcV1 += iSrc1Stride;
> -        pDstU  += iDstStride;
> -        pDstV  += iDstStride;
> -    }
> +    primitives.pixeladd(x, y, dstU, dstStride, srcU0, srcU1, src0Stride,
> src1Stride);
> +    primitives.pixeladd(x, y, dstV, dstStride, srcV0, srcV1, src0Stride,
> src1Stride);
>  }
>
>  #if _MSC_VER
> diff -r 054d8c409569 -r c6c045e92727 source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Wed Jul 17 11:33:20 2013 +0530
> +++ b/source/common/pixel.cpp   Wed Jul 17 13:06:06 2013 +0530
> @@ -30,7 +30,6 @@
>  #include <algorithm>
>  #include <stdlib.h> // abs()
>

unrelated white-space changes, should be in a separate patch

> -
>  #define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX,
> FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
>      p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4]   =
> (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4,  DATA_TYPE1, DATA_TYPE2>;  \
>      p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8]   =
> (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8,  DATA_TYPE1, DATA_TYPE2>;  \
> @@ -504,14 +503,15 @@
>  void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride,
> int width, int height, int w0, int round, int shift, int offset, int
> bitDepth)
>  {
>      int x, y;
>

ditto

> +
>      for (y = height - 1; y >= 0; y--)
>      {
>          for (x = width - 1; x >= 0; )
>          {
>              // note: luma min width is 4
> -            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
> +            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
>              x--;
> -            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
> +            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
>              x--;
>          }
>
> @@ -535,6 +535,20 @@
>      }
>  }
>

non-templated C primitives should end with _c

> +void pixeladd_ss(int bx, int by, short *a, intptr_t dstride, short *b0,
> short *b1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    for (int y = 0; y < by; y++)
> +    {
> +        for (int x = 0; x < bx; x++)
> +        {
> +            a[x] = (short)ClipY(b0[x] + b1[x]);
> +        }
> +
> +        b0 += sstride0;
> +        b1 += sstride1;
> +        a += dstride;
> +    }
> +}
>  }  // end anonymous namespace
>
>  namespace x265 {
> @@ -738,5 +752,6 @@
>      p.weightpUni = weightUnidir;
>
>      p.pixelsubsp = pixelsub_sp;
> +    p.pixeladd   = pixeladd_ss;
>

if the function requires shorts, presumably there is a TComPiicYuv which
does the same operation using uint8_t, so we will want a version that
supports 8bit pixels, so the primitive funcdef should be named
p.pixeladd_pp, and this particular version should only be used for
high-bit-depth compiles.

>  }
>  }
> diff -r 054d8c409569 -r c6c045e92727 source/common/primitives.h
> --- a/source/common/primitives.h        Wed Jul 17 11:33:20 2013 +0530
> +++ b/source/common/primitives.h        Wed Jul 17 13:06:06 2013 +0530
> @@ -215,7 +215,8 @@
>  typedef void (*dequant_t)(int bitDepth, const int* src, int* dst, int
> width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList,
> unsigned int trSizeLog2, int *dequantCoef);
>  typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int
> *qCoef, int qBits, int add, int numCoeff);
>  typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int
> dstStride, int width, int height, int w0, int round, int shift, int offset,
> int bitDepth);
> -typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> +typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> +typedef void (*pixeladd_t)(int bx, int by, short *dst, intptr_t dstride,
> short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
>
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
> @@ -267,6 +268,7 @@
>
>      weightpUni_t    weightpUni;
>      pixelsub_sp_t   pixelsubsp;
> +    pixeladd_t      pixeladd;
>  };
>
>  /* This copy of the table is what gets used by the encoder.
> diff -r 054d8c409569 -r c6c045e92727 source/common/vec/blockcopy.inc
> --- a/source/common/vec/blockcopy.inc   Wed Jul 17 11:33:20 2013 +0530
> +++ b/source/common/vec/blockcopy.inc   Wed Jul 17 13:06:06 2013 +0530
> @@ -79,7 +79,7 @@
>          }
>      }
>      else
>

unrelated changes

> -#endif
> +#endif /* if INSTRSET >= 8 */
>      if (!(aligncheck & 15))
>      {
>          // fast path, multiples of 16 pixel wide blocks
> @@ -131,7 +131,7 @@
>          }
>      }
>      else
> -#endif
> +#endif /* if INSTRSET >= 8 && 0 */
>      if (!(aligncheck & 15))
>      {
>          // fast path, multiples of 16 pixel wide blocks
> @@ -170,6 +170,7 @@
>  void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t
> *src, intptr_t sstride)
>  {
>      size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride |
> dstride;
> +
>  #if INSTRSET >= 8 && 0
>      if (!(aligncheck & 31))
>      {
> @@ -189,7 +190,7 @@
>          }
>      }
>      else
> -#endif
> +#endif /* if INSTRSET >= 8 && 0 */
>      if (!(aligncheck & 15))
>      {
>          // fast path, multiples of 16 pixel wide blocks
> @@ -292,6 +293,84 @@
>      }
>  }
>
> +void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short
> *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 |
> dstride;
>

alignment restriction for shorts are more complicated.  the pointers and
strides must be 32 byte aligned for AVX2 but the width only needs to be a
multiple of 16

> +
> +#if INSTRSET >= 8 && 0
> +    if (!(aligncheck & 31))
> +    {
> +        // fast path, multiples of 32 pixel wide blocks
> +        // fast path, multiples of 16 pixel wide blocks
>

?

> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 8)
> +            {
> +                Vec8s vecsrc0, vecsrc1, vecsum;
> +                Vec8s zero(0), maxval((1 << 8) - 1); // Currently
> g_bitDepthY = 8 and g_bitDepthC = 8
>

let's not add another bomb that awaits us when we want to use larger
bit-depths.  either use g_bitDepthY directly here or pass in the bit-d

> +                vecsrc0.load_a(src0 + x);
> +                vecsrc1.load_a(src1 + x);
> +
> +                vecsum = vecsrc0 + vecsrc1;
> +                vecsum = max(vecsum, zero);
> +                vecsum = min(vecsum, maxval);
> +
> +                vecsum.store(dst + x);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +#endif /* if INSTRSET >= 8 && 0 */
> +    if (!(aligncheck & 15))
> +    {
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 8)
> +            {
> +                Vec8s vecsrc0, vecsrc1, vecsum;
> +                Vec8s zero(0), maxval((1 << 8) - 1); // Currently
> g_bitDepthY = 8 and g_bitDepthC = 8
> +                vecsrc0.load_a(src0 + x);
> +                vecsrc1.load_a(src1 + x);
> +
> +                vecsum = add_saturated(vecsrc0, vecsrc1);
> +                vecsum = max(vecsum, zero);
> +                vecsum = min(vecsum, maxval);
> +
> +                vecsum.store(dst + x);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +    {
> +        int tmp;
> +        int max = (1 << 8) - 1;
> +        // slow path, irregular memory alignments or sizes
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x++)
> +            {
> +                tmp = src0[x] + src1[x];
> +                tmp = tmp < 0 ? 0 : tmp;
> +                tmp = tmp > max ? max : tmp;
> +                dst[x] = (short)tmp;
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +}
> +
>  void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
>  {
>  #if HIGH_BIT_DEPTH
> @@ -307,5 +386,6 @@
>      p.blockcpy_sp = blockcopy_s_p;
>      p.blockcpy_sc = blockcopy_s_p;
>      p.pixelsubsp = pixelsub_sp;
> -#endif
> +    p.pixeladd = pixeladd_ss;
> +#endif /* if HIGH_BIT_DEPTH */
>  }
> diff -r 054d8c409569 -r c6c045e92727 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Wed Jul 17 11:33:20 2013 +0530
> +++ b/source/test/pixelharness.cpp      Wed Jul 17 13:06:06 2013 +0530
> @@ -376,6 +376,29 @@
>      return true;
>  }
>
> +bool PixelHarness::check_pixeladd_sp(x265::pixeladd_t ref,
> x265::pixeladd_t opt)
> +{
> +    ALIGN_VAR_16(short, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(short, opt_dest[64 * 64]);
> +    int bx = 64;
> +    int by = 64;
> +    int j = 0;
> +    for (int i = 0; i <= 100; i++)
> +    {
> +        opt(bx, by, opt_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j,
> 128, 128);
> +        ref(bx, by, ref_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j,
> 128, 128);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
> +            return false;
> +
> +        j += 4;
> +        bx = 4 * ((rand() & 15) + 1);
> +        by = 4 * ((rand() & 15) + 1);
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const
> EncoderPrimitives& opt)
>  {
>      for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
> @@ -535,6 +558,14 @@
>          }
>      }
>
> +    if (opt.pixeladd)
> +    {
> +        if (!check_pixeladd_sp(ref.pixeladd, opt.pixeladd))
> +        {
> +            printf("Pixel Add failed!\n");
>

upper case here not necessary

> +            return false;
> +        }
> +    }
>      return true;
>  }
>
> @@ -649,4 +680,10 @@
>          printf("Pixel Sub");
>

upper case here not necessary

>          REPORT_SPEEDUP(opt.pixelsubsp, ref.pixelsubsp, 64, 64,
> (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
>      }
> +
> +    if (opt.pixeladd)
> +    {
> +        printf("Pixel Add");
> +        REPORT_SPEEDUP(opt.pixeladd, ref.pixeladd, 64, 64, (short*)pbuf1,
> FENC_STRIDE, (short*)pbuf2, (short*)pbuf1, STRIDE, STRIDE);
> +    }
>  }
> diff -r 054d8c409569 -r c6c045e92727 source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Wed Jul 17 11:33:20 2013 +0530
> +++ b/source/test/pixelharness.h        Wed Jul 17 13:06:06 2013 +0530
> @@ -48,6 +48,7 @@
>      bool check_calcrecon(x265::calcrecon_t ref, x265::calcrecon_t opt);
>      bool check_weightpUni(x265::weightpUni_t ref, x265::weightpUni_t opt);
>      bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t
> opt);
> +    bool check_pixeladd_sp(x265::pixeladd_t ref, x265::pixeladd_t opt);
>
>  public:
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130717/d0a46287/attachment-0001.html>