[x265] [PATCH] TShortYuv : Performance Primitives for Luma and Chroma Subtracting

Tue Jul 16 19:09:57 CEST 2013

On Tue, Jul 16, 2013 at 7:10 AM, <gopu at multicorewareinc.com> wrote:

> # HG changeset patch
> # User ggopu
> # Date 1373976605 -19800
> # Node ID be5257d512becc658a3c0f1d7a0a7defd0c911af
> # Parent  c9bb72e8cb8effc0d1d0e99f0b9abc8d341c652a
> TShortYuv : Performance Primitives for Luma and Chroma Subtracting
>
> diff -r c9bb72e8cb8e -r be5257d512be source/common/TShortYUV.cpp
> --- a/source/common/TShortYUV.cpp       Mon Jul 15 23:41:11 2013 -0500
> +++ b/source/common/TShortYUV.cpp       Tue Jul 16 17:40:05 2013 +0530
> @@ -30,6 +30,8 @@
>  #include "TShortYUV.h"
>  #include "TLibCommon/TComYuv.h"
>
> +using namespace x265;
> +
>  TShortYUV::TShortYUV()
>  {
>      YBuf = NULL;
> @@ -76,61 +78,37 @@
>      subtractChroma(pcYuvSrc0, pcYuvSrc1,  uiTrUnitIdx, uiPartSize >> 1);
>  }
>
> -void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1,
> unsigned int uiTrUnitIdx, unsigned int uiPartSize)
> +void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1,
> unsigned int trUnitIdx, unsigned int partSize)
>  {
> -    int x, y;
> +    int x = partSize, y = partSize;
>
> -    Pel* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);
> -    Pel* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);
> -    Short* pDst  = getLumaAddr(uiTrUnitIdx, uiPartSize);
> +    Pel* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);
> +    Pel* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);
> +    Short* dst  = getLumaAddr(trUnitIdx, partSize);
>
> -    int  iSrc0Stride = pcYuvSrc0->getStride();
> -    int  iSrc1Stride = pcYuvSrc1->getStride();
> -    int  iDstStride  = width;
> +    int  src0Stride = pcYuvSrc0->getStride();
> +    int  src1Stride = pcYuvSrc1->getStride();
> +    int  dstStride  = width;
>
> -    for (y = uiPartSize - 1; y >= 0; y--)
> -    {
> -        for (x = uiPartSize - 1; x >= 0; x--)
> -        {
> -            pDst[x] = static_cast<short>(pSrc0[x]) -
> static_cast<short>(pSrc1[x]);
> -        }
> -
> -        pSrc0 += iSrc0Stride;
> -        pSrc1 += iSrc1Stride;
> -        pDst  += iDstStride;
> -    }
> +    primitives.LumaSubstract_sp(x, y, dst, dstStride, src0, src1,
> src0Stride, src1Stride);
>  }
>
> -void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1,
> unsigned int uiTrUnitIdx, unsigned int uiPartSize)
> +void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1,
> unsigned int trUnitIdx, unsigned int partSize)
>  {
> -    int x, y;
> +    int x = partSize, y = partSize;
>
> -    Pel* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);
> -    Pel* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);
> -    Pel* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);
> -    Pel* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);
> -    Short* pDstU  = getCbAddr(uiTrUnitIdx, uiPartSize);
> -    Short* pDstV  = getCrAddr(uiTrUnitIdx, uiPartSize);
> +    Pel* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);
> +    Pel* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);
> +    Pel* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);
> +    Pel* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);
> +    Short* dstU  = getCbAddr(trUnitIdx, partSize);
> +    Short* dstV  = getCrAddr(trUnitIdx, partSize);
>
> -    int  iSrc0Stride = pcYuvSrc0->getCStride();
> -    int  iSrc1Stride = pcYuvSrc1->getCStride();
> -    int  iDstStride  = Cwidth;
> +    int  src0Stride = pcYuvSrc0->getCStride();
> +    int  src1Stride = pcYuvSrc1->getCStride();
> +    int  dstStride  = Cwidth;
>
> -    for (y = uiPartSize - 1; y >= 0; y--)
> -    {
> -        for (x = uiPartSize - 1; x >= 0; x--)
> -        {
> -            pDstU[x] = static_cast<short>(pSrcU0[x]) -
> static_cast<short>(pSrcU1[x]);
> -            pDstV[x] = static_cast<short>(pSrcV0[x]) -
> static_cast<short>(pSrcV1[x]);
> -        }
> -
> -        pSrcU0 += iSrc0Stride;
> -        pSrcU1 += iSrc1Stride;
> -        pSrcV0 += iSrc0Stride;
> -        pSrcV1 += iSrc1Stride;
> -        pDstU  += iDstStride;
> -        pDstV  += iDstStride;
> -    }
> +    primitives.ChromaSubstract_sp(x, y, dstU, dstStride, dstV, dstStride,
> srcU0, srcU1, src0Stride, src1Stride, srcV0, srcV1, src0Stride, src1Stride);
>  }
>

This part looks fine.

>  void TShortYUV::addClip(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1,
> unsigned int uiTrUnitIdx, unsigned int uiPartSize)
> diff -r c9bb72e8cb8e -r be5257d512be source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Mon Jul 15 23:41:11 2013 -0500
> +++ b/source/common/pixel.cpp   Tue Jul 16 17:40:05 2013 +0530
> @@ -30,7 +30,6 @@
>  #include "TLibCommon/CommonDef.h"
>  #include "TLibCommon/TComPrediction.h"
>
> -
>  #define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX,
> FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
>      p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4]   =
> (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4,  DATA_TYPE1, DATA_TYPE2>;  \
>      p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8]   =
> (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8,  DATA_TYPE1, DATA_TYPE2>;  \
> @@ -388,6 +387,41 @@
>      }
>  }
>

Why use a capital L?  This should just be lumasubtract_sp or, even better,
pixelsub_sp_c.

> +void Lumasubstract_s_p(int bx, int by, short *a, intptr_t dstride, pixel
> *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    for (int y = 0; y < by; y++)
> +    {
> +        for (int x = 0; x < bx; x++)
> +        {
> +            a[x] = (short)(b0[x] - b1[x]);
> +        }
> +
> +        b0 += sstride0;
> +        b1 += sstride1;
> +        a += dstride;
> +    }
> +}
>

There's no need for a separate primitive for chroma; just make two calls to
pixelsub_sp.  There's no efficiency gained by doing two at once.

> +void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu,
> short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0,
> intptr_t sstrideu1,
> +                         pixel *v0, pixel *v1, intptr_t sstridev0,
> intptr_t sstridev1)
> +{
> +    for (int y = 0; y < by; y++)
> +    {
> +        for (int x = 0; x < bx; x++)
> +        {
> +            dstu[x] = (short)(u0[x] - u1[x]);
> +            dstv[x] = (short)(v0[x] - v1[x]);
> +        }
> +
> +        u0 += sstrideu0;
> +        u1 += sstrideu1;
> +        v0 += sstridev0;
> +        v1 += sstridev1;
> +        dstu += dstsrideu;
> +        dstv += dstsridev;
> +    }
> +}
> +
>  void blockcopy_p_s(int bx, int by, pixel *a, intptr_t stridea, short *b,
> intptr_t strideb)
>  {
>      for (int y = 0; y < by; y++)
> @@ -504,14 +538,15 @@
>  void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride,
> int width, int height, int w0, int round, int shift, int offset, int
> bitDepth)
>  {
>      int x, y;
> +
>      for (y = height - 1; y >= 0; y--)
>      {
>          for (x = width - 1; x >= 0; )
>          {
>              // note: luma min width is 4
> -            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
> +            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
>              x--;
> -            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
> +            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 *
> (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
>              x--;
>          }
>

Unrelated changes; these should be in a separate patch.

>
> @@ -519,7 +554,6 @@
>          dst  += dstStride;
>      }
>  }
> -
>  }  // end anonymous namespace
>
>  namespace x265 {
> @@ -619,6 +653,8 @@
>      p.blockcpy_ps = blockcopy_p_s;
>      p.blockcpy_sp = blockcopy_s_p;
>      p.blockcpy_sc = blockcopy_s_c;
> +    p.LumaSubstract_sp = Lumasubstract_s_p;
> +    p.ChromaSubstract_sp = Chromasubstract_s_p;
>
>      p.cvt16to32     = convert16to32;
>      p.cvt16to32_shl = convert16to32_shl;
> diff -r c9bb72e8cb8e -r be5257d512be source/common/primitives.h
> --- a/source/common/primitives.h        Mon Jul 15 23:41:11 2013 -0500
> +++ b/source/common/primitives.h        Tue Jul 16 17:40:05 2013 +0530
> @@ -192,6 +192,9 @@
>  typedef void (*ipfilter_s2p_t)(int bitDepth, short *src, int srcStride,
> pixel *dst, int dstStride, int width, int height);
>  typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t
> dstride, pixel *src, intptr_t sstride); // dst is aligned
>  typedef void (*blockcpy_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src, intptr_t sstride); // dst is aligned
>

None of the other primitives have uppercase names.  Please follow
conventions.  pixelsub_sp_t would be more appropriate.

> +typedef void (*LumaSubstract_sp_t)(int bx, int by, short *dst, intptr_t
> dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> // dst is aligned
> +typedef void (*ChromaSubstract_sp_t)(int bx, int by, short *dstu,
> intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1,
> intptr_t sstrideu0, intptr_t sstrideu1,
> +                                     pixel *v0, pixel *v1, intptr_t
> sstridev0, intptr_t sstridev1);
>  typedef void (*blockcpy_ps_t)(int bx, int by, pixel *dst, intptr_t
> dstride, short *src, intptr_t sstride); // dst is aligned
>  typedef void (*blockcpy_sc_t)(int bx, int by, short *dst, intptr_t
> dstride, uint8_t *src, intptr_t sstride); // dst is aligned
>  typedef void (*intra_dc_t)(pixel* src, intptr_t srcStride, pixel* dst,
> intptr_t dstStride, int width, int bFilter);
> @@ -236,6 +239,10 @@
>      blockcpy_ps_t   blockcpy_ps;                // block copy pixel from
> short
>      blockcpy_sp_t   blockcpy_sp;                // block copy short from
> pixel
>      blockcpy_sc_t   blockcpy_sc;                // block copy short from
> unsigned char
> +
> +    LumaSubstract_sp_t LumaSubstract_sp;
> +    ChromaSubstract_sp_t ChromaSubstract_sp;
> +
>      cvt16to32_t     cvt16to32;
>      cvt16to32_shl_t cvt16to32_shl;
>      cvt16to16_shl_t cvt16to16_shl;
> diff -r c9bb72e8cb8e -r be5257d512be source/common/vec/blockcopy.inc
> --- a/source/common/vec/blockcopy.inc   Mon Jul 15 23:41:11 2013 -0500
> +++ b/source/common/vec/blockcopy.inc   Tue Jul 16 17:40:05 2013 +0530
> @@ -79,7 +79,7 @@
>          }
>      }
>      else
>

More unrelated changes; these should be in a different patch.

> -#endif
> +#endif /* if INSTRSET >= 8 */
>      if (!(aligncheck & 15))
>      {
>          // fast path, multiples of 16 pixel wide blocks
> @@ -131,7 +131,7 @@
>          }
>      }
>      else
> -#endif
> +#endif /* if INSTRSET >= 8 && 0 */
>      if (!(aligncheck & 15))
>      {
>          // fast path, multiples of 16 pixel wide blocks
> @@ -170,6 +170,7 @@
>  void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t
> *src, intptr_t sstride)
>  {
>      size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride |
> dstride;
> +
>  #if INSTRSET >= 8 && 0
>      if (!(aligncheck & 31))
>      {
> @@ -189,7 +190,7 @@
>          }
>      }
>      else
> -#endif
> +#endif /* if INSTRSET >= 8 && 0 */
>      if (!(aligncheck & 15))
>      {
>          // fast path, multiples of 16 pixel wide blocks
> @@ -223,6 +224,173 @@
>      }
>  }
>

Use the pixelsub_sp name here as well.

> +void Lumasubstract_s_p(int bx, int by, short *dst, intptr_t dstride,
> uint8_t *src0, uint8_t *src1, intptr_t sstride0, intptr_t sstride1)
> +{
> +    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 |
> dstride;
> +
> +#if INSTRSET >= 8 && 0
> +    if (!(aligncheck & 31))
> +    {
> +        // fast path, multiples of 32 pixel wide blocks
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 32)
> +            {
> +                Vec32uc word0, word1;
> +                Vec16s  word3, word4;
> +                word0.load_a(src0 + x);
> +                word1.load_a(src1 + x);
> +                word3 = extend_low(word0) - extend_low(word1);
> +                word4 = extend_high(word0) - extend_high(word1);
> +                word3.store_a(dst + x);
> +                word4.store_a(dst + x + 16);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +#endif /* if INSTRSET >= 8 && 0 */
> +    if (!(aligncheck & 15))
> +    {
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 16)
> +            {
> +                Vec16uc word0, word1;
> +                Vec8s word3, word4;
> +                word0.load_a(src0 + x);
> +                word1.load_a(src1 + x);
> +                word3 = extend_low(word0) - extend_low(word1);
> +                word4 = extend_high(word0) - extend_high(word1);
> +                word3.store_a(dst + x);
> +                word4.store_a(dst + x + 8);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +    else
> +    {
>

The slow path should still be vectorized if bx is large enough.  It just
needs to use unaligned loads and stores.  Perhaps add another
"else if (bx >= 16) { vectorized and unaligned }" clause.

> +        // slow path, irregular memory alignments or sizes
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x++)
> +            {
> +                dst[x] = (short)(src0[x] - src1[x]);
> +            }
> +
> +            src0 += sstride0;
> +            src1 += sstride1;
> +            dst += dstride;
> +        }
> +    }
> +}
>

Also, drop the chroma function here.

> +void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu,
> short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0,
> intptr_t sstrideu1,
> +                         pixel *v0, pixel *v1, intptr_t sstridev0,
> intptr_t sstridev1)
> +{
> +    size_t aligncheck = (size_t)dstu | (size_t)u0 | bx | sstrideu1 |
> dstsrideu;
> +
> +#if INSTRSET >= 8 && 0
> +    if (!(aligncheck & 31))
> +    {
> +        // fast path, multiples of 32 pixel wide blocks
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 32)
> +            {
> +                Vec32uc uword0, uword1;
> +                Vec16s  uword3, uword4;
> +                uword0.load_a(u0 + x);
> +                uword1.load_a(u1 + x);
> +                uword3 = extend_low(uword0) - extend_low(uword1);
> +                uword4 = extend_high(uword0) - extend_high(uword1);
> +                uword3.store_a(dstu + x);
> +                uword4.store_a(dstu + x + 16);
> +
> +                Vec32uc vword0, vword1;
> +                Vec16s  vword3, vword4;
> +                vword0.load_a(v0 + x);
> +                vword1.load_a(v1 + x);
> +                vword3 = extend_low(vword0) - extend_low(vword1);
> +                vword4 = extend_high(vword0) - extend_high(vword1);
> +                vword3.store_a(dstv + x);
> +                vword4.store_a(dstv + x + 16);
> +            }
> +
> +            u0 += sstrideu0;
> +            u1 += sstrideu1;
> +            v0 += sstridev0;
> +            v1 += sstridev1;
> +            dstu += dstsrideu;
> +            dstv += dstsridev;
> +        }
> +    }
> +    else
> +#endif /* if INSTRSET >= 8 && 0 */
> +    if (!(aligncheck & 15))
> +    {
> +        // fast path, multiples of 16 pixel wide blocks
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x += 16)
> +            {
> +                Vec16uc uword0, uword1;
> +                Vec8s uword3, uword4;
> +                uword0.load_a(u0 + x);
> +                uword1.load_a(u1 + x);
> +                uword3 = extend_low(uword0) - extend_low(uword1);
> +                uword4 = extend_high(uword0) - extend_high(uword1);
> +                uword3.store_a(dstu + x);
> +                uword4.store_a(dstu + x + 8);
> +
> +                Vec16uc vword0, vword1;
> +                Vec8s vword3, vword4;
> +                vword0.load_a(v0 + x);
> +                vword1.load_a(v1 + x);
> +                vword3 = extend_low(vword0) - extend_low(vword1);
> +                vword4 = extend_high(vword0) - extend_high(vword1);
> +                vword3.store_a(dstv + x);
> +                vword4.store_a(dstv + x + 8);
> +            }
> +
> +            u0 += sstrideu0;
> +            u1 += sstrideu1;
> +            v0 += sstridev0;
> +            v1 += sstridev1;
> +            dstu += dstsrideu;
> +            dstv += dstsridev;
> +        }
> +    }
> +    else
> +    {
> +        // slow path, irregular memory alignments or sizes
> +        for (int y = 0; y < by; y++)
> +        {
> +            for (int x = 0; x < bx; x++)
> +            {
> +                dstu[x] = (short)(u0[x] - u1[x]);
> +                dstv[x] = (short)(v0[x] - v1[x]);
> +            }
> +
> +            u0 += sstrideu0;
> +            u1 += sstrideu1;
> +            v0 += sstridev0;
> +            v1 += sstridev1;
> +            dstu += dstsrideu;
> +            dstv += dstsridev;
> +        }
> +    }
> +}
> +
>  void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
>  {
>  #if HIGH_BIT_DEPTH
> @@ -231,10 +399,13 @@
>      p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;
>      p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;
>      p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;
> +    p.blockcpyyuv_sp = (x265::blockcpy_sc_t)blockcopyYuv_s_p;
>  #else
>      p.blockcpy_pp = blockcopy_p_p;
>      p.blockcpy_ps = blockcopy_p_s;
>      p.blockcpy_sp = blockcopy_s_p;
>      p.blockcpy_sc = blockcopy_s_p;
> -#endif
> +    p.LumaSubstract_sp = Lumasubstract_s_p;
> +    p.ChromaSubstract_sp = Chromasubstract_s_p;
> +#endif /* if HIGH_BIT_DEPTH */
>  }
> diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Mon Jul 15 23:41:11 2013 -0500
> +++ b/source/test/pixelharness.cpp      Tue Jul 16 17:40:05 2013 +0530
> @@ -56,10 +56,13 @@
>      pbuf1 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 *
> 32, 32);
>      pbuf2 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 *
> 32, 32);
>

pbuf3 and pbuf4 are unnecessary once you drop the chroma function

> +    pbuf3 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 *
> 32, 32);
> +    pbuf4 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 *
> 32, 32);
> +
>      sbuf1 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 *
> 32, 32);
>      sbuf2 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 *
> 32, 32);
>
> -    if (!pbuf1 || !pbuf2)
> +    if (!pbuf1 || !pbuf2 | !pbuf3 | !pbuf4)
>      {
>          fprintf(stderr, "malloc failed, unable to initiate tests!\n");
>          exit(1);
> @@ -71,6 +74,9 @@
>          pbuf1[i] = rand() & PIXEL_MAX;
>          pbuf2[i] = rand() & PIXEL_MAX;
>
> +        pbuf3[i] = rand() & PIXEL_MAX;
> +        pbuf4[i] = rand() & PIXEL_MAX;
> +
>          sbuf1[i] = rand() & PIXEL_MAX;
>          sbuf2[i] = rand() & PIXEL_MAX;
>      }
> @@ -222,6 +228,59 @@
>      return true;
>  }
>
> +bool PixelHarness::check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref,
> x265::LumaSubstract_sp_t opt)
> +{
> +    ALIGN_VAR_16(short, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(short, opt_dest[64 * 64]);
> +    int bx = 64;
> +    int by = 64;
> +    int j = 0;
> +    for (int i = 0; i <= 100; i++)
> +    {
> +        opt(bx, by, opt_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
> +        ref(bx, by, ref_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
> +            return false;
> +
> +        j += 4;
> +        bx = 4 * ((rand() & 15) + 1);
> +        by = 4 * ((rand() & 15) + 1);
> +    }
> +
> +    return true;
> +}
> +
> +bool PixelHarness::check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t
> ref, x265::ChromaSubstract_sp_t opt)
> +{
> +    ALIGN_VAR_16(short, ref_destu[64 * 64]);
> +    ALIGN_VAR_16(short, opt_destu[64 * 64]);
> +
> +    ALIGN_VAR_16(short, ref_destv[64 * 64]);
> +    ALIGN_VAR_16(short, opt_destv[64 * 64]);
> +
> +    int bx = 64;
> +    int by = 64;
> +    int j = 0;
> +    for (int i = 0; i <= 100; i++)
> +    {
> +        opt(bx, by, opt_destu, 64, opt_destv, 64, pbuf2 + j, pbuf1 + j,
> 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);
> +        ref(bx, by, ref_destu, 64, ref_destv, 64, pbuf2 + j, pbuf1 + j,
> 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);
> +
> +        if (memcmp(ref_destu, opt_destu, 64 * 64 * sizeof(short)))
> +            return false;
> +
> +        if (memcmp(ref_destv, opt_destv, 64 * 64 * sizeof(short)))
> +            return false;
> +
> +        j += 4;
> +        bx = 4 * ((rand() & 15) + 1);
> +        by = 4 * ((rand() & 15) + 1);
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::check_block_copy_s_c(x265::blockcpy_sc_t ref,
> x265::blockcpy_sc_t opt)
>  {
>      ALIGN_VAR_16(short, ref_dest[64 * 64]);
> @@ -341,14 +400,15 @@
>      int offset = (rand() % 256) - 128;
>      for (int i = 0; i <= 100; i++)
>      {
> -        opt(sbuf1+j, opt_dest, 64, 64, width, height, w0, round, shift,
> offset, BIT_DEPTH);
> -        ref(sbuf1+j, ref_dest, 64, 64, width, height, w0, round, shift,
> offset, BIT_DEPTH);
> +        opt(sbuf1 + j, opt_dest, 64, 64, width, height, w0, round, shift,
> offset, BIT_DEPTH);
> +        ref(sbuf1 + j, ref_dest, 64, 64, width, height, w0, round, shift,
> offset, BIT_DEPTH);
>

unrelated changes

>
>          if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
>              return false;
>
>          j += 4;
>      }
> +
>      return true;
>  }
>
> @@ -484,6 +544,24 @@
>          }
>      }
>
> +    if (opt.LumaSubstract_sp)
> +    {
> +        if (!check_LumaSubstract_s_p(ref.LumaSubstract_sp,
> opt.LumaSubstract_sp))
> +        {
> +            printf("Luma Substract failed!\n");
> +            return false;
> +        }
> +    }
> +
> +    if (opt.ChromaSubstract_sp)
> +    {
> +        if (!check_ChromaSubstract_s_p(ref.ChromaSubstract_sp,
> opt.ChromaSubstract_sp))
> +        {
> +            printf("Chroma Substract failed!\n");
> +            return false;
> +        }
> +    }
> +
>      if (opt.blockcpy_sc)
>      {
>          if (!check_block_copy_s_c(ref.blockcpy_sc, opt.blockcpy_sc))
> @@ -492,7 +570,7 @@
>              return false;
>          }
>      }
>

unrelated changes

> -
> +
>      if (opt.weightpUni)
>      {
>          if (!check_weightpUni(ref.weightpUni, opt.weightpUni))
> @@ -502,7 +580,6 @@
>          }
>      }
>
> -
>      return true;
>  }
>
> @@ -600,6 +677,18 @@
>          REPORT_SPEEDUP(opt.blockcpy_sp, ref.blockcpy_sp, 64, 64,
> (short*)pbuf1, FENC_STRIDE, pbuf2, STRIDE);
>      }
>
> +    if (opt.LumaSubstract_sp)
> +    {
> +        printf("Luma Sub");
> +        REPORT_SPEEDUP(opt.LumaSubstract_sp, ref.LumaSubstract_sp, 64,
> 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
> +    }
> +
> +    if (opt.ChromaSubstract_sp)
> +    {
> +        printf("Chroma Sub");
> +        REPORT_SPEEDUP(opt.ChromaSubstract_sp, ref.ChromaSubstract_sp,
> 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, FENC_STRIDE,  pbuf3,
> pbuf3, STRIDE, STRIDE, pbuf4, pbuf4, STRIDE, STRIDE);
> +    }
> +
>      if (opt.blockcpy_sc)
>      {
>          printf("s_c   cpy");
> @@ -609,6 +698,6 @@
>      if (opt.weightpUni)
>      {
>

unrelated changes

>          printf("WeightpUni");
> -        REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64,
> 64, 32, 32, 128, 1<<9, 10, 100, BIT_DEPTH);
> +        REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64,
> 64, 32, 32, 128, 1 << 9, 10, 100, BIT_DEPTH);
>      }
>  }
> diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Mon Jul 15 23:41:11 2013 -0500
> +++ b/source/test/pixelharness.h        Tue Jul 16 17:40:05 2013 +0530
> @@ -31,7 +31,7 @@
>  {
>  protected:
>
> -    pixel *pbuf1, *pbuf2;
> +    pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
>
>      short *sbuf1, *sbuf2;
>
> @@ -42,6 +42,8 @@
>      bool check_pixelcmp_x4(x265::pixelcmp_x4_t ref, x265::pixelcmp_x4_t
> opt);
>      bool check_block_copy(x265::blockcpy_pp_t ref, x265::blockcpy_pp_t
> opt);
>      bool check_block_copy_s_p(x265::blockcpy_sp_t ref,
> x265::blockcpy_sp_t opt);
>

Tabs?  Really?  Please indent with spaces like the surrounding declarations.

> +       bool check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref,
> x265::LumaSubstract_sp_t opt);
> +       bool check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref,
> x265::ChromaSubstract_sp_t opt);
>      bool check_block_copy_p_s(x265::blockcpy_ps_t ref,
> x265::blockcpy_ps_t opt);
>      bool check_block_copy_s_c(x265::blockcpy_sc_t ref,
> x265::blockcpy_sc_t opt);
>      bool check_calresidual(x265::calcresidual_t ref, x265::calcresidual_t
> opt);
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130716/c6b743a6/attachment-0001.html>