[x265] [PATCH Review Only] added cvt32to16_shr function to testbench

Steve Borho steve at borho.org
Thu Oct 17 20:46:58 CEST 2013


On Thu, Oct 17, 2013 at 10:32 AM, <dnyaneshwar at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> # Date 1382023822 -19800
> #      Thu Oct 17 21:00:22 2013 +0530
> # Node ID 4dbd17ef69db91b5604f9c5cc6a4a62f15b91ab0
> # Parent  f6d04c660b9bb1b0cf6274faf514be77148aa312
> added cvt32to16_shr function to testbench.
>
> diff -r f6d04c660b9b -r 4dbd17ef69db source/Lib/TLibCommon/TComTrQuant.cpp
> --- a/source/Lib/TLibCommon/TComTrQuant.cpp     Thu Oct 17 20:34:48 2013
> +0530
> +++ b/source/Lib/TLibCommon/TComTrQuant.cpp     Thu Oct 17 21:00:22 2013
> +0530
> @@ -493,7 +493,7 @@
>          transformSkipShift = shift;
>          for (j = 0; j < height; j++)
>          {
> -            primitives.cvt32to16_shr(&residual[j * stride], &coef[j *
> width], shift, width);
> +            primitives.cvt32to16_shr(residual, coef, stride, shift,
> width);
>

you should explain in the commit message why a stride argument is now
required


>          }
>      }
>      else
> diff -r f6d04c660b9b -r 4dbd17ef69db source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Thu Oct 17 20:34:48 2013 +0530
> +++ b/source/common/pixel.cpp   Thu Oct 17 21:00:22 2013 +0530
> @@ -439,13 +439,18 @@
>      }
>  }
>
> -void convert32to16_shr(short *dst, int *src, int shift, int num)
> +void convert32to16_shr(short *dst, int *src, intptr_t stride, int shift,
> int size)
>  {
>      int round = 1 << (shift - 1);
>
> -    for (int i = 0; i < num; i++)
> +    for (int i = 0; i < size; i++)
>      {
> -        dst[i] = (short)((src[i] + round) >> shift);
> +        for (int j = 0; j < size; j++)
> +        {
> +            dst[j] = (short)((src[j] + round) >> shift);
> +        }
> +        src += size;
> +        dst += stride;
>      }
>  }
>
> diff -r f6d04c660b9b -r 4dbd17ef69db source/common/primitives.h
> --- a/source/common/primitives.h        Thu Oct 17 20:34:48 2013 +0530
> +++ b/source/common/primitives.h        Thu Oct 17 21:00:22 2013 +0530
> @@ -179,7 +179,7 @@
>
>  typedef void (*cvt16to32_shl_t)(int *dst, short *src, intptr_t, int, int);
>  typedef void (*cvt16to16_shl_t)(short *dst, short *src, int, int,
> intptr_t, int);
> -typedef void (*cvt32to16_shr_t)(short *dst, int *src, int, int);
> +typedef void (*cvt32to16_shr_t)(short *dst, int *src, intptr_t, int, int);
>
>  typedef void (*dct_t)(short *src, int *dst, intptr_t stride);
>  typedef void (*idct_t)(int *src, short *dst, intptr_t stride);
> diff -r f6d04c660b9b -r 4dbd17ef69db source/common/vec/pixel-sse3.cpp
> --- a/source/common/vec/pixel-sse3.cpp  Thu Oct 17 20:34:48 2013 +0530
> +++ b/source/common/vec/pixel-sse3.cpp  Thu Oct 17 21:00:22 2013 +0530
> @@ -31,23 +31,25 @@
>  using namespace x265;
>
>  namespace {
> -void convert32to16_shr(short *dst, int *org, int shift, int num)
> +void convert32to16_shr(short *dst, int *org, intptr_t stride, int shift,
> int size)
>  {
> -    int i;
> +    int i, j;
>      __m128i round = _mm_set1_epi32(1 << (shift - 1));
>
> -    for (i = 0; i < num; i += 4)
> +    for (i = 0; i < size; i++)
>      {
> -        __m128i im32;
> -        __m128i im16;
> -
> -        im32 = _mm_loadu_si128((__m128i const*)org);
> -        im32 = _mm_sra_epi32(_mm_add_epi32(im32, round),
> _mm_cvtsi32_si128(shift));
> -        im16 = _mm_packs_epi32(im32, im32);
> -        _mm_storeu_si128((__m128i*)dst, im16);
> -
> -        org += 4;
> -        dst += 4;
> +        for (j = 0; j < size; j += 4)
> +        {
> +            __m128i im32;
> +            __m128i im16;
> +
> +            im32 = _mm_loadu_si128((__m128i const*)(org + j));
> +            im32 = _mm_sra_epi32(_mm_add_epi32(im32, round),
> _mm_cvtsi32_si128(shift));
> +            im16 = _mm_packs_epi32(im32, im32);
> +            _mm_storel_epi64((__m128i*)(dst + j), im16);
> +        }
> +        org += size;
> +        dst += stride;
>      }
>  }
>
> @@ -639,6 +641,7 @@
>      //p.cvt32to16_shr = convert32to16_shr;
>      p.cvt16to32_shl = convert16to32_shl;
>      p.cvt16to16_shl = convert16to16_shl;
> +    p.cvt32to16_shr = convert32to16_shr;
>
>  #if !HIGH_BIT_DEPTH
>      p.transpose[0] = transpose4;
> diff -r f6d04c660b9b -r 4dbd17ef69db source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Thu Oct 17 20:34:48 2013 +0530
> +++ b/source/test/pixelharness.cpp      Thu Oct 17 21:00:22 2013 +0530
> @@ -494,6 +494,39 @@
>      return true;
>  }
>
> +bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref,
> cvt32to16_shr_t opt)
> +{
> +    int bufsize = STRIDE * STRIDE;
> +    int* src = (int*)X265_MALLOC(int, bufsize);
>

Does this really need a new buffer?  Can you just use the test buffers
already allocated?  If not, then all this needs to be in the pixel harness
constructor


> +
> +    int  shift =  (rand() % 7 + 1);
> +
> +    if (!src)
> +    {
> +        fprintf(stderr, "malloc failed, unable to initiate tests!\n");
> +        exit(1);
> +    }
> +
> +    for (int i = 0; i < bufsize; i++)
> +    {
> +        src[i] = (rand() & (2 * SHORT_MAX + 1)) - SHORT_MAX - 1;
> +    }
> +
> +    ALIGN_VAR_16(short, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(short, opt_dest[64 * 64]);
> +
> +    memset(ref_dest, 0, 64 * 64 * sizeof(short));
> +    memset(opt_dest, 0, 64 * 64 * sizeof(short));
> +
> +    opt(opt_dest, src, STRIDE, shift, STRIDE);
> +    ref(ref_dest, src, STRIDE, shift, STRIDE);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
> +            return false;
>

white-space


> +
> +        return true;
> +}
> +
>  bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
>  {
>      ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -665,7 +698,14 @@
>              }
>          }
>      }
> -
> +    if(opt.cvt32to16_shr)
>

white-space


> +    {
> +        if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))
> +        {
> +            printf("cvt32to16 failed!\n");
> +            return false;
> +        }
> +    }
>

You also need to add this primitive to the function which compares
performance between optimized and C ref


>      if (opt.blockcpy_pp)
>      {
>          if (!check_block_copy(ref.blockcpy_pp, opt.blockcpy_pp))
> diff -r f6d04c660b9b -r 4dbd17ef69db source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Thu Oct 17 20:34:48 2013 +0530
> +++ b/source/test/pixelharness.h        Thu Oct 17 21:00:22 2013 +0530
> @@ -53,6 +53,7 @@
>      bool check_pixeladd_pp(pixeladd_pp_t ref, pixeladd_pp_t opt);
>      bool check_downscale_t(downscale_t ref, downscale_t opt);
>      bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
> +    bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
>
>  public:
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131017/19e026c3/attachment-0001.html>


More information about the x265-devel mailing list