[x265] [PATCH] pixel8.inc: replace weightUnidirPixel vector class function with intrinsic

Steve Borho steve at borho.org
Tue Oct 8 19:39:17 CEST 2013


On Tue, Oct 8, 2013 at 8:26 AM, <dnyaneshwar at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> # Date 1381238689 -19800
> #      Tue Oct 08 18:54:49 2013 +0530
> # Node ID 70927cb4bb4cc12d2dbb4a65590a92dc77b2b545
> # Parent  41e5e72e2a4688642f7a46041c50fcc30972c4ab
> pixel8.inc: replace weightUnidirPixel vector class function with intrinsic.
>
> diff -r 41e5e72e2a46 -r 70927cb4bb4c source/common/vec/pixel8.inc
> --- a/source/common/vec/pixel8.inc      Mon Oct 07 16:51:18 2013 -0500
> +++ b/source/common/vec/pixel8.inc      Tue Oct 08 18:54:49 2013 +0530
> @@ -240,31 +240,52 @@
>      }
>  }
>
> -void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
> +void weightUnidirPixel(pixel *arg_src, pixel *arg_dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int arg_round, int shift, int offset)
>  {
>      int x, y;
> -    Vec16uc tmp;
> +    __m128i temp;
> +    __m128i vw0    = _mm_set1_epi32(w0);                // broadcast (32-bit integer) w0 to all elements of vw0
> +    __m128i iofs   = _mm_set1_epi32(IF_INTERNAL_OFFS);
> +    __m128i ofs    = _mm_set1_epi32(offset);
> +    __m128i round  = _mm_set1_epi32(arg_round);
> +    __m128i src, dst;
>
> -    Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), vround(round), vdst;
>      for (y = height - 1; y >= 0; y--)
>      {
>          for (x = 0; x <= width - 4; x += 4)
>          {
> -            tmp = load_partial(const_int(4), src + x);
>              // The intermediate results would outgrow 16 bits because internal offset is too high
> -            vsrc = extend_low(extend_low(tmp));
> -            vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
> -            store_partial(const_int(4), dst + x, compress_unsafe(compress_saturated(vdst, vdst), 0));
> +            temp = _mm_cvtsi32_si128(*(uint32_t*) (arg_src + x));
> +            // extend the low 4 elements to 32 bits with zero extension
> +            src = _mm_unpacklo_epi16(_mm_unpacklo_epi16(temp, _mm_setzero_si128()), _mm_setzero_si128());
> +            dst = _mm_add_epi32((_mm_mul_epi32(vw0, _mm_add_epi32(src, iofs))), round);
> +            dst =  _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift));
> +            dst = _mm_add_epi32(dst, ofs);
> +            __m128i tmp = _mm_shuffle_epi32(dst, 2);
> +            dst = _mm_add_epi64(dst, tmp);
> +            *(uint32_t*)(arg_dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst, dst), _mm_setzero_si128()));
>          }
> -
>          if (width > x)
>          {
> -            tmp  = load_partial(const_int(4), src + x);
> -            vsrc = extend_low(extend_low(tmp));
> -            vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
> -            compress_unsafe(compress_saturated(vdst, vdst), 0).store_partial(2, dst + x);
> +            temp = _mm_cvtsi32_si128(*(uint32_t*)(arg_src + x));
> +            src = _mm_unpacklo_epi16(_mm_unpacklo_epi16(temp, _mm_setzero_si128()), _mm_setzero_si128());
> +            dst = _mm_add_epi32((_mm_mul_epi32(vw0, _mm_add_epi32(src, iofs))), round);
> +            dst = _mm_add_epi32(dst, ofs);
> +            __m128i tmp = _mm_shuffle_epi32(dst, 2);
> +            dst = _mm_add_epi64(dst, tmp);
> +            dst =  _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift));
> +            temp = _mm_packus_epi16(_mm_packs_epi32(dst,dst), _mm_setzero_si128());
> +
> +            union
> +            {
> +                int8_t  c[16];
> +                int16_t s[8];
> +            } u;
> +
> +            _mm_storeu_si128((__m128i*)u.c, temp);
> +            ((int16_t*)(arg_dst + x))[0] = u.s[0];
>          }
> -        src += srcStride;
> -        dst += dstStride;
> +        arg_src += srcStride;
> +        arg_dst += dstStride;
>      }
>  }


This primitive fails unit tests on about every fifth run:

Using random seed 525442CC 8bpp
Testing intrinsic primitives: SSE2 (2)
Testing assembly primitives: SSE2 (2)
Testing intrinsic primitives: SSE3 (3)
Testing assembly primitives: SSE3 (3)
Testing intrinsic primitives: SSSE3 (4)
Testing assembly primitives: SSSE3 (4)
Testing intrinsic primitives: SSE4.1 (5)
Weighted Prediction for Unidir (Pixel) failed!

x265: intrinsic primitive has failed. Go and fix that Right Now!


If you hard-code the random seed above, you should be able to reproduce
this every time.

-- 
Steve Borho