[x265] [PATCH] ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class function with intrinsic
Steve Borho
steve at borho.org
Thu Oct 10 09:50:56 CEST 2013
On Thu, Oct 10, 2013 at 2:33 AM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> # Date 1381390085 -19800
> # Thu Oct 10 12:58:05 2013 +0530
> # Node ID ad1822b8e451ec9de4a8d679e9dee5d0c2b8fa8d
> # Parent 49230a47306bd4a4b7c696a7e723d664755a92d7
> ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class function
> with intrinsic.
>
Will push in a moment; beware that I've just re-ordered this file so that the
last remaining vector class method sits together with the vector class include.
>
> diff -r 49230a47306b -r ad1822b8e451 source/common/vec/ipfilter-ssse3.cpp
> --- a/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 11:59:16 2013
> +0530
> +++ b/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 12:58:05 2013
> +0530
> @@ -199,50 +199,102 @@
> }
> }
>
> -void filterConvertShortToPel(short *src, intptr_t srcStride, pixel *dst,
> intptr_t dstStride, int width, int height)
> +void filterConvertShortToPel(short *source, intptr_t sourceStride, pixel
> *dest, intptr_t destStride, int width, int height)
> {
> - short* srcOrg = src;
> - pixel* dstOrg = dst;
> + short* src = source;
> + pixel* dst = dest;
> int shift = IF_INTERNAL_PREC - X265_DEPTH;
> short offset = IF_INTERNAL_OFFS;
> + offset += shift ? (1 << (shift - 1)) : 0;
> + short maxval = (1 << X265_DEPTH) - 1;
> + int row, col;
>
> - offset += shift ? (1 << (shift - 1)) : 0;
> - short maxVal = (1 << X265_DEPTH) - 1;
> - Vec8s minVal(0);
> - int row, col;
> - Vec8s src_c, val_c, val_zero(0);
> - Vec16uc val_uc;
> + __m128i minval = _mm_setzero_si128();
> + __m128i zeroval = _mm_setzero_si128();
> + __m128i val1, val2, val3;
> +
> for (row = 0; row < height; row++)
> {
> for (col = 0; col < width - 7; col += 8)
> {
> - src_c.load(src + col);
> - val_c = add_saturated(src_c, offset) >> shift;
> - val_c = max(val_c, minVal);
> - val_c = min(val_c, maxVal);
> - val_uc = compress(val_c, val_zero);
> - val_uc.store_partial(8, dst + col);
> + val1 = _mm_loadu_si128((__m128i const*)(source + col));
> + val2 = _mm_sra_epi16(_mm_adds_epi16(val1,
> _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift));
> + val2 = _mm_max_epi16(val2, minval);
> + val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval));
> +
> + __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask
> for low bytes
> + __m128i lowm = _mm_and_si128(val2, mask); //
> bytes of low
> + __m128i highm = _mm_and_si128(zeroval, mask); //
> bytes of high
> + val3 = _mm_packus_epi16(lowm, highm); //
> unsigned pack
> +
> + union
> + {
> + int8_t c[16];
> + int64_t q[2];
> + } u;
> + _mm_storeu_si128((__m128i*)u.c, val3);
> + *(int64_t*)(dest + col) = u.q[0];
> }
> -
> - src += srcStride;
> - dst += dstStride;
> + source += sourceStride;
> + dest += destStride;
> }
>
> if (width % 8 != 0)
> {
> - src = srcOrg;
> - dst = dstOrg;
> + source = src;
> + dest = dst;
> col = width - (width % 8);
> for (row = 0; row < height; row++)
> {
> - src_c.load(src + col);
> - val_c = add_saturated(src_c, offset) >> shift;
> - val_c = max(val_c, minVal);
> - val_c = min(val_c, maxVal);
> - val_uc = compress(val_c, val_zero);
> - val_uc.store_partial(width - col, dst + col);
> - src += srcStride;
> - dst += dstStride;
> + val1 = _mm_loadu_si128((__m128i const*)(source + col));
> + val2 = _mm_sra_epi16(_mm_adds_epi16(val1,
> _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift));
> + val2 = _mm_max_epi16(val2, minval);
> + val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval));
> +
> + __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask
> for low bytes
> + __m128i lowm = _mm_and_si128(val2, mask); //
> bytes of low
> + __m128i highm = _mm_and_si128(zeroval, mask); //
> bytes of high
> + val3 = _mm_packus_epi16(lowm, highm); //
> unsigned pack
> +
> + int n = width - col;
> + if (n >= 16)
> + {
> + _mm_storeu_si128((__m128i*)(dest + col), val3);
> + }
> + else if (n <= 0) ; // do nothing if the value of n is less
> than or equal to 0
> + else
> + {
> + union
> + {
> + int8_t c[16];
> + int16_t s[8];
> + int32_t i[4];
> + int64_t q[2];
> + } u;
> + _mm_storeu_si128((__m128i*)u.c, val3);
> + int j = 0;
> + if (n & 8) // n == (8,9,10,11,12,13,14,15)
> + {
> + *(int64_t*)(dest + col) = u.q[0];
> + j += 8;
> + }
> + if (n & 4) // n == (4,5,6,7,12,13,14,15)
> + {
> + ((int32_t*)(dest + col))[j/4] = u.i[j/4];
> + j += 4;
> + }
> + if (n & 2) // n == (2,3,6,7,10,11,14,15)
> + {
> + ((int16_t*)(dest + col))[j/2] = u.s[j/2];
> + j += 2;
> + }
> + if (n & 1) // n == (1,3,5,7,9,11,13,15)
> + {
> + ((int8_t*)(dest + col))[j] = u.c[j];
> + }
> + }
> + source += sourceStride;
> + dest += destStride;
> }
> }
> }
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131010/1b02476e/attachment.html>
More information about the x265-devel
mailing list