[x265] [PATCH] pixel8.inc: Replace weightUnidirPixel vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Oct 9 10:46:09 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381308260 -19800
# Wed Oct 09 14:14:20 2013 +0530
# Node ID 9250f0f7e2df00ef582d198b17a835583e2452e2
# Parent bf91b2c911cb7fd258ee2a67f8dca1955d6ba562
pixel8.inc: Replace weightUnidirPixel vector class function with intrinsic.
diff -r bf91b2c911cb -r 9250f0f7e2df source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Wed Oct 09 12:31:10 2013 +0530
+++ b/source/common/vec/pixel8.inc Wed Oct 09 14:14:20 2013 +0530
@@ -55,31 +55,48 @@
}
}
-void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
-{
- int x, y;
- Vec16uc tmp;
+void weightUnidirPixel(pixel *source, pixel *dest, intptr_t sourceStride, intptr_t destStride, int width, int height, int w0, int arg_round, int shift, int offset)
+ {
+ int x, y;
+ __m128i temp;
+ __m128i vw0 = _mm_set1_epi32(w0); // broadcast (32-bit integer) w0 to all elements of vw0
+ __m128i iofs = _mm_set1_epi32(IF_INTERNAL_OFFS);
+ __m128i ofs = _mm_set1_epi32(offset);
+ __m128i round = _mm_set1_epi32(arg_round);
+ __m128i src, dst;
- Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), vround(round), vdst;
- for (y = height - 1; y >= 0; y--)
- {
- for (x = 0; x <= width - 4; x += 4)
- {
- tmp = load_partial(const_int(4), src + x);
- // The intermediate results would outgrow 16 bits because internal offset is too high
- vsrc = extend_low(extend_low(tmp));
- vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
- store_partial(const_int(4), dst + x, compress_unsafe(compress_saturated(vdst, vdst), 0));
- }
+ for (y = height - 1; y >= 0; y--)
+ {
+ for (x = 0; x <= width - 4; x += 4)
+ {
+ // The intermediate results would outgrow 16 bits because internal offset is too high
+ temp = _mm_cvtsi32_si128(*(uint32_t*)(source + x));
+ src = _mm_unpacklo_epi16(_mm_unpacklo_epi8(temp,_mm_setzero_si128()), _mm_setzero_si128());
+ dst = _mm_add_epi32((_mm_mullo_epi32(vw0, _mm_add_epi32(src, iofs))), round);
+ dst = _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift));
+ dst = _mm_add_epi32(dst, ofs);
+ *(uint32_t*)(dest + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst, dst), _mm_setzero_si128()));
+ }
- if (width > x)
- {
- tmp = load_partial(const_int(4), src + x);
- vsrc = extend_low(extend_low(tmp));
- vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
- compress_unsafe(compress_saturated(vdst, vdst), 0).store_partial(2, dst + x);
- }
- src += srcStride;
- dst += dstStride;
- }
-}
+ if (width > x)
+ {
+ temp = _mm_cvtsi32_si128(*(uint32_t*)(source + x));
+ src = _mm_unpacklo_epi16(_mm_unpacklo_epi8(temp, _mm_setzero_si128()), _mm_setzero_si128());
+ dst = _mm_add_epi32((_mm_mullo_epi32(vw0, _mm_add_epi32(src, iofs))), round);
+ dst = _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift));
+ dst = _mm_add_epi32(dst, ofs);
+ temp = _mm_packus_epi16(_mm_packs_epi32(dst,dst), _mm_setzero_si128());
+
+ union
+ {
+ int8_t c[16];
+ int16_t s[8];
+ } u;
+
+ _mm_storeu_si128((__m128i*)u.c, temp);
+ ((int16_t*)(dest + x))[0] = u.s[0];
+ }
+ source += sourceStride;
+ dest += destStride;
+ }
+ }
More information about the x265-devel mailing list