[x265] [PATCH] pixel: Replace weightUnidir vector class function with intrinsic
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Wed Oct 9 12:42:36 CEST 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381315230 -19800
# Wed Oct 09 16:10:30 2013 +0530
# Node ID 1d3760e10f643954edb5dd8dd953c2511ff9a90f
# Parent fc7fbdd18bc0d6d7f98180332e065d83c054fe02
pixel: Replace weightUnidir vector class function with intrinsic.
diff -r fc7fbdd18bc0 -r 1d3760e10f64 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Wed Oct 09 00:00:10 2013 -0500
+++ b/source/common/vec/pixel8.inc Wed Oct 09 16:10:30 2013 +0530
@@ -27,33 +27,45 @@
/* intrinsics for when pixel type is uint8_t */
-void weightUnidir(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
-{
- int x, y;
- Vec8s tmp;
+/* Uni-directional weighted prediction from 16-bit intermediate samples to
+ * 8-bit pixels. For every sample in the width x height block:
+ *
+ *     dst[x] = clamp(((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset, 0, 255)
+ *
+ * src, srcStride   signed 16-bit input rows; stride counted in elements
+ * dst, dstStride   output pixel rows; stride counted in elements
+ * width, height    block dimensions in samples
+ * w0, round, shift, offset
+ *                  weighting parameters, used exactly as in the formula above
+ *
+ * Four samples per iteration are processed with SIMD. Columns left over when
+ * width is not a multiple of 4 are handled one at a time in scalar code, so
+ * nothing outside [0, width) is read or written in a row. */
+void weightUnidir(short *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+{
+    const __m128i w00 = _mm_set1_epi32(w0);
+    const __m128i ofs = _mm_set1_epi32(IF_INTERNAL_OFFS);
+    const __m128i fs = _mm_set1_epi32(offset);
+    const __m128i roundoff = _mm_set1_epi32(round);
+    __m128i tmpsrc, tmpdst;
+    int x, y;
+
+    for (y = height - 1; y >= 0; y--)
+    {
+        for (x = 0; x <= width - 4; x += 4)
+        {
+            tmpsrc = _mm_loadl_epi64((__m128i*)(src + x));
+            // src is signed: widen to 32 bits by interleaving each sample with its
+            // own sign mask (zero-extension would corrupt negative samples)
+            tmpsrc = _mm_unpacklo_epi16(tmpsrc, _mm_srai_epi16(tmpsrc, 15));
+            tmpdst = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(w00, _mm_add_epi32(tmpsrc, ofs)), roundoff), shift), fs);
+            // saturating packs 32 -> 16 -> 8 bits clamp to 0..255; store the low four bytes
+            *(uint32_t*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(tmpdst, tmpdst), _mm_setzero_si128()));
+        }
+
+        // remaining 1..3 columns: same arithmetic, one sample at a time
+        // (relies on >> of a negative int being an arithmetic shift, like _mm_srai_epi32)
+        for (; x < width; x++)
+        {
+            int val = ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset;
+            dst[x] = (pixel)(val < 0 ? 0 : (val > 255 ? 255 : val));
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
- Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), vround(round), vdst;
- for (y = height - 1; y >= 0; y--)
- {
- for (x = 0; x <= width - 4; x += 4)
- {
- tmp = load_partial(const_int(8), src + x);
- vsrc = extend_low(tmp);
- vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
- store_partial(const_int(4), dst + x, compress_unsafe(compress_saturated(vdst, vdst), 0));
- }
-
- if (width > x)
- {
- tmp = load_partial(const_int(4), src + x);
- vsrc = extend_low(tmp);
- vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
- compress_unsafe(compress_saturated(vdst, vdst), 0).store_partial(2, dst + x);
- }
- src += srcStride;
- dst += dstStride;
- }
-}
void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
More information about the x265-devel
mailing list