[x265] [PATCH] pixel8.inc: replace weightUnidirPixel vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Oct 8 15:26:19 CEST 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381238689 -19800
#      Tue Oct 08 18:54:49 2013 +0530
# Node ID 70927cb4bb4cc12d2dbb4a65590a92dc77b2b545
# Parent  41e5e72e2a4688642f7a46041c50fcc30972c4ab
pixel8.inc: replace weightUnidirPixel vector class function with intrinsic.

diff -r 41e5e72e2a46 -r 70927cb4bb4c source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Mon Oct 07 16:51:18 2013 -0500
+++ b/source/common/vec/pixel8.inc	Tue Oct 08 18:54:49 2013 +0530
@@ -240,31 +240,47 @@
     }
 }
 
-void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
-{
-    int x, y;
-    Vec16uc tmp;
-
-    Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), vround(round), vdst;
-    for (y = height - 1; y >= 0; y--)
-    {
-        for (x = 0; x <= width - 4; x += 4)
-        {
-            tmp = load_partial(const_int(4), src + x);
-            // The intermediate results would outgrow 16 bits because internal offset is too high
-            vsrc = extend_low(extend_low(tmp));
-            vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
-            store_partial(const_int(4), dst + x, compress_unsafe(compress_saturated(vdst, vdst), 0));
-        }
-
-        if (width > x)
-        {
-            tmp  = load_partial(const_int(4), src + x);
-            vsrc = extend_low(extend_low(tmp));
-            vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
-            compress_unsafe(compress_saturated(vdst, vdst), 0).store_partial(2, dst + x);
-        }
-        src += srcStride;
-        dst += dstStride;
-    }
-}
+// Unidirectional weighted prediction: dst = clip(((w0 * (src + IF_INTERNAL_OFFS) + round) >> shift) + offset)
+void weightUnidirPixel(pixel *arg_src, pixel *arg_dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int arg_round, int shift, int offset)
+{
+    int x, y;
+    __m128i temp;
+    __m128i vw0   = _mm_set1_epi32(w0);                // broadcast w0 into all four 32-bit lanes
+    __m128i iofs  = _mm_set1_epi32(IF_INTERNAL_OFFS);
+    __m128i ofs   = _mm_set1_epi32(offset);
+    __m128i round = _mm_set1_epi32(arg_round);
+    __m128i shiftCount = _mm_cvtsi32_si128(shift);     // shift count operand for _mm_sra_epi32
+    __m128i zero  = _mm_setzero_si128();
+    __m128i src, dst;
+
+    for (y = height - 1; y >= 0; y--)
+    {
+        for (x = 0; x <= width - 4; x += 4)
+        {
+            // load 4 byte pixels, zero-extend u8 -> u16 -> u32 (matches extend_low(extend_low(tmp)))
+            temp = _mm_cvtsi32_si128(*(const uint32_t*)(arg_src + x));
+            src  = _mm_unpacklo_epi16(_mm_unpacklo_epi8(temp, zero), zero);
+            // The intermediate results would outgrow 16 bits because internal offset is too high
+            dst = _mm_mullo_epi32(vw0, _mm_add_epi32(src, iofs));
+            dst = _mm_add_epi32(dst, round);
+            dst = _mm_sra_epi32(dst, shiftCount);
+            dst = _mm_add_epi32(dst, ofs);
+            // int32 -> int16 (saturated) -> uint8 (saturated); store the 4 result pixels
+            *(uint32_t*)(arg_dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst, dst), zero));
+        }
+
+        if (width > x)
+        {
+            // remainder column pair: identical math, store only the low 2 pixels
+            temp = _mm_cvtsi32_si128(*(const uint32_t*)(arg_src + x));
+            src  = _mm_unpacklo_epi16(_mm_unpacklo_epi8(temp, zero), zero);
+            dst = _mm_mullo_epi32(vw0, _mm_add_epi32(src, iofs));
+            dst = _mm_add_epi32(dst, round);
+            dst = _mm_sra_epi32(dst, shiftCount);
+            dst = _mm_add_epi32(dst, ofs);
+            temp = _mm_packus_epi16(_mm_packs_epi32(dst, dst), zero);
+            *(uint16_t*)(arg_dst + x) = (uint16_t)_mm_cvtsi128_si32(temp);
+        }
+        arg_src += srcStride;
+        arg_dst += dstStride;
+    }
+}


More information about the x265-devel mailing list