<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Tue, Oct 8, 2013 at 8:26 AM,  <span dir="ltr">&lt;<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"># HG changeset patch<br>
# User Dnyaneshwar Gorade &lt;<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>&gt;<br>
# Date 1381238689 -19800<br>
#      Tue Oct 08 18:54:49 2013 +0530<br>
# Node ID 70927cb4bb4cc12d2dbb4a65590a92dc77b2b545<br>
# Parent  41e5e72e2a4688642f7a46041c50fcc30972c4ab<br>
pixel8.inc: replace weightUnidirPixel vector class function with intrinsic.<br>
<br>
diff -r 41e5e72e2a46 -r 70927cb4bb4c source/common/vec/pixel8.inc<br>
--- a/source/common/vec/pixel8.inc      Mon Oct 07 16:51:18 2013 -0500<br>
+++ b/source/common/vec/pixel8.inc      Tue Oct 08 18:54:49 2013 +0530<br>
@@ -240,31 +240,52 @@<br>
     }<br>
 }<br>
<br>
-void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)<br>
+void weightUnidirPixel(pixel *arg_src, pixel *arg_dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int arg_round, int shift, int offset)<br>
 {<br>
     int x, y;<br>
-    Vec16uc tmp;<br>
+    __m128i temp;<br>
+    __m128i vw0    = _mm_set1_epi32(w0);                // broadcast (32-bit integer) w0 to all elements of vw0<br>
+    __m128i iofs   = _mm_set1_epi32(IF_INTERNAL_OFFS);<br>
+    __m128i ofs    = _mm_set1_epi32(offset);<br>
+    __m128i round  = _mm_set1_epi32(arg_round);<br>
+    __m128i src, dst;<br>
<br>
-    Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), vround(round), vdst;<br>
     for (y = height - 1; y >= 0; y--)<br>
     {<br>
         for (x = 0; x <= width - 4; x += 4)<br>
         {<br>
-            tmp = load_partial(const_int(4), src + x);<br>
             // The intermediate results would outgrow 16 bits because internal offset is too high<br>
-            vsrc = extend_low(extend_low(tmp));<br>
-            vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;<br>
-            store_partial(const_int(4), dst + x, compress_unsafe(compress_saturated(vdst, vdst), 0));<br>
+            temp = _mm_cvtsi32_si128(*(uint32_t*) (arg_src + x));<br>
+            // extend the low 4 elements to 32 bits with zero extension<br>
+            src = _mm_unpacklo_epi16(_mm_unpacklo_epi16(temp, _mm_setzero_si128()), _mm_setzero_si128());<br>
+            dst = _mm_add_epi32((_mm_mul_epi32(vw0, _mm_add_epi32(src, iofs))), round);<br>
+            dst =  _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift));<br>
+            dst = _mm_add_epi32(dst, ofs);<br>
+            __m128i tmp = _mm_shuffle_epi32(dst, 2);<br>
+            dst = _mm_add_epi64(dst, tmp);<br>
+            *(uint32_t*)(arg_dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst, dst), _mm_setzero_si128()));<br>
         }<br>
-<br>
         if (width > x)<br>
         {<br>
-            tmp  = load_partial(const_int(4), src + x);<br>
-            vsrc = extend_low(extend_low(tmp));<br>
-            vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;<br>
-            compress_unsafe(compress_saturated(vdst, vdst), 0).store_partial(2, dst + x);<br>
+            temp = _mm_cvtsi32_si128(*(uint32_t*)(arg_src + x));<br>
+            src = _mm_unpacklo_epi16(_mm_unpacklo_epi16(temp, _mm_setzero_si128()), _mm_setzero_si128());<br>
+            dst = _mm_add_epi32((_mm_mul_epi32(vw0, _mm_add_epi32(src, iofs))), round);<br>
+            dst = _mm_add_epi32(dst, ofs);<br>
+            __m128i tmp = _mm_shuffle_epi32(dst, 2);<br>
+            dst = _mm_add_epi64(dst, tmp);<br>
+            dst =  _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift));<br>
+            temp = _mm_packus_epi16(_mm_packs_epi32(dst,dst), _mm_setzero_si128());<br>
+<br>
+            union<br>
+            {<br>
+                int8_t  c[16];<br>
+                int16_t s[8];<br>
+            } u;<br>
+<br>
+            _mm_storeu_si128((__m128i*)u.c, temp);<br>
+            ((int16_t*)(arg_dst + x))[0] = u.s[0];<br>
         }<br>
-        src += srcStride;<br>
-        dst += dstStride;<br>
+        arg_src += srcStride;<br>
+        arg_dst += dstStride;<br>
     }<br>
 }</blockquote><div> </div></div>This primitive fails unit tests on about every fifth run:</div><div class="gmail_extra"><br></div><div class="gmail_extra"><div class="gmail_extra">Using random seed 525442CC 8bpp</div><div class="gmail_extra">
Testing intrinsic primitives: SSE2 (2)</div><div class="gmail_extra">Testing assembly primitives: SSE2 (2)</div><div class="gmail_extra">Testing intrinsic primitives: SSE3 (3)</div><div class="gmail_extra">Testing assembly primitives: SSE3 (3)</div>
<div class="gmail_extra">Testing intrinsic primitives: SSSE3 (4)</div><div class="gmail_extra">Testing assembly primitives: SSSE3 (4)</div><div class="gmail_extra">Testing intrinsic primitives: SSE4.1 (5)</div><div class="gmail_extra">
Weighted Prediction for Unidir (Pixel) failed!</div><div class="gmail_extra"><br></div><div class="gmail_extra">x265: intrinsic primitive has failed. Go and fix that Right Now!</div><div class="gmail_extra"><br></div><div class="gmail_extra">
<br></div><div class="gmail_extra">If you hard-code the random seed above, you should be able to reproduce this every time.</div><div><br></div>-- <br>Steve Borho
</div></div>